# Import Libraries 

In [1]:
import pandas as pd
import numpy as np

from datetime import time
from datetime import datetime
from datetime import date
from dateutil import parser
import feedparser as fp
import pprint
import requests

#for zero-shot classification
from transformers import pipeline

#for fuzzy wuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Creating Sample Dataset

In [25]:
rss = "https://www.finextra.com/rss/headlines.aspx"

# Download the feed with requests and hand the raw bytes to feedparser
# (instead of letting fp.parse(rss) fetch the URL itself).
# - timeout: requests waits indefinitely by default, which would hang the kernel
#   if the server stalls.
# - raise_for_status: stop here on an HTTP error rather than parsing an error page
#   and silently ending up with zero entries.
response = requests.get(rss, timeout=30)
response.raise_for_status()
source = fp.parse(response.content)

In [26]:
# Snapshot the parsed feed items into a plain list (same entry objects, same order)
entries = [item for item in source['entries']]

# Echo each headline, one per line
for entry in entries:
    headline = entry['title']
    print(headline)

# ...followed by how many items the feed returned
total_entries = len(entries)
print(total_entries)

Lloyds tells employees to be back in office two days a week
NextGen Nordics: Cross border payments are a political weapon
Generative AI governance targeted in proposed EU rules
DoJ investigates Mastercard debit card programme
Austrade and SFA launch fintech collaboration initiative
Amazon stock plummets amid cloud slowdown
HM Treasury opens consultation for taxing DeFi cryptoassets
Sweden's BankID launches digital identity card
Digital wallet infrastructure platform Ansa emerges from stealth
Visa staffs up for 'ambitious' crypto plans
JPMorgan uses ChatGPT to analyse Fed speeches - Bloomberg.
IDPartner raises $3.1m for bank-based ID verification system
ASX updates on Chess replacement partnership programme
Paysend opens Dublin office
Lanistar appoints Ed Blankson as new CFO
Binance launches AI-driven Web3 education tool
LATAM fintech Clara raises $60m
Techcombank and Personetics partner for AI-driven money management
TransferMate secures e-money licence from Central Bank o Ireland
Sila

# Zero-Shot Classification Pipeline 

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification 

# Checkpoint backing the zero-shot classifier.
# Alternative kept for reference: "valhalla/distilbart-mnli-12-1"
model_name = "facebook/bart-large-mnli"

# Load the sequence-classification model and its tokenizer from the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap both in a zero-shot-classification pipeline (PyTorch framework, batch size 64)
classifier = pipeline(
    task="zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    batch_size=64,
)

In [27]:
# output = []

# #modify date format (easier to manipulate later)
# for i in range(len(source['entries'])):
#     source['entries'][i]['published'] = str(parser.parse(source['entries'][i]['published']).date())

# #default to today's news
# for entry in source['entries']:
#     if entry.published == str(date.today()):
#         output.append(entry)

In [28]:
# for entry in output:
#     print(entry['title'])

# print(len(output))

In [33]:
def zero_shot(text, query, threshold=80, classify=None):
    """Keep the entries whose title the zero-shot classifier assigns to ``query``.

    Every entry is classified on its ``'title'`` with ``multi_label=True``. The
    candidate labels whose score, expressed as a percentage, reaches
    ``threshold`` are written to ``entry['label']`` (an empty list when none
    qualifies). Entries are modified in place.

    Parameters
    ----------
    text : iterable of dict-like
        Feed entries; each must provide a ``'title'``.
    query : str or list of str
        Candidate label(s) handed to the classifier.
    threshold : float, default 80
        Minimum score in percent (inclusive) for a label to count as a match.
    classify : callable, optional
        Called as ``classify(title, query, multi_label=True)`` and expected to
        return a mapping with parallel ``'labels'`` and ``'scores'`` sequences.
        Defaults to the notebook-level ``classifier`` pipeline.

    Returns
    -------
    list
        The entries (same objects, input order) that received at least one
        label at or above the threshold.
    """
    if classify is None:
        # Looked up at call time so the function can be defined before the pipeline.
        classify = classifier

    filtered = []
    for entry in text:
        results = classify(entry['title'], query, multi_label=True)

        # Judge every label on its own score. Assigning entry['label'] inside the
        # score loop let the last score alone decide the outcome, so a strong
        # match followed by a weak one was discarded, and a hit stored all
        # candidate labels instead of only the ones that passed.
        entry['label'] = [
            label
            for label, score in zip(results['labels'], results['scores'])
            if score * 100 >= threshold
        ]

        if entry['label']:
            filtered.append(entry)

    return filtered

In [34]:
# Time the zero-shot filter end to end
startTime = datetime.now()

query = "Crypto"

# Store the result under its own name: assigning it to `zero_shot` would replace
# the function with a list, and re-running this cell would then fail with
# "TypeError: 'list' object is not callable".
zero_shot_results = zero_shot(entries, query)

for entry in zero_shot_results:
    print(entry['title'])

print("---")

print('Total number of news: ' + str(len(zero_shot_results)))

endTime = datetime.now()

# total_seconds() keeps minutes and hours, which slicing str(timedelta)[5:] drops.
# The 09.6f format reproduces the previous "SS.ffffff" look for runs under a minute.
print("Program runtime = ", f"{(endTime - startTime).total_seconds():09.6f}", " sec" )

HM Treasury opens consultation for taxing DeFi cryptoassets
Visa staffs up for 'ambitious' crypto plans
Examining the Real Value Proposition of Crypto and Digital Assets
---
Total number of news: 3
Program runtime =  09.653801  sec


# Fuzzy Wuzzy 

In [39]:
#fuzzy matches threshold
matchThreshold = 90

def count_match(x, threshold=None):
    """Return the labels of the fuzzy matches that score above the threshold.

    Parameters
    ----------
    x : iterable of sequences
        Match results whose first item is the matched label and whose second
        item is its score. Extra trailing items are ignored, so both
        ``(label, score)`` and ``(label, score, key)`` shapes are accepted.
    threshold : number, optional
        Score a match must strictly exceed. Defaults to the module-level
        ``matchThreshold``, read at call time.

    Returns
    -------
    list
        Labels of the qualifying matches, in input order.
    """
    if threshold is None:
        threshold = matchThreshold
    # Index access (not tuple unpacking) so results with extra fields still work.
    return [match[0] for match in x if match[1] > threshold]

In [40]:
def fuzzywuzzy(entries, query):
    """Keep the entries whose title fuzzy-matches at least one keyword.

    Each entry's ``'title'`` is scored against every keyword with
    ``fuzz.token_set_ratio``; the keywords accepted by ``count_match`` are
    written to ``entry['label']``. Entries are modified in place.

    Parameters
    ----------
    entries : iterable of dict-like
        Feed entries; each must provide a ``'title'``.
    query : str or iterable of str
        Keyword(s) to look for. A bare string is treated as a single keyword.

    Returns
    -------
    list
        The entries (same objects, input order) with a non-empty ``'label'``.
    """
    # Accept a single keyword as well as a collection, matching how zero_shot is called.
    keywords = [query] if isinstance(query, str) else list(query)

    filtered = []
    for news in entries:
        if keywords:
            # limit=None scores every keyword; process.extract otherwise returns
            # only its top 5 results, silently capping the labels per entry.
            matches = process.extract(
                news['title'], keywords, scorer=fuzz.token_set_ratio, limit=None
            )
        else:
            # No keywords means nothing can match.
            matches = []

        news['label'] = count_match(matches)

        if news['label'] != []:
            filtered.append(news)

    return filtered

In [42]:
# Time the fuzzy-matching filter end to end
startTime = datetime.now()

query = ['Crypto']

# Store the result under its own name: assigning it to `fuzzywuzzy` would replace
# the function with a list, and re-running this cell would then fail with
# "TypeError: 'list' object is not callable".
fuzzy_results = fuzzywuzzy(entries, query)

for entry in fuzzy_results:
    print(entry['title'])
    
print('---')

print('Total number of news: ' + str(len(fuzzy_results)))

endTime = datetime.now()

# total_seconds() keeps minutes and hours, which slicing str(timedelta)[5:] drops.
# The 09.6f format reproduces the previous "SS.ffffff" look for runs under a minute.
print("Program runtime = ", f"{(endTime - startTime).total_seconds():09.6f}", " sec")

Visa staffs up for 'ambitious' crypto plans
Examining the Real Value Proposition of Crypto and Digital Assets
---
Total number of news: 2
Program runtime =  00.002499  sec
