In [1]:
import sys 
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F

## Training 

In [2]:
#load training data
df = pd.read_excel("few-shot-learning-data.xlsx")
df

Unnamed: 0,label,text
0,5G,5G network
1,5G,VMware has announced research grants to work t...
2,5G,T-Mobile and Ericsson announced that Ericsson’...
3,AI,"Artificial Intelligence, ChatGPT"
4,AI,"RBC Capital Markets launch Aiden Arrival, the ..."
...,...,...
154,Tech Talent,Canada's biggest banks started fiscal 2022 on ...
155,Tech Talent,BMO received the Best in Tech – Learner Engage...
156,Tech Talent,The cryptocurrency sector may have been in the...
157,Tech Spend,HSBC’s annual report says the bank increased t...


In [3]:
#load all unique labels
labels = df['label'].unique()

labels

array(['5G', 'AI', 'Award', 'Banking as a Service', 'Biometrics',
       'Blockchain', 'CBDC', 'Cloud', 'Crypto', 'Cybersecurity',
       'Decentralized ID', 'DeFi', 'Digital', 'Digital ID', 'Digital iD',
       'Digital Lending', 'Embedded Banking', 'Fines', 'FinTech',
       'Tech Innovation', 'Metaverse', 'NFTs', 'Open Banking',
       'Passwordless authentication', 'Payments', 'Quantum',
       'Real-Time Payments', 'Sustainability', 'Sustainable Finance',
       'Tech Talent', 'Tech Spend'], dtype=object)

In [4]:
# group the values by key and convert to dictionary
labeled_data = df.groupby('label')['text'].apply(list).to_dict()

# print the resulting dictionary
labeled_data

{'5G': ['5G network',
  'VMware has announced research grants to work towards enabling humans to interact in quasi real-time with cyber-physical-systems in the physical or digital world over intelligent communications networks (5G and beyo. The grants are for 6G-life and the Centre for Tactile Internet with Human-in-the-Loop (CeTI) at Dresden University of Technology and Telecommunication Networks Group (TKN) at Technical University of Berlin. Operating on terahertz frequency bands, 6G is expected to deliver a peak rate of 1000 GB/s and to be 100 times faster than 5G with enhanced reliability and wider network coverage',
  'T-Mobile and Ericsson announced that Ericsson’s industry-leading RAN and Core equipment will power T-Mobile’s 5G Advanced Network Solutions (5G ANS) — a suite of supercharged 5G network options for enterprise and government customers. 5G Advanced Network Solutions allows customers to choose the level of performance they need, giving them the flexibility to unlock th

In [5]:
len(labeled_data)

31

In [6]:
#load pretrained model from hugging face

tokenizer_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
model = BertForSequenceClassification.from_pretrained(tokenizer_name, num_labels=len(labels))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Encode the labels as numerical values
label_map = {tag: i for i, tag in enumerate(labels)}
label_map

{'5G': 0,
 'AI': 1,
 'Award': 2,
 'Banking as a Service': 3,
 'Biometrics': 4,
 'Blockchain': 5,
 'CBDC': 6,
 'Cloud': 7,
 'Crypto': 8,
 'Cybersecurity': 9,
 'Decentralized ID': 10,
 'DeFi': 11,
 'Digital': 12,
 'Digital ID': 13,
 'Digital iD': 14,
 'Digital Lending': 15,
 'Embedded Banking': 16,
 'Fines': 17,
 'FinTech': 18,
 'Tech Innovation': 19,
 'Metaverse': 20,
 'NFTs': 21,
 'Open Banking': 22,
 'Passwordless authentication': 23,
 'Payments': 24,
 'Quantum': 25,
 'Real-Time Payments': 26,
 'Sustainability': 27,
 'Sustainable Finance': 28,
 'Tech Talent': 29,
 'Tech Spend': 30}

In [9]:
# Meta-training
train_texts = []
train_labels = []


for tags in labels:
    train_texts.extend(labeled_data[tags])
    train_labels.extend([label_map[tags]] * len(labeled_data[tags]))
    
        
# for label in labels:
#     for text in labeled_data[label]:
#         train_texts.append(text)
#         train_labels.append([label_map[label]])
    
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset=Dataset(train_encodings,train_labels)

# Define the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Define the optimizer and the scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset) * 10)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range(20):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Print loss
    print(f"Epoch {epoch + 1}: Loss = {loss.item():.4f}")

Epoch 1: Loss = 3.4827
Epoch 2: Loss = 3.2333
Epoch 3: Loss = 3.1583
Epoch 4: Loss = 2.8723
Epoch 5: Loss = 2.8306
Epoch 6: Loss = 2.6910
Epoch 7: Loss = 2.1962
Epoch 8: Loss = 2.0645
Epoch 9: Loss = 1.7602
Epoch 10: Loss = 1.5013
Epoch 11: Loss = 0.9982
Epoch 12: Loss = 0.7303
Epoch 13: Loss = 0.6325
Epoch 14: Loss = 0.5347
Epoch 15: Loss = 0.3457
Epoch 16: Loss = 0.3642
Epoch 17: Loss = 0.2501
Epoch 18: Loss = 0.2515
Epoch 19: Loss = 0.1759
Epoch 20: Loss = 0.2006


In [10]:
# Meta-validation
val_texts = ["Bank announces new open banking rules.", "BMO enters Twitch Metaverse.", "ChatGPT will take over your job."]
val_labels = ['Open Banking', 'Metaverse', 'AI']

val_encodings = tokenizer(val_texts, truncation=True, padding=True)

val_labels_encoded = [label_map[label] for label in val_labels]

val_dataset = Dataset(val_encodings,val_labels_encoded)

model.eval()
val_predictions=[]

val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

with torch.no_grad():
    for batch in val_loader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        val_outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss = val_outputs.loss
        val_logits = val_outputs.logits
#         val_probs = F.softmax(val_logits, dim=1)
#         val_preds = torch.argmax(val_probs, dim=1)
        val_probs = F.sigmoid(val_logits)
        val_predictions = (val_probs > 0.9).squeeze().long()
        
#         val_predictions.extend(val_preds.tolist())
        

print("Meta-validation loss:", loss.item())
# print("Meta-validation predictions:", val_predictions)


Meta-validation loss: 0.20063716173171997


In [11]:
labels = df['label'].unique()
print("Meta-validation predictions:" )

count = 0
for row in val_predictions:
    for i in range(len(row)):
        if row[i] == 1:
            print(labels[i] + str(": ") + val_texts[count])
            count +=1
    print('---')


Meta-validation predictions:
Open Banking: Bank announces new open banking rules.
---
Metaverse: BMO enters Twitch Metaverse.
---
AI: ChatGPT will take over your job.
---


In [12]:
# Meta-testing
test_texts = ["ChatGPT the new AI bot.", "Accenture takes meeting in the metaverse.", "Fintech specializes in open banking"] #sample text
test_labels = ["Open Banking", "Metaverse", "AI"]

test_encodings = tokenizer(test_texts, truncation=True, padding=True)

test_labels_encoded = [label_map[label] for label in test_labels]

test_dataset = Dataset(test_encodings, test_labels_encoded)

test_predictions=[]

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        test_outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        test_loss = test_outputs.loss
        test_logits = test_outputs.logits
#         test_probs = F.softmax(test_logits, dim=1)
#         test_preds = torch.argmax(test_probs, dim=1)
        test_probs = F.sigmoid(test_logits)
        test_predictions = (test_probs > 0.8).squeeze().long()
#         test_predictions.extend(test_preds.tolist())
#         print(test_probs)

print("Meta-testing loss:", test_loss.item())
print("Meta-testing predictions:", test_predictions)
# print(test_probs[0][test_preds[0]])

Meta-testing loss: 4.258919715881348
Meta-testing predictions: tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0]], device='cuda:0')


In [13]:
labels = df['label'].unique()

#sample text: ["ChatGPT the new AI bot.", 
            # "Accenture takes meeting in the metaverse.", 
            # "Fintech specializes in open banking"]


for row in test_predictions:
    for i in range(len(row)):
        if row[i] == 1:
            print(labels[i])
    print('---')

AI
---
Metaverse
---
FinTech
Open Banking
---


## Save model to reload again for next use

In [14]:
import pickle

In [15]:
pickle.dump(model, open('model_1.pkl', 'wb'))

In [16]:
#load model
# model_label = pickle.load(open('model_1.pkl', 'rb'))

## Prediction: run model on today's scraped news

In [None]:
#Prediction/inference on new texts

def pred(text):
    
    new_text = [str(text)]
    
    model_label = pickle.load(open('model_1.pkl', 'rb')) #load trained model

    tokenizer_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    labels = ['5G', 'AI', 'Award', 'Banking as a Service', 'Biometrics',
           'Blockchain', 'CBDC', 'Cloud', 'Crypto', 'Cybersecurity',
           'Decentralized ID', 'DeFi', 'Digital', 'Digital ID', 'Digital iD',
           'Digital Lending', 'Embedded Banking', 'Fines', 'FinTech',
           'Tech Innovation', 'Metaverse', 'NFTs', 'Open Banking',
           'Passwordless authentication', 'Payments', 'Quantum',
           'Real-Time Payments', 'Sustainability', 'Sustainable Finance',
           'Tech Talent', 'Tech Spend']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the new text
    encoded_new_text = tokenizer(
        new_text,
        add_special_tokens=True,
        max_length=64,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    # encoded_input = {k: v.to(device) for k, v in encoded_dict.items()}
    outputs = model_label(input_ids=encoded_new_text['input_ids'].to(device), 
                    attention_mask=encoded_new_text['attention_mask'].to(device))
    #     probs = F.softmax(outputs.logits, dim=1)
    #     predictions = torch.argmax(outputs.logits)
    probs = F.sigmoid(outputs.logits)
    predictions = (probs > 0.9).squeeze().long()

#     print(f"The predicted label for the new text is {predictions}")

    tags = list()

    for i in range(len(predictions)):
        if predictions[i] == 1:
#             print(labels[i])
            tags.append(labels[i])
#     print('--')
#     print(probs[0])
    return tags

df = pd.read_excel("todays_relevant_news.xlsx")
df['Label']=df["cleanText"].apply(lambda x: pred(str(x)))
df

In [None]:
#keep news with keywords match 
refined_news=df[df['Label'].str.len()>0]

#change format of Label column to string
refined_news.loc[:, 'Label']=refined_news['Label'].map(lambda x: ', '.join(map(str, x)))
refined_news.to_excel('output.xlsx', index=False)

## Train model again with new labels (work-in-progress)

looking to add
- web3
- AR/VR

In [28]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

# # Load the trained model and tokenizer
# model_name = "distilbert-base-uncased"
# model = AutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# New labels to add to the model
new_labels = ['Web3', 'AR/VR']

# Update the label list
labels = ['5G', 'AI', 'Award', 'Banking as a Service', 'Biometrics',
       'Blockchain', 'CBDC', 'Cloud', 'Crypto', 'Cybersecurity',
       'Decentralized ID', 'DeFi', 'Digital', 'Digital ID', 'Digital iD',
       'Digital Lending', 'Embedded Banking', 'Fines', 'FinTech',
       'Fintech', 'Tech Innovation', 'Metaverse', 'NFTs', 'Open Banking',
       'Passwordless authentication', 'Payments', 'Quantum',
       'Real-Time Payments', 'Sustainability', 'Sustainable Finance',
       'Tech Talent', 'Tech Spend']

labels.extend(new_labels)

# Create a new label map
label_map = {label: i for i, label in enumerate(labels)}

# Define the dataset (sample texts)
texts = ["New games in Web3.", "Virtual Reality headsets are going out of style by Apple/Google"]

labels = ['5G', 'AI', 'Award', 'Banking as a Service', 'Biometrics',
       'Blockchain', 'CBDC', 'Cloud', 'Crypto', 'Cybersecurity',
       'Decentralized ID', 'DeFi', 'Digital', 'Digital ID', 'Digital iD',
       'Digital Lending', 'Embedded Banking', 'Fines', 'FinTech',
       'Fintech', 'Tech Innovation', 'Metaverse', 'NFTs', 'Open Banking',
       'Passwordless authentication', 'Payments', 'Quantum',
       'Real-Time Payments', 'Sustainability', 'Sustainable Finance',
       'Tech Talent', 'Tech Spend']

labels_encoded = [label_map[label] for label in labels]
encodings = tokenizer(texts, truncation=True, padding=True)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = Dataset(encodings, labels_encoded)

# Define the dataloader
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the optimizer and the scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset) * 10)

for epoch in range(20):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Print loss
    print(f"Epoch {epoch + 1}: Loss = {loss.item():.4f}")
    
# Save the model and tokenizer
model.save_pretrained("new_model_name")
tokenizer.save_pretrained("new_model_name")

Epoch 1: Loss = 0.1475
Epoch 2: Loss = 0.0738
Epoch 3: Loss = 0.0578
Epoch 4: Loss = 0.0657
Epoch 5: Loss = 0.0314
Epoch 6: Loss = 0.0241
Epoch 7: Loss = 0.0294
Epoch 8: Loss = 0.0188
Epoch 9: Loss = 0.0157
Epoch 10: Loss = 0.0163
Epoch 11: Loss = 0.0138
Epoch 12: Loss = 0.0125
Epoch 13: Loss = 0.0080
Epoch 14: Loss = 0.0060
Epoch 15: Loss = 0.0054
Epoch 16: Loss = 0.0064
Epoch 17: Loss = 0.0074
Epoch 18: Loss = 0.0057
Epoch 19: Loss = 0.0071
Epoch 20: Loss = 0.0060


('new_model_name\\tokenizer_config.json',
 'new_model_name\\special_tokens_map.json',
 'new_model_name\\vocab.txt',
 'new_model_name\\added_tokens.json')

In [29]:
model = BertModel.from_pretrained("./new_model_name")

Some weights of the model checkpoint at ./new_model_name were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def pred(text):
    
    new_text = [str(text)]
    
    model_label = BertModel.from_pretrained("./new_model_name")
    tokenizer = BertTokenizer.from_pretrained("./new_model_name")

#     tokenizer_name = "bert-base-uncased"
#     tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    labels = ['5G', 'AI', 'Award', 'AR/VR', 'Banking as a Service', 'Biometrics',
           'Blockchain', 'CBDC', 'Cloud', 'Crypto', 'Cybersecurity',
           'Decentralized ID', 'DeFi', 'Digital', 'Digital ID', 'Digital iD',
           'Digital Lending', 'Embedded Banking', 'Fines', 'FinTech',
           'Tech Innovation', 'Metaverse', 'NFTs', 'Open Banking',
           'Passwordless authentication', 'Payments', 'Quantum',
           'Real-Time Payments', 'Sustainability', 'Sustainable Finance',
           'Tech Talent', 'Tech Spend', 'Web3']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_label.to(device)

    # Tokenize the new text
    encoded_new_text = tokenizer(
        new_text,
        add_special_tokens=True,
        max_length=64,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    # encoded_input = {k: v.to(device) for k, v in encoded_dict.items()}
    outputs = model_label(input_ids=encoded_new_text['input_ids'].to(device), 
                    attention_mask=encoded_new_text['attention_mask'].to(device))
    #     probs = F.softmax(outputs.logits, dim=1)
    #     predictions = torch.argmax(outputs.logits)
    probs = F.sigmoid(outputs.logits)
    predictions = (probs > 0.9).squeeze().long()

#     print(f"The predicted label for the new text is {predictions}")

    tags = list()

    for i in range(len(predictions)):
        if predictions[i] == 1:
#             print(labels[i])
            tags.append(labels[i])
#     print('--')
#     print(probs[0])
    return tags

# df = pd.read_excel("todays_relevant_news.xlsx")
# df['Label']=df["cleanText"].apply(lambda x: pred(str(x)))
# df

print(pred('Companies are shifting to Web3'))