In [198]:
import numpy as np
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
import matplotlib.pyplot as plt
import plotly.express as px
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import nlpaug.augmenter.word as naw
import inspect
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

In [199]:
# Download the dataset helper module next to this notebook so it can be imported below.
# NOTE(review): importing a freshly downloaded file executes remote code; consider pinning
# the URL to a specific commit instead of `master`.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Write the payload as raw bytes: no decode/encode round trip through the platform's
# default text encoding and no newline translation.
with request.urlopen(module_url) as f, open(module_name, 'wb') as outf:
  outf.write(f.read())
from dont_patronize_me import DontPatronizeMe
# Load the task 1 data and keep the resulting frame as the untouched source for all later splits.
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()
original_train_data = dpm.train_task1_df

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [168]:
# Display the task 1 frame loaded above (the rendered output elides the middle rows).
original_train_data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


In [169]:
# Split into the training and dev set
def split_train_test(original_train_data, train_csv_path, test_csv_path):
    """Select the rows of `original_train_data` listed in two par_id CSV files.

    Args:
        original_train_data: Frame with a 'par_id' column to select from.
        train_csv_path: CSV file with a 'par_id' column for the first split.
        test_csv_path: CSV file with a 'par_id' column for the second split.

    Returns:
        (train_data_df, test_data_df). Each keeps the row order of its CSV
        file; ids missing from `original_train_data` yield rows filled with
        NaN because the join is a left merge.
    """
    def _rows_for(csv_path):
        ids = pd.read_csv(csv_path)
        # The ids are compared as strings, so cast whatever read_csv inferred.
        ids['par_id'] = ids['par_id'].astype(str)
        return pd.merge(ids['par_id'], original_train_data, on='par_id', how='left')

    return _rows_for(train_csv_path), _rows_for(test_csv_path)
    
# CSV files listing the par_ids of each split; resolved relative to the working directory.
train_csv_path = 'train_semeval_parids-labels.csv'
test_csv_path = 'dev_semeval_parids-labels.csv'

# NOTE: `test_data_df` is built from the *dev* par_ids file and serves as the evaluation set below.
train_data_df, test_data_df = split_train_test(original_train_data, train_csv_path, test_csv_path)

In [170]:
# Define class weighting: total sample count divided by the per-class count,
# stored so that class_weights[label] is the weight of that label (0 or 1).
n_samples = len(train_data_df)
is_positive = train_data_df.label == 1
is_negative = train_data_df.label == 0
k_p = len(train_data_df[is_positive])
k_n = len(train_data_df[is_negative])
class_weights = [n_samples / k_n, n_samples / k_p]
class_weights

[1.1047355230180715, 10.547858942065492]

In [171]:
def preprocess_text(text):
    '''
    Preprocess a single string.

    Steps, in order: strip ASCII punctuation, drop English stopwords,
    lemmatize the remaining tokens, then replace every run of digits with
    the placeholder [NUM].

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    '''
    # Removing punctuation. The characters are escaped so that the backslash,
    # brackets, caret and hyphen are matched literally inside the character class.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)

    # Removing stopwords. The stopword list is loaded once per call into a set
    # rather than being re-read for every token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [word for word in word_tokenize(text) if word.lower() not in english_stopwords]

    # Reduce words to root form
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    text = ' '.join(tokens)

    # Replace numbers with a placeholder. This runs after punctuation removal
    # so the square brackets of the placeholder are kept.
    text = re.sub(r'\d+', '[NUM]', text)
    return text

def apply_preprocessing(df):
    """Return a deep copy of `df` whose 'text' column was passed through `preprocess_text`.

    The input frame is left unmodified.
    """
    cleaned_text = df['text'].apply(preprocess_text)
    result = df.copy(deep=True)
    result['text'] = cleaned_text
    return result

In [172]:
def add_tokens(df):
    '''
    Adds the country and keyword to the text as special tokens, encased in the <e></e> tag

    Args:
        df: Frame with 'text', 'country' and 'keyword' columns.

    Returns:
        A deep copy of `df` in which every 'text' value is followed by
        " <e>{country}</e> <e>{keyword}</e>". The input frame is not modified.
    '''
    added_tokens_df = df.copy(deep=True)
    # Build the new column positionally instead of with a row-wise apply: the
    # result is the same for populated frames, and an empty frame simply yields
    # an empty column.
    added_tokens_df['text'] = [
        f"{text} <e>{country}</e> <e>{keyword}</e>"
        for text, country, keyword in zip(df['text'], df['country'], df['keyword'])
    ]

    return added_tokens_df

In [173]:
class RoBERTaBinaryClassifier(nn.Module):
    """'roberta-base' encoder followed by a small head that outputs a probability.

    Args:
        freeze: When True, gradients are disabled for every encoder parameter
            whose name does not start with 'pooler.dense.'.
    """

    def __init__(self, freeze=True):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')

        if freeze:
            # NOTE(review): forward() reads only `last_hidden_state`, so the pooler
            # parameters left trainable here do not appear to affect the output — confirm.
            to_freeze = (
                param
                for name, param in self.roberta.named_parameters()
                if not name.startswith('pooler.dense.')
            )
            for param in to_freeze:
                param.requires_grad = False

        # Head: hidden_size -> 512 -> 1. Attribute names are part of the
        # state_dict keys used by the saved checkpoints.
        hidden_size = self.roberta.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, 512)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid output of the head for each input sequence."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at position 0 of every sequence
        first_token = encoded.last_hidden_state[:, 0, :]
        hidden = self.dropout(self.tanh(self.fc1(first_token)))
        return self.sigmoid(self.fc2(hidden))

In [174]:
class CustomDataset(Dataset):
    """Dataset of pre-tokenized texts with a label and a class weight per sample.

    The per-sample weight is looked up in the module-level `class_weights`
    list, indexed by the sample's label.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length
        # Tokenizes everything up front and also fills `self.weights`.
        self.encodings = self.tokenize_data()

    def tokenize_data(self):
        """Tokenize all texts in one call; as a side effect store `self.weights`."""
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        encodings = self.tokenizer(self.texts, **tokenizer_kwargs)
        self.weights = [class_weights[label] for label in self.labels]
        return encodings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoding, float label and float weight of sample `idx`."""
        def as_float(value):
            return torch.tensor(value, dtype=torch.float)

        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'label': as_float(self.labels[idx]),
            'weights': as_float(self.weights[idx]),
        }

In [175]:
def train_model(model, train_loader, optimizer, device, weighted_dataloader):
    """Run one training epoch and return the mean per-batch BCE loss.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns
            probabilities.
        train_loader: Iterable of batches with 'input_ids', 'attention_mask',
            'label' and 'weights' entries; must support len().
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        weighted_dataloader: True means the loss is left unweighted; False
            means each sample's loss is scaled by the batch's 'weights'.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        weight = batch['weights'].to(device)

        # weight=None gives the plain, unweighted BCE loss
        criterion = nn.BCELoss(weight=None if weighted_dataloader else weight)

        optimizer.zero_grad()

        probabilities = model(input_ids, attention_mask)
        # Special case for small batches: a single-sample batch only loses its
        # leading axis, any other batch loses every size-1 axis.
        probabilities = probabilities.squeeze(0) if len(labels) == 1 else probabilities.squeeze()

        loss = criterion(probabilities, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(train_loader)

def evaluate_on_test(model, test_loader, device):
    """Evaluate `model` on every batch of `test_loader` and print F1 and accuracy.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns
            probabilities; they are rounded to obtain 0/1 predictions.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'label' entries.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple (f1, acc).
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            probabilities = model(input_ids, attention_mask)
            # Flatten both sides so the metric functions receive plain 1-D label sequences.
            predictions.extend(probabilities.round().reshape(-1).cpu().numpy().astype(int))
            true_labels.extend(labels.reshape(-1).cpu().numpy().astype(int))

    f1 = f1_score(true_labels, predictions)
    acc = accuracy_score(true_labels, predictions)

    print(f"F1 Score: {f1:.4f}")
    print(f"Accuracy: {acc:.4f}")
    return f1, acc

In [176]:
def plot_stats(stats):
    """Plot train loss, F1 score and accuracy per epoch as three stacked subplots.

    Args:
        stats: Mapping from epoch to a dict with the keys 'Train Loss', 'F1'
            and 'Acc'.
    """
    epochs = list(stats.keys())
    train_loss = [entry['Train Loss'] for entry in stats.values()]
    f1_scores = [entry['F1'] for entry in stats.values()]
    accuracy = [entry['Acc'] for entry in stats.values()]

    fig = make_subplots(rows=3, cols=1, subplot_titles=("Train Loss", "F1 Score", "Accuracy"))

    fig.add_trace(go.Scatter(x=epochs, y=train_loss, name='Train Loss', mode='lines+markers', line=dict(color='blue')), row=1, col=1)

    fig.add_trace(go.Scatter(x=epochs, y=f1_scores, name='F1 Score', mode='lines+markers', line=dict(color='green')), row=2, col=1)

    fig.add_trace(go.Scatter(x=epochs, y=accuracy, name='Accuracy', mode='lines+markers', line=dict(color='green')), row=3, col=1)

    fig.update_layout(height=1200, width=600, title_text="Metrics Over Epochs", showlegend=False)
    # The figure has three rows; label the x-axis of the bottom one.
    fig.update_xaxes(title_text="Epochs", row=3, col=1)
    fig.update_yaxes(title_text="Train Loss", row=1, col=1)
    fig.update_yaxes(title_text="F1 Score", row=2, col=1)
    fig.update_yaxes(title_text="Accuracy", row=3, col=1)
    fig.show()

In [177]:
def custom_train_and_eval(train_data_df, test_data_df, freeze, preprocess, special_tokens, weighted_dataloader):
    """Train a RoBERTaBinaryClassifier and evaluate it after every epoch.

    After each epoch the model weights and the statistics collected so far
    are saved under ./models; a metrics figure is shown once training ends.

    Args:
        train_data_df: Training frame with 'text' and 'label' columns (and
            'country' / 'keyword' when `special_tokens` is True).
        test_data_df: Evaluation frame with the same columns.
        freeze: Passed to RoBERTaBinaryClassifier; also selects the batch
            size (32 when True, 2 otherwise).
        preprocess: When True both frames go through `apply_preprocessing`.
        special_tokens: When True both frames go through `add_tokens`.
        weighted_dataloader: True draws training samples with a
            WeightedRandomSampler and leaves the loss unweighted; False
            shuffles the data and weights the loss instead.
    """
    import os

    # Define device
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print("Device:", device)
    if str(device) == 'mps':
        torch.mps.empty_cache()

    # Hyperparameters
    MAX_LENGTH = 512
    BATCH_SIZE = 32 if freeze else 2
    LR = 1e-5
    EPOCHS = 20
    MODEL_DIR = "./models"

    # Create the checkpoint directory up front so that the first save, which
    # only happens after a full epoch, cannot fail on a missing folder.
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Preprocessing
    train_data_df = apply_preprocessing(train_data_df) if preprocess else train_data_df
    test_data_df = apply_preprocessing(test_data_df) if preprocess else test_data_df

    # Add special tokens
    train_data_df = add_tokens(train_data_df) if special_tokens else train_data_df
    test_data_df = add_tokens(test_data_df) if special_tokens else test_data_df

    # Define the tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Define datasets
    train_dataset = CustomDataset(list(train_data_df['text'].values), list(train_data_df['label'].values), tokenizer, MAX_LENGTH)
    test_dataset = CustomDataset(list(test_data_df['text'].values), list(test_data_df['label'].values), tokenizer, MAX_LENGTH)

    # Use either a weighted dataloader or a weighted loss
    if weighted_dataloader:
        samples_weight = torch.tensor([class_weights[int(l)] for l in train_dataset.labels])
        sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(train_dataset))
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
    else:
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = RoBERTaBinaryClassifier(freeze=freeze).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    # Multiply the learning rate by 0.9 every 5 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

    # The run configuration is encoded in the checkpoint file names
    model_name = f'model_{"Frozen" if freeze else "Unfrozen"}_{"PreProcessed" if preprocess else "NotPreProcessed"}_{"SpecialTokens" if special_tokens else "NoSpecialToken"}_{"WeightedDataloader" if weighted_dataloader else "WeightedLoss"}'

    train_stats = {}

    for epoch in range(EPOCHS):
        train_loss = train_model(model, train_loader, optimizer, device, weighted_dataloader)
        lr_scheduler.step()

        print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
        f1, acc = evaluate_on_test(model, test_loader, device)
        train_stats[epoch] = {
            'Train Loss': train_loss,
            'F1': f1,
            'Acc': acc
        }

        # One checkpoint per epoch (1-based suffix); the stats file is overwritten each time
        torch.save(model.state_dict(), f"{MODEL_DIR}/{model_name}_{epoch + 1}")
        torch.save(train_stats, f"{MODEL_DIR}/train_stats_{model_name}")

    # Figure for stats
    plot_stats(train_stats)

In [36]:
# Fine-tuning run with the encoder unfrozen, preprocessing, special tokens and a weighted sampler.
# NOTE(review): this configuration saves checkpoints named model_Unfrozen_PreProcessed_...,
# whereas the checkpoint loaded further down is a NotPreProcessed one — presumably produced by a
# different run of this cell; confirm before relying on the recorded metrics.
custom_train_and_eval(train_data_df=train_data_df, test_data_df=test_data_df, freeze=False, preprocess=True, special_tokens=True, weighted_dataloader=True)

Device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 4188/4188 [25:14<00:00,  2.77it/s]


Epoch 1/20, Train Loss: 0.3442


Evaluation: 100%|██████████| 1047/1047 [01:26<00:00, 12.14it/s]


F1 Score: 0.5050
Accurcay: 0.8586


Training: 100%|██████████| 4188/4188 [25:09<00:00,  2.78it/s]


Epoch 2/20, Train Loss: 0.1336


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.4913
Accurcay: 0.8329


Training: 100%|██████████| 4188/4188 [25:09<00:00,  2.77it/s]


Epoch 3/20, Train Loss: 0.0865


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5476
Accurcay: 0.9093


Training: 100%|██████████| 4188/4188 [25:05<00:00,  2.78it/s]


Epoch 4/20, Train Loss: 0.0568


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5450
Accurcay: 0.9155


Training: 100%|██████████| 4188/4188 [25:05<00:00,  2.78it/s]


Epoch 5/20, Train Loss: 0.0441


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5695
Accurcay: 0.9097


Training: 100%|██████████| 4188/4188 [25:06<00:00,  2.78it/s]


Epoch 6/20, Train Loss: 0.0342


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5227
Accurcay: 0.9198


Training: 100%|██████████| 4188/4188 [25:05<00:00,  2.78it/s]


Epoch 7/20, Train Loss: 0.0258


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.03it/s]


F1 Score: 0.5248
Accurcay: 0.9222


Training: 100%|██████████| 4188/4188 [25:06<00:00,  2.78it/s]


Epoch 8/20, Train Loss: 0.0283


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5360
Accurcay: 0.9107


Training: 100%|██████████| 4188/4188 [25:04<00:00,  2.78it/s]


Epoch 9/20, Train Loss: 0.0245


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5568
Accurcay: 0.9217


Training: 100%|██████████| 4188/4188 [25:06<00:00,  2.78it/s]


Epoch 10/20, Train Loss: 0.0223


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5614
Accurcay: 0.9164


Training: 100%|██████████| 4188/4188 [25:13<00:00,  2.77it/s]


Epoch 11/20, Train Loss: 0.0223


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.03it/s]


F1 Score: 0.5648
Accurcay: 0.9279


Training: 100%|██████████| 4188/4188 [25:05<00:00,  2.78it/s]


Epoch 12/20, Train Loss: 0.0144


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.03it/s]


F1 Score: 0.5745
Accurcay: 0.9236


Training: 100%|██████████| 4188/4188 [25:06<00:00,  2.78it/s]


Epoch 13/20, Train Loss: 0.0182


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.03it/s]


F1 Score: 0.5526
Accurcay: 0.9126


Training: 100%|██████████| 4188/4188 [25:09<00:00,  2.77it/s]


Epoch 14/20, Train Loss: 0.0141


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.03it/s]


F1 Score: 0.5523
Accurcay: 0.9265


Training: 100%|██████████| 4188/4188 [25:06<00:00,  2.78it/s]


Epoch 15/20, Train Loss: 0.0093


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5323
Accurcay: 0.9169


Training: 100%|██████████| 4188/4188 [25:05<00:00,  2.78it/s]


Epoch 16/20, Train Loss: 0.0027


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.5000
Accurcay: 0.9226


Training: 100%|██████████| 4188/4188 [25:06<00:00,  2.78it/s]


Epoch 17/20, Train Loss: 0.0110


Evaluation: 100%|██████████| 1047/1047 [01:27<00:00, 12.02it/s]


F1 Score: 0.3622
Accurcay: 0.9226


Training: 100%|██████████| 4188/4188 [24:37<00:00,  2.83it/s]


Epoch 18/20, Train Loss: 0.0076


Evaluation: 100%|██████████| 1047/1047 [01:26<00:00, 12.16it/s]


F1 Score: 0.5849
Accurcay: 0.9241


Training: 100%|██████████| 4188/4188 [24:26<00:00,  2.85it/s]


Epoch 19/20, Train Loss: 0.0084


Evaluation: 100%|██████████| 1047/1047 [01:25<00:00, 12.19it/s]


F1 Score: 0.5329
Accurcay: 0.9255


Training: 100%|██████████| 4188/4188 [48:18<00:00,  1.44it/s]    


Epoch 20/20, Train Loss: 0.0095


Evaluation: 100%|██████████| 1047/1047 [11:53<00:00,  1.47it/s]  


F1 Score: 0.5309
Accurcay: 0.9274


In [178]:
# Module-level settings for the evaluation cells below. custom_train_and_eval defines its own
# local tokenizer, device, MAX_LENGTH and BATCH_SIZE, so these do not affect training.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
MAX_LENGTH = 512
BATCH_SIZE = 32

In [179]:
# Rebuild the architecture, then restore the weights saved after epoch 18 of the
# unfrozen / not-preprocessed / special-tokens / weighted-dataloader run.
model = RoBERTaBinaryClassifier(freeze=False).to(device)
# map_location lets a checkpoint saved on one device type load on whichever device is in use here.
model.load_state_dict(torch.load("./models/model_Unfrozen_NotPreProcessed_SpecialTokens_WeightedDataloader_18", map_location=device))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [180]:
# Apply the same preprocessing and special tokenization as the model was trained on
preprocess = False
special_tokens = True
# Start from the raw split every time this cell runs. Rebinding `test_data_df` from
# itself would append the <e>...</e> tokens once more on every re-execution.
_, test_data_raw_df = split_train_test(original_train_data, train_csv_path, test_csv_path)
test_data_df = apply_preprocessing(test_data_raw_df) if preprocess else test_data_raw_df
test_data_df = add_tokens(test_data_df) if special_tokens else test_data_df
test_dataset = CustomDataset(list(test_data_df['text'].values), list(test_data_df['label'].values), tokenizer, MAX_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [181]:
def evaluate_on_test_grouped(model, test_data_df, key):
    """Evaluate `model` separately on each distinct value of column `key`.

    Uses the module-level `tokenizer`, `MAX_LENGTH`, `BATCH_SIZE` and `device`.

    Args:
        model: Module called as model(input_ids, attention_mask) whose output
            is rounded to obtain 0/1 predictions.
        test_data_df: Frame with 'text', 'label' and `key` columns.
        key: Name of the column to group by.

    Returns:
        Dict mapping each distinct value of `key`, in order of first
        appearance, to {"acc": accuracy, "f1": F1 score}.
    """
    model.eval()

    def _score_subset(subset, group_name):
        # Build a dataset and loader for this group only
        group_dataset = CustomDataset(list(subset['text'].values), list(subset['label'].values), tokenizer, MAX_LENGTH)
        group_loader = DataLoader(group_dataset, batch_size=BATCH_SIZE, shuffle=False)

        group_predictions, group_labels = [], []
        with torch.no_grad():
            for batch in tqdm(group_loader, desc=f"Evaluation for {group_name}"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                group_predictions.extend(outputs.round().cpu().numpy().astype(int))
                group_labels.extend(labels.cpu().numpy().astype(int))

        f1 = f1_score(group_labels, group_predictions)
        acc = accuracy_score(group_labels, group_predictions)
        return {"acc": acc, "f1": f1}

    scores_per_key = {}
    for unique_key in test_data_df[key].unique():
        scores_per_key[unique_key] = _score_subset(test_data_df[test_data_df[key] == unique_key], unique_key)
    return scores_per_key

In [183]:
# Accuracy and F1 of the loaded checkpoint, computed separately for each 'keyword' value.
scores_per_keyword = evaluate_on_test_grouped(model, test_data_df, 'keyword')
scores_per_keyword

Evaluation for hopeless: 100%|██████████| 7/7 [00:16<00:00,  2.30s/it]
Evaluation for refugee: 100%|██████████| 6/6 [00:08<00:00,  1.42s/it]
Evaluation for in-need: 100%|██████████| 8/8 [00:18<00:00,  2.32s/it]
Evaluation for homeless: 100%|██████████| 7/7 [00:10<00:00,  1.57s/it]
Evaluation for poor-families: 100%|██████████| 6/6 [00:07<00:00,  1.22s/it]
Evaluation for women: 100%|██████████| 8/8 [00:10<00:00,  1.29s/it]
Evaluation for immigrant: 100%|██████████| 7/7 [00:12<00:00,  1.77s/it]
Evaluation for migrant: 100%|██████████| 7/7 [00:12<00:00,  1.82s/it]
Evaluation for vulnerable: 100%|██████████| 7/7 [00:10<00:00,  1.52s/it]
Evaluation for disabled: 100%|██████████| 7/7 [00:16<00:00,  2.31s/it]


{'hopeless': {'acc': 0.8940092165898618, 'f1': 0.5818181818181819},
 'refugee': {'acc': 0.9202127659574468, 'f1': 0.4827586206896552},
 'in-need': {'acc': 0.9336283185840708, 'f1': 0.7945205479452054},
 'homeless': {'acc': 0.8820754716981132, 'f1': 0.5454545454545454},
 'poor-families': {'acc': 0.8368421052631579, 'f1': 0.5507246376811595},
 'women': {'acc': 0.9356223175965666, 'f1': 0.34782608695652173},
 'immigrant': {'acc': 0.981651376146789, 'f1': 0.6},
 'migrant': {'acc': 0.9806763285024155, 'f1': 0.5},
 'vulnerable': {'acc': 0.937799043062201, 'f1': 0.6285714285714286},
 'disabled': {'acc': 0.9278350515463918, 'f1': 0.4615384615384615}}

In [184]:
# Same breakdown, grouped by 'orig_label'.
# NOTE(review): F1 is 0.0 for groups '0' and '1' in the recorded output — presumably because
# those groups contain no positive binary labels, which makes accuracy the informative metric
# there; confirm.
scores_per_orig_label = evaluate_on_test_grouped(model, test_data_df, 'orig_label')
scores_per_orig_label

Evaluation for 3: 100%|██████████| 3/3 [00:11<00:00,  3.69s/it]
Evaluation for 4: 100%|██████████| 3/3 [00:04<00:00,  1.63s/it]
Evaluation for 2: 100%|██████████| 1/1 [00:00<00:00,  1.47it/s]
Evaluation for 0: 100%|██████████| 54/54 [01:10<00:00,  1.30s/it]
Evaluation for 1: 100%|██████████| 6/6 [00:08<00:00,  1.49s/it]


{'3': {'acc': 0.48314606741573035, 'f1': 0.6515151515151515},
 '4': {'acc': 0.6956521739130435, 'f1': 0.8205128205128205},
 '2': {'acc': 0.2777777777777778, 'f1': 0.4347826086956522},
 '0': {'acc': 0.9765258215962441, 'f1': 0.0},
 '1': {'acc': 0.8324607329842932, 'f1': 0.0}}

In [185]:
def plot_metrics_per_group(key, scores_per_key, metric='f1', with_sample=True):
    """Bar chart of one metric per group, optionally next to each group's training share.

    Uses the module-level `train_data_df` for the sample shares.

    Args:
        key: Column name the scores were grouped by.
        scores_per_key: Dict mapping each group value to a dict of metrics, as
            returned by `evaluate_on_test_grouped`.
        metric: Which entry of the per-group dict to plot (e.g. 'f1' or 'acc').
        with_sample: When True, each score bar (coral) is followed by a bar
            (royalblue) with the group's fraction of `train_data_df`; when
            False only the score bars (royalblue) are drawn.
    """
    # The training shares are only looked up when they are actually plotted.
    sample_counts = train_data_df[key].value_counts(normalize=True).to_dict() if with_sample else {}

    x_values, y_values, colors = [], [], []
    for keyword in scores_per_key.keys():
        x_values.append(f"{keyword} Score")
        y_values.append(scores_per_key[keyword][metric])
        if with_sample:
            colors.append('coral')
            x_values.append(f"{keyword} Sample")
            y_values.append(sample_counts[keyword])
        colors.append('royalblue')

    if with_sample:
        title_text = f'Score and Sample Percentages by {key} in the training data'
    else:
        title_text = f'{metric} score by {key}'

    fig = go.Figure()

    fig.add_trace(go.Bar(x=x_values, y=y_values, marker_color=colors))

    fig.update_layout(
        title_text=title_text,
        xaxis=dict(title='Keyword & Metric', tickangle=-45),
        yaxis=dict(title='Value'),
        width=1000,
        height=600,
        legend_title="Metric",
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
    )

    # Transparent bar outlines; the single trace is kept out of the legend
    fig.data[0].marker.line.color = 'rgba(0,0,0,0)'
    fig.data[0].marker.line.width = 0.5
    fig.data[0].showlegend = False

    fig.show()

In [163]:
# F1 per keyword, without the training-share bars.
plot_metrics_per_group('keyword', scores_per_keyword, 'f1', False)

In [140]:
# F1 per original label next to each label's share of the training data.
# `metric` and `with_sample` are passed explicitly because the function requires them.
plot_metrics_per_group('orig_label', scores_per_orig_label, 'f1', True)

In [186]:
def labels2file(p, outf_path):
    """Write one line per entry of `p` to `outf_path`, joining the entry's items with commas.

    Args:
        p: Iterable of iterables; every inner item is converted with str().
        outf_path: Path of the file to create or overwrite.
    """
    with open(outf_path, 'w') as outf:
        outf.writelines(','.join(map(str, pi)) + '\n' for pi in p)

In [189]:
def evaluate_on_test_and_save_to_file(model, test_loader, device, outf_path="dev.txt"):
    """Evaluate `model`, print its F1 score and write the predictions to a file.

    Args:
        model: Module called as model(input_ids, attention_mask) that returns
            probabilities; they are rounded to obtain 0/1 predictions.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'label' entries.
        device: Device the batch tensors are moved to.
        outf_path: File the predictions are written to via `labels2file`,
            one line per sample. Defaults to "dev.txt".
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            probabilities = model(input_ids, attention_mask)
            # Each prediction stays a small array because labels2file joins the items of every entry.
            predictions.extend(probabilities.round().cpu().numpy().astype(int))
            true_labels.extend(labels.cpu().numpy().astype(int))

    f1 = f1_score(true_labels, predictions)
    print(f"Final f1 score: {f1}")
    labels2file(predictions, outf_path)

In [190]:
# Rebuild the full evaluation dataset and loader from `test_data_df` as it currently stands
# (the grouped evaluation above only created loaders local to that function).
test_dataset = CustomDataset(list(test_data_df['text'].values), list(test_data_df['label'].values), tokenizer, MAX_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [192]:
# Final evaluation of the loaded model over `test_loader`; the rounded predictions go to dev.txt.
# This repeats the body of evaluate_on_test_and_save_to_file at module level, which leaves
# `predictions`, `true_labels` and `f1` available to later cells.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluation"):
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'label')
        )

        probabilities = model(input_ids, attention_mask)
        batch_predictions = probabilities.round().cpu().numpy().astype(int)
        batch_labels = labels.cpu().numpy().astype(int)
        predictions.extend(batch_predictions)
        true_labels.extend(batch_labels)

f1 = f1_score(true_labels, predictions)
print(f"Final f1 score{f1}")
labels2file(predictions, "dev.txt")

Evaluation: 100%|██████████| 66/66 [01:31<00:00,  1.39s/it]


Final f1 score0.5848563968668408


In [207]:
# Load the unlabeled test set from ./task4_test.tsv. This rebinds `dpm`, replacing the
# loader instance that produced the task 1 training data.
dpm = DontPatronizeMe('.', './task4_test.tsv')
dpm.load_test()
# presumably load_test() is what populates `test_set_df` — confirm against dont_patronize_me.py
final_test_df = dpm.test_set_df
final_test_df

Unnamed: 0,par_id,art_id,keyword,country,text
0,t_0,@@7258997,vulnerable,us,"In the meantime , conservatives are working to..."
1,t_1,@@16397324,women,pk,In most poor households with no education chil...
2,t_2,@@16257812,migrant,ca,The real question is not whether immigration i...
3,t_3,@@3509652,migrant,gb,"In total , the country 's immigrant population..."
4,t_4,@@477506,vulnerable,ca,"Members of the church , which is part of Ken C..."
...,...,...,...,...,...
3827,t_3893,@@20319448,migrant,jm,In a letter dated Thursday to European Commiss...
3828,t_3894,@@9990672,poor-families,au,They discovered that poor families with health...
3829,t_3895,@@37984,migrant,ca,"She married at 19 , to Milan ( Emil ) Badovina..."
3830,t_3896,@@9691377,immigrant,us,The United Kingdom is n't going to devolve int...
