# Post-Processing

In [None]:
%pip install datasets transformers scikit-learn pandas torch simpletransformers scipy wandb

In [2]:
import os
# Runtime switches; they are set here, before torch / simpletransformers are
# imported in the following cell.
os.environ.update({
    # "1" makes CUDA kernel launches synchronous so device-side errors are
    # reported at the call that triggered them (debug aid; costs throughput).
    'CUDA_LAUNCH_BLOCKING': "1",
    # Explicitly opt in to the tokenizers library's internal parallelism.
    "TOKENIZERS_PARALLELISM": "true",
})

In [4]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
import pandas as pd
import logging
import torch

In [5]:
# Confirm CUDA is available. The flag is stored because it is passed as
# `use_cuda` when the ClassificationModel is constructed further down.
cuda_available = torch.cuda.is_available()
cuda_available

True

In [6]:
# Input CSV directory and model output directory.
# The defaults are the original hard-coded locations; set NLI_DATA_DIR and/or
# NLI_OUTPUT_DIR in the environment to run the notebook on another machine
# without editing this cell.
ROOT_CSV_PATH = os.environ.get('NLI_DATA_DIR', '/home/nli/data')
OUTPUT_PATH = os.environ.get('NLI_OUTPUT_DIR', '/home/nli/outputs')

In [7]:
# Both splits are semicolon-separated, UTF-8 encoded CSV files under ROOT_CSV_PATH.
# NOTE(review): the frame called "dev" is read from df_snli_train.csv and is the
# one later passed to train_model -- confirm the naming is intentional.
train_csv_path = os.path.join(ROOT_CSV_PATH, 'df_snli_train.csv')
test_csv_path = os.path.join(ROOT_CSV_PATH, 'df_snli_test.csv')

df_snli_mt_dev = pd.read_csv(train_csv_path, sep=";", encoding='utf-8')
df_snli_mt_eval = pd.read_csv(test_csv_path, sep=";", encoding='utf-8')

In [8]:
# "Unnamed: ..." is the placeholder name pandas gives to header-less CSV
# columns (typically an index that was written out on an earlier save).
# They are selected by name rather than with `columns[:2]` so that this cell
# stays safe to re-run: once the columns have been dropped the selection is
# simply empty, whereas a positional slice would then pick real data columns.
dev_columns = df_snli_mt_dev.columns
cols_to_drop = dev_columns[dev_columns.astype(str).str.startswith('Unnamed')]
cols_to_drop

Index(['Unnamed: 0.1', 'Unnamed: 0'], dtype='object')

In [9]:
# Remove the leftover index columns from both splits. The names were taken from
# the training frame, so the evaluation frame must carry the same columns.
df_snli_mt_dev = df_snli_mt_dev.drop(columns=cols_to_drop)
df_snli_mt_eval = df_snli_mt_eval.drop(columns=cols_to_drop)

In [10]:
# Preview the first five rows after dropping the index columns.
df_snli_mt_dev.head()

Unnamed: 0,premise,hypothesis,classification
0,Persuna fuq żiemel taqbeż minn fuq ajruplan im...,Persuna qed titħarreġ iż-żiemel tagħha għal ko...,neutral
1,Persuna fuq żiemel taqbeż minn fuq ajruplan im...,"Persuna tkun f'diner, tordna omelette.",contradiction
2,Persuna fuq żiemel taqbeż minn fuq ajruplan im...,"Persuna tkun barra, fuq żiemel.",entailment
3,Tfal jitbissmu u jxejru mal-kamera,Qed jitbissmu lejn il-ġenituri tagħhom,neutral
4,Tfal jitbissmu u jxejru mal-kamera,Hemm tfal preżenti,entailment


In [11]:
# Rename the three remaining columns (positionally) to the
# text_a / text_b / labels names used for sentence-pair input.
for split_frame in (df_snli_mt_dev, df_snli_mt_eval):
    split_frame.columns = ["text_a", "text_b", "labels"]

In [12]:
def map_to_num(label):
  """Convert an NLI class name into the integer id used for training.

  Args:
    label: One of 'entailment', 'neutral' or 'contradiction'.

  Returns:
    0 for 'entailment', 1 for 'neutral', 2 for 'contradiction'.

  Raises:
    ValueError: If ``label`` is anything else (missing value, typo,
      different casing, a "-" no-gold-label marker, ...). Such values used
      to be mapped silently to 1 (neutral), which would inject wrong
      training labels without any warning.
  """
  if label == 'entailment':
    return 0
  if label == 'neutral':
    return 1
  if label == 'contradiction':
    return 2
  raise ValueError(f"Unknown NLI label: {label!r}")

def map_to_label(num):
  """Translate a numeric class id back into its NLI class name.

  0 gives "entailment" and 2 gives "contradiction"; every other value
  is reported as "neutral".
  """
  return "entailment" if num == 0 else ("contradiction" if num == 2 else "neutral")

In [13]:
# Encode the class names of the training frame as integer ids.
dev_labels = list(map(map_to_num, df_snli_mt_dev['labels']))
df_snli_mt_dev['labels'] = pd.Series(dev_labels, index=df_snli_mt_dev.index).astype(int)

In [14]:
# Encode the class names of the evaluation frame as integer ids.
eval_labels = list(map(map_to_num, df_snli_mt_eval['labels']))
df_snli_mt_eval['labels'] = pd.Series(eval_labels, index=df_snli_mt_eval.index).astype(int)

In [15]:
# Check the renamed columns and the integer-encoded labels on the first ten rows.
df_snli_mt_dev.head(n=10)

Unnamed: 0,text_a,text_b,labels
0,Persuna fuq żiemel taqbeż minn fuq ajruplan im...,Persuna qed titħarreġ iż-żiemel tagħha għal ko...,1
1,Persuna fuq żiemel taqbeż minn fuq ajruplan im...,"Persuna tkun f'diner, tordna omelette.",2
2,Persuna fuq żiemel taqbeż minn fuq ajruplan im...,"Persuna tkun barra, fuq żiemel.",0
3,Tfal jitbissmu u jxejru mal-kamera,Qed jitbissmu lejn il-ġenituri tagħhom,1
4,Tfal jitbissmu u jxejru mal-kamera,Hemm tfal preżenti,0
5,Tfal jitbissmu u jxejru mal-kamera,It-tfal qed iqarqu,2
6,Tifel qed jaqbeż fuq skateboard f’nofs pont aħ...,It-tifel jiskejkja mal-bankina.,2
7,Tifel qed jaqbeż fuq skateboard f’nofs pont aħ...,It-tifel jagħmel trick skateboarding.,0
8,Tifel qed jaqbeż fuq skateboard f’nofs pont aħ...,It-tifel qed jilbes tagħmir tas-sigurtà.,1
9,Raġel anzjan joqgħod bil-meraq tal-larinġ tieg...,Raġel anzjan jixrob il-meraq tiegħu waqt li ji...,1


In [16]:
# Fine-tuning configuration for the simpletransformers classifier.
model_args = ClassificationArgs()

# --- Optimisation ---------------------------------------------------------
model_args.num_train_epochs = 4
model_args.learning_rate = 5e-5
model_args.train_batch_size = 4
model_args.gradient_accumulation_steps = 2  # 4 x 2 = 8 examples per optimiser step
model_args.fp16 = True
model_args.max_seq_length = 512
# model_args.manual_seed = 4

# --- Multiprocessing: everything switched off -----------------------------
model_args.use_multiprocessing = False
model_args.use_multiprocessing_for_evaluation = False
model_args.use_multiprocessed_decoding = False

# --- Caching and output ---------------------------------------------------
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.output_dir = OUTPUT_PATH

# --- Evaluation during training -------------------------------------------
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 50000
model_args.evaluate_during_training_verbose = True

# --- Checkpointing --------------------------------------------------------
model_args.save_eval_checkpoints = False
model_args.save_steps = 250000

# --- Experiment tracking --------------------------------------------------
model_args.wandb_project = 'dissertation'

In [17]:
# Build a 3-class sequence classifier on top of the MLRS/BERTu checkpoint;
# the GPU is used only when the earlier CUDA check succeeded.
model = ClassificationModel(
    "bert",
    "MLRS/BERTu",
    num_labels=3,
    args=model_args,
    use_cuda=cuda_available,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at MLRS/BERTu and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of `preds` against `labels`."""
    macro_f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    return macro_f1

def recall_multiclass(labels, preds):
    """Return the macro-averaged recall of `preds` against `labels`."""
    macro_recall = recall_score(y_true=labels, y_pred=preds, average='macro')
    return macro_recall

def precision_multiclass(labels, preds):
    """Return the macro-averaged precision of `preds` against `labels`."""
    macro_precision = precision_score(y_true=labels, y_pred=preds, average='macro')
    return macro_precision

In [None]:
# Fine-tune on the training frame. The keyword arguments register extra
# metric callbacks that are computed on `eval_df` alongside the built-in ones.
model.train_model(
    df_snli_mt_dev,
    eval_df=df_snli_mt_eval,
    precision=precision_multiclass,
    f1=f1_multiclass,
    recall=recall_multiclass,
    acc=accuracy_score,
)

In [20]:
# Score the fine-tuned model on the evaluation frame.
# The same metric callbacks that were given to train_model are passed here, so
# `result` reports precision / f1 / recall / acc next to the built-in metrics
# instead of leaving them to be re-derived by hand from `wrong_predictions`.
result, model_outputs, wrong_predictions = model.eval_model(
    df_snli_mt_eval,
    precision=precision_multiclass,
    f1=f1_multiclass,
    recall=recall_multiclass,
    acc=accuracy_score,
)

Running Evaluation:   0%|          | 0/1228 [00:00<?, ?it/s]

VBox(children=(Label(value='0.026 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.7323070286592435, max=1.0…

0,1
Training loss,█▄▄▅▆▂▂▂▁▃▄▃▄▂▃▃▂▁▃▂▄▃▃▃▂▂▂▄▁▂▂▁▁▁▂▃▃▁▂▁
acc,▁▅█▇▇
eval_loss,▃▁▁▄█
f1,▁▅█▇▇
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▄████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
mcc,▁▅█▇▇
precision,▁▅█▇▇
recall,▁▅█▇▇
train_loss,█▁▂▅▁

0,1
Training loss,0.09474
acc,0.85882
eval_loss,0.53201
f1,0.85857
global_step,42920.0
lr,0.0
mcc,0.78823
precision,0.85858
recall,0.85861
train_loss,0.00584


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112041947328382, max=1.0…

In [21]:
# Accuracy = share of evaluated examples that are not listed in wrong_predictions.
n_wrong = len(wrong_predictions)
n_total = len(model_outputs)
print(f"Accuracy: {100*(1-n_wrong/n_total)}%")

Accuracy: 85.88151465798046%


In [None]:
# Spot-check the model on one hand-written premise / hypothesis pair and show
# the predicted class name.
sample_pair = [
    "Tifel jilgħab bil-ballun f'nofs ta' triq",
    "Tifel rieqed fis-sodda",
]
predictions, raw_outputs = model.predict([sample_pair])
map_to_label(predictions[0])