In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
# Read the three dataset splits (train, test, validation) from CSV, in that order.
train_set, test_set, val_set = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv', '../data/val.csv')
)

In [3]:
# Encode the string class labels in 'preferred' as integer ids.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['preferred']`: that chained in-place
# form operates on a temporary Series and does not update the DataFrame when
# pandas Copy-on-Write is active (it already warns on pandas 2.2).
# Values that are not keys of the mapping are left unchanged, so re-running
# this cell on already-encoded frames is harmless.
label_to_id = {'T': 0, 'N': 1, 'O': 2}
for split in (train_set, val_set, test_set):
    split['preferred'] = split['preferred'].replace(label_to_id)

In [4]:
# Candidate pretrained checkpoint names; the selected one is passed to
# `from_pretrained` when the model and tokenizer are loaded.
model_list = [
    'distilbert-base-uncased',
    'xlnet-base-cased',
    'roberta-base',
    'roberta-large',
    'albert-base-v2',
    'microsoft/deberta-base',
]

# Choose the model you want to use by index. Index 0 selects
# 'distilbert-base-uncased'; use index 3 for 'roberta-large'.
cmodel = model_list[0]

In [5]:
# Each name is imported once (the original listed
# AutoModelForSequenceClassification twice in the same statement).
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the selected checkpoint with a 3-way classification head
# (num_labels=3 matches the three label ids 0/1/2 used for 'preferred'),
# together with the tokenizer for the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(cmodel, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(cmodel)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

### Create dataset object

In [6]:
import torch

class getDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their labels.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(name, values)`` pairs,
            where ``values[idx]`` is the data for example ``idx``.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field, followed by the
        example's label under the key ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

#tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the 'proc_sent' text of every split. Each sequence is truncated
# and padded to exactly 100 tokens.
train_encodings = tokenizer(
    train_set['proc_sent'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=100,
)
val_encodings = tokenizer(
    val_set['proc_sent'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=100,
)
test_encodings = tokenizer(
    test_set['proc_sent'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=100,
)

# Pair each split's encodings with its 'preferred' labels.
train_dataset = getDataset(train_encodings, train_set['preferred'].tolist())
val_dataset = getDataset(val_encodings, val_set['preferred'].tolist())
test_dataset = getDataset(test_encodings, test_set['preferred'].tolist())

### Fine-tuning the pre-trained model

In [7]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and output locations for fine-tuning.
# NOTE(review): the saved output of this cell reports "Num Epochs = 2" while
# num_train_epochs is 20 here; confirm which value produced the reported results.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=1000,
    # training schedule
    num_train_epochs=20,
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,
    # batch sizes are per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# The Trainer fine-tunes `model` on the training split; the validation split
# is registered as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

***** Running training *****
  Num examples = 5334
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 334
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mamanul002[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=334, training_loss=0.8300920132391467, metrics={'train_runtime': 58.6827, 'train_samples_per_second': 181.791, 'train_steps_per_second': 5.692, 'total_flos': 276013166133600.0, 'train_loss': 0.8300920132391467, 'epoch': 2.0})

In [8]:
# Uncomment to save the fine-tuned model with the proper filepath
#filepath = "../models/" + str(cmodel)
#trainer.save_model(filepath)

### Performance Evaluation on Pixie

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Re-tokenize the complete test split and wrap it together with its labels.
test_encodings = tokenizer(
    test_set['proc_sent'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=100,
)
test_dataset = getDataset(test_encodings, test_set['preferred'].tolist())

# Predicted class = index of the highest score in each prediction row.
outputs = trainer.predict(test_dataset)
y_pred = outputs.predictions.argmax(1)
y_test = test_dataset.labels

# Per-class precision/recall/F1, then the confusion matrix.
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))

***** Running Prediction *****
  Num examples = 1778
  Batch size = 32


              precision    recall  f1-score   support

           0     0.8148    0.7955    0.8050       802
           1     0.5096    0.5728    0.5394       323
           2     0.7595    0.7351    0.7471       653

    accuracy                         0.7328      1778
   macro avg     0.6947    0.7011    0.6972      1778
weighted avg     0.7391    0.7328    0.7355      1778

[[638  78  86]
 [ 72 185  66]
 [ 73 100 480]]


### Performance evaluation for Implicit Comparisons

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

# Keep only the test rows whose 'comparison' value is 1 (the implicit
# comparisons, per this section's heading).
implicit_rows = test_set[test_set['comparison'] == 1]
X_test_imp = implicit_rows['proc_sent']
y_test_imp = implicit_rows['preferred']

# Tokenize the subset with the same settings used for the other splits.
test_encodings = tokenizer(
    X_test_imp.tolist(),
    truncation=True,
    padding='max_length',
    max_length=100,
)
test_dataset = getDataset(test_encodings, y_test_imp.tolist())

# Predicted class = index of the highest score in each prediction row.
outputs = trainer.predict(test_dataset)
y_pred_imp = outputs.predictions.argmax(1)

print(classification_report(y_test_imp, y_pred_imp, digits=4))
print(confusion_matrix(y_test_imp, y_pred_imp))

***** Running Prediction *****
  Num examples = 1004
  Batch size = 32


              precision    recall  f1-score   support

           0     0.8074    0.7806    0.7938       392
           1     0.4706    0.5000    0.4848       160
           2     0.7890    0.7942    0.7916       452

    accuracy                         0.7420      1004
   macro avg     0.6890    0.6916    0.6901      1004
weighted avg     0.7454    0.7420    0.7436      1004

[[306  32  54]
 [ 38  80  42]
 [ 35  58 359]]


In [11]:
### Performance evaluation for Explicit Comparisons

In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Keep only the test rows whose 'comparison' value is 2 (the explicit
# comparisons, per this section's heading).
explicit_rows = test_set[test_set['comparison'] == 2]
X_test_exp = explicit_rows['proc_sent']
y_test_exp = explicit_rows['preferred']

# Tokenize the subset with the same settings used for the other splits.
test_encodings = tokenizer(
    X_test_exp.tolist(),
    truncation=True,
    padding='max_length',
    max_length=100,
)
test_dataset = getDataset(test_encodings, y_test_exp.tolist())

# Predicted class = index of the highest score in each prediction row.
outputs = trainer.predict(test_dataset)
y_pred_exp = outputs.predictions.argmax(1)

print(classification_report(y_test_exp, y_pred_exp, digits=4))
print(confusion_matrix(y_test_exp, y_pred_exp))

***** Running Prediction *****
  Num examples = 774
  Batch size = 32


              precision    recall  f1-score   support

           0     0.8218    0.8098    0.8157       410
           1     0.5440    0.6442    0.5899       163
           2     0.6836    0.6020    0.6402       201

    accuracy                         0.7209       774
   macro avg     0.6831    0.6853    0.6819       774
weighted avg     0.7274    0.7209    0.7226       774

[[332  46  32]
 [ 34 105  24]
 [ 38  42 121]]
