# Teacher fine-tuning

## Required packages

In [None]:
%%time
%%capture

# Install the third-party packages this notebook needs.
# NOTE(review): the installs are unpinned, so a fresh run may pull releases whose
# APIs differ from the ones the notebook was written against — consider pinning.

!pip install transformers
# !pip install datasets
# !pip install fairseq
!pip install sentencepiece

## Initialization

In [None]:
# Import required packages

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.utils import shuffle

from numpy.lib.function_base import average

from tqdm.notebook import tqdm

from collections import Counter

import os
import re
import json
import copy
import collections
import time
import pickle

from transformers import BertConfig, BertTokenizer, BertweetTokenizer, RobertaTokenizer, AlbertTokenizer, DistilBertTokenizer, XLMRobertaTokenizer, XLNetTokenizer, T5Tokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, DistilBertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification, XLMRobertaForSequenceClassification, XLNetForSequenceClassification, T5Model
from transformers import TrainingArguments
from transformers import Trainer
# from fairseq.models.roberta import XLMRModel

In [None]:
# Register tqdm's pandas integration so `progress_apply` can be used on pandas objects below.
tqdm.pandas()

## Train, evaluation, and test sets

In [None]:
# --- Data locations and column names -----------------------------------------
TRAIN_PATH = '../input/testinput-1/train.tsv'
TEST_PATH = '../input/testinput-1/test.tsv'

CONTENT_HEADER = 'sentence'
LABEL_HEADER = 'label'

# --- General training configuration ------------------------------------------
MAX_LEN = 256            # max_length handed to the tokenizer
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

EPOCHS = 1
EVERY_EPOCH = 500        # forwarded to TrainingArguments as eval_steps
LEARNING_RATE = 5e-5

# Hugging Face checkpoint identifiers that can be selected through MODEL_INDEX.
MODELS = ['bert-base-uncased', 'bert-large-uncased',
          'roberta-base', 'roberta-large', 'cardiffnlp/twitter-roberta-base-sentiment',
          'xlm-roberta-large',
          'xlnet-base-cased', 'xlnet-large-cased',
          't5-base', 't5-large',
          'gpt2-medium']

# File-system friendly aliases ('/' replaced by '-'). They are derived from MODELS
# so the two lists cannot drift apart; a hand-maintained copy is how 't5-large'
# ended up spelled 'tf-large'.
MODEL_NAMES = [checkpoint.replace('/', '-') for checkpoint in MODELS]

MODEL_INDEX = 0
# NOTE(review): these resolve to files directly under the filesystem root
# (e.g. '/modelbert-base-uncased.bin'); a separator or directory may be missing —
# confirm the intended location before using them.
OUTPUT_PATH = '/model' + MODEL_NAMES[MODEL_INDEX] + '.bin'
MODEL_PATH = '/model' + MODEL_NAMES[MODEL_INDEX] + '.pkl'

# CSV files shared across runs: per-model eval predictions and the results table.
EVAL_FILE = 'evaluations.csv'
MODEL_RESULTS_FILE = 'model_results.csv'


### Train

In [None]:
# Read the training TSV and cast every label with int().
train = pd.read_csv(TRAIN_PATH, sep='\t')
train[LABEL_HEADER] = train[LABEL_HEADER].progress_apply(int)

# Hold out 5,000 random rows for evaluation, then keep a random 30,000-row
# subset of the remaining rows for training. Both draws use random_state=42.
# NOTE: `eval` shadows the Python builtin; the name is kept because later
# cells read it.
eval = train.sample(n=5000, random_state=42)
train = train.drop(index=eval.index).sample(n=30000, random_state=42)

train.head()

In [None]:
# Preview the held-out evaluation rows.
eval.head()

### Showing class distributions

In [None]:
# Tally the training rows per class: label 1 is reported as positive, label 0 as negative.
is_positive = train[LABEL_HEADER] == 1
is_negative = train[LABEL_HEADER] == 0
pos_sentiment_count = len(train[is_positive])
neg_sentiment_count = len(train[is_negative])

# One-row table holding both counts.
train_distribution = pd.DataFrame(
    {'positive': [pos_sentiment_count], 'negative': [neg_sentiment_count]}
)

train_distribution

### Test

In [None]:
# Read the tab-separated test file and preview its first rows.
test = pd.read_csv(TEST_PATH,sep='\t')
test.head()

### Train, eval, and test data lists

In [None]:
# Flatten each split into plain Python lists of sentences and labels.
x_train = train[CONTENT_HEADER].values.tolist()
y_train = train[LABEL_HEADER].values.tolist()

x_eval = eval[CONTENT_HEADER].values.tolist()
y_eval = eval[LABEL_HEADER].values.tolist()

x_test = test[CONTENT_HEADER].values.tolist()
# No label column is read for the test split; every test sentence gets the
# placeholder label 1 so it can go through the same dataset wrapper.
y_test = [1 for _ in x_test]

### Showing train distribution

In [None]:
# Count how often each label occurs in the training labels. A Counter is built
# once and reports 0 for a label that never occurs, so a split that lacks one
# class shows 0 instead of raising KeyError.
train_val_dict = Counter(y_train)
train_keys = list(train_val_dict.keys())
train_values = list(train_val_dict.values())

# One-row table: label 1 is reported as positive, label 0 as negative.
train_distribution = pd.DataFrame(
    {'positive': [train_val_dict[1]], 'negative': [train_val_dict[0]]}
)

train_distribution

In [None]:
# Count how often each label occurs in the evaluation labels. A Counter is built
# once and reports 0 for a label that never occurs, so a split that lacks one
# class shows 0 instead of raising KeyError.
eval_val_dict = Counter(y_eval)
eval_keys = list(eval_val_dict.keys())
eval_values = list(eval_val_dict.values())

# One-row table: label 1 is reported as positive, label 0 as negative.
eval_distribution = pd.DataFrame(
    {'positive': [eval_val_dict[1]], 'negative': [eval_val_dict[0]]}
)

eval_distribution

## Configuration values

In [None]:
# Load the selected checkpoint with a two-class classification head.
# AutoModelForSequenceClassification resolves the architecture from the
# checkpoint's own config instead of forcing every entry of MODELS into the BERT
# class, matching the AutoTokenizer used for the tokenizer.
# NOTE(review): checkpoints that already ship a head with a different number of
# labels may need extra loading options — verify before switching MODEL_INDEX.
model = AutoModelForSequenceClassification.from_pretrained(MODELS[MODEL_INDEX], num_labels=2)

In [None]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODELS[MODEL_INDEX])

## Creating dataset

In [None]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Despite the name, the notebook uses it for the train, eval and test splits.

    Args:
        encodings: mapping from field name to a per-example sequence of values;
            only ``.items()`` and integer indexing of each value are required.
        labels: sequence of labels aligned with the encodings; its length
            defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

## Tokenization

In [None]:
# Tokenize all training sentences in one call, with padding enabled and
# truncation to at most MAX_LEN tokens.
train_encodings = tokenizer(x_train, padding=True, truncation=True, max_length=MAX_LEN)

In [None]:
# train_encodings.keys()
# train_encodings['input_ids'][10]

In [None]:
# Tokenize the evaluation sentences with padding enabled and truncation to at
# most MAX_LEN tokens.
eval_encodings = tokenizer(x_eval, padding=True, truncation=True, max_length=MAX_LEN)

In [None]:
# start_time = time.time()

# Tokenize the test sentences with padding enabled and truncation to at most
# MAX_LEN tokens. The commented-out lines around this call are an optional timing probe.
test_encodings = tokenizer(x_test, padding=True, truncation=True, max_length=MAX_LEN)

# end_time = time.time()
# print(end_time - start_time)

In [None]:
# Wrap the training encodings and labels in a torch Dataset.
train_dataset = TrainDataset(train_encodings, y_train)

In [None]:
# Wrap the evaluation encodings and labels in a torch Dataset.
eval_dataset = TrainDataset(eval_encodings, y_eval)

In [None]:
# Wrap the test encodings in a torch Dataset; y_test only holds placeholder labels.
test_dataset = TrainDataset(test_encodings, y_test)

## Fine tuning

In [None]:
# Trainer hyper-parameters for the selected model; the output directory is
# 'output' followed by the model alias.
# NOTE(review): the TrainingArguments docs describe eval_steps as applying to the
# "steps" strategy, so with evaluation_strategy="epoch" the EVERY_EPOCH value is
# presumably unused — confirm which evaluation schedule is intended.
training_args = TrainingArguments(
    output_dir='output' + MODEL_NAMES[MODEL_INDEX],
    evaluation_strategy="epoch",
    eval_steps=EVERY_EPOCH,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
)

In [None]:
# Load the running table of eval-set predictions (compute_metrics adds one
# column per model). When the file is missing or empty, start a new table that
# holds only the true labels and write it out. Only those two conditions are
# handled, so any other failure (permissions, a malformed CSV, ...) surfaces
# instead of being silently overwritten.
try:
    eval_df = pd.read_csv(EVAL_FILE)
except (FileNotFoundError, pd.errors.EmptyDataError):
    eval_df = pd.DataFrame(y_eval, columns=['real_val'])
    eval_df.to_csv(EVAL_FILE, index=False)

In [None]:
def compute_metrics(p):
    """Score one evaluation pass and log the predicted classes to EVAL_FILE.

    Args:
        p: ``(predictions, labels)`` pair as handed over by the ``Trainer``.
            ``predictions`` holds per-class scores with the classes on axis 1.
            If it arrives as a tuple of several model outputs, the first entry
            is used (assumed to be the logits).

    Returns:
        dict with ``accuracy``, weighted ``f1_score`` and micro-averaged
        ``precision``.

    Side effects:
        Stores the predicted class ids in ``eval_df`` under the active model's
        alias and rewrites ``EVAL_FILE``.
    """
    pred, labels = p
    # Some models return extra outputs next to the logits; argmax over the whole
    # tuple would fail, so keep only the first element.
    if isinstance(pred, tuple):
        pred = pred[0]
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # Micro-averaged precision equals accuracy when every sample has exactly one
    # true and one predicted class; kept so the returned keys stay unchanged.
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(labels, pred, average='weighted')

    # Keep this model's predictions next to the true labels for later comparison.
    eval_df[MODEL_NAMES[MODEL_INDEX]] = pred
    eval_df.to_csv(EVAL_FILE, index=False)

    return {"accuracy": accuracy, "f1_score": f1, "precision": precision}

In [None]:
# Wire the model, hyper-parameters, datasets and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned object (its .metrics dict is read below) is kept for the results table.
training_metrics = trainer.train()

In [None]:
# Run an evaluation pass and keep the returned metrics dict.
evaluate_metrics = trainer.evaluate()

In [None]:
# Display the value returned by trainer.train().
training_metrics

## Evaluation

In [None]:
# Display the metrics dict returned by trainer.evaluate().
evaluate_metrics

### Saving results in model_results.csv

In [None]:
# Pull the headline numbers out of the training and evaluation outputs.
train_loss = training_metrics.metrics['train_loss']
eval_accuracy = evaluate_metrics['eval_accuracy']
eval_f1 = evaluate_metrics['eval_f1_score']
eval_loss = evaluate_metrics['eval_loss']

# One-row summary for this run. Losses are formatted to two decimals; accuracy
# and F1 are scaled to percentages first. The formatted values are strings.
result_metrics = pd.DataFrame({
    'type': [MODEL_NAMES[MODEL_INDEX]],
    'train_loss': [f"{train_loss:.2f}"],
    'eval_loss': [f"{eval_loss:.2f}"],
    'eval_accuracy': [f"{eval_accuracy * 100:.2f}"],
    'eval_f1': [f"{eval_f1 * 100:.2f}"],
    'embedding_len': [MAX_LEN],
    'learning_rate': [LEARNING_RATE],
    'batch_size': [TRAIN_BATCH_SIZE],
})
result_metrics

In [None]:
# Append this run's row to the shared results file, creating the file on first use.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed. Only a missing
# or empty results file is treated as "start fresh"; any other error propagates
# instead of being swallowed.
try:
    previous_results = pd.read_csv(MODEL_RESULTS_FILE)
except (FileNotFoundError, pd.errors.EmptyDataError):
    model_df = result_metrics.copy()
else:
    model_df = pd.concat([previous_results, result_metrics], ignore_index=True)

model_df.to_csv(MODEL_RESULTS_FILE, index=False)

In [None]:
# Re-read the results file from disk and show up to its first 50 rows.
model_df = pd.read_csv(MODEL_RESULTS_FILE)
model_df.head(50)