## Sentiment Classifier
This script trains a sentiment classifier using either the student or teacher embeddings generated from the tweets

### Setup
Needs latest version of sklearn

In [1]:
!pip uninstall scikit-learn -y

Found existing installation: scikit-learn 0.24.1
Uninstalling scikit-learn-0.24.1:
  Successfully uninstalled scikit-learn-0.24.1


In [2]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp36-cp36m-manylinux2010_x86_64.whl (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 19.2 MB/s eta 0:00:01    |█▋                              | 1.1 MB 19.2 MB/s eta 0:00:02     |██████████▍                     | 7.2 MB 19.2 MB/s eta 0:00:01
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.24.2


In [1]:
# Imports
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import sklearn.linear_model
import pickle
from utils import load_csv, read_torch
from distil_funcs import *
from utils import load_pickle
import pickle
import random
from random import shuffle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegressionCV
from tqdm import tqdm

In [2]:
# Load Teacher Model for Evaluation
DEVICE = torch.device('cpu')
teacher_model = load_teacher(device=DEVICE)

# Load Student Model for Evaluation

student_config = {
    'd_model': 768, # hidden dim of model
    'heads': 12, # attention heads
    'dropout':0.1, # dropout in network except ffn
    'dropout_ffn':0.4, # dropout in ffn 
    'd_ff': 96, # num features in FFN hidden layer
    'n_layers': 2, # num of transformer layers
    'n_experts': 40, # number of FFN experts
    'load_balancing_loss_ceof': 0.01, # load balancing co-eff, encourages expert diversity
    'is_scale_prob': True, # whether to scale the selected expert outputs by routing probability
    'drop_tokens': False, # whether to drop tokens
    'capacity_factor':1.25, # capacity factor - seemed to work best in Switch Transformer
}

# 3. Create student model
word_embeddings = deepcopy(teacher_model.get_input_embeddings())
compressed_word_embeddings = word_embedding_compression(word_embeddings, student_config['d_model'])
student_model = LaBSE_Switch(config=student_config, word_embeddings_module=compressed_word_embeddings)

# 4. Load state_dict() of trained student
path = 's3://eu1-sagemaker-bucket/borisbubla/experiments/10000.0k/switch/LR0.0005LAY2EXP40D_FF96TEMP9TIME-20210609-174240/Distil_LaBSE_2L_40E_96D'
file = read_torch(path)
student_model.load_state_dict(file)
student_model.eval()


Some weights of the model checkpoint at sentence-transformers/LaBSE were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


LaBSE_Switch(
  (switch_model): SwitchTransformer(
    (layers): TypedModuleList(
      (0): SwitchTransformerLayer(
        (attn): MultiHeadAttention(
          (query): PrepareForMultiHeadAttention(
            (linear): Linear(in_features=768, out_features=768, bias=True)
          )
          (key): PrepareForMultiHeadAttention(
            (linear): Linear(in_features=768, out_features=768, bias=True)
          )
          (value): PrepareForMultiHeadAttention(
            (linear): Linear(in_features=768, out_features=768, bias=True)
          )
          (softmax): Softmax(dim=1)
          (output): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): SwitchFeedForward(
          (experts): TypedModuleList(
            (0): FeedForward(
              (layer1): Linear(in_features=768, out_features=96, bias=True)
              (layer2): Linear(in_features=96, out_features=768, bias=True)
        

In [3]:
def create_sentence_embeddings(model, tokenizer, sentences, max_length):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output['pooler_output']
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.numpy()

In [4]:
def shuffle_lists_together(lst1, lst2):
    # Shuffle two lists with same order
    temp = list(zip(lst1,lst2))
    random.shuffle(temp)
    lst1, lst2 = zip(*temp)
    return list(lst1), list(lst2)

### Sentiment Classifiation Task

In [5]:
# load data
sentiment_train_data = pd.read_csv('data/twitter-2016train-A.txt', sep='\t', header=None)
sentiment_dev_data = pd.read_csv('data/twitter-2016dev-A.txt', sep='\t', header=None)
sentiment_test_data = pd.read_csv('data/twitter-2016test-A.txt', sep='\t', header=None)
train_sentences = sentiment_train_data[2].to_list()
train_labels = sentiment_train_data[1].to_list()
test_sentences = sentiment_test_data[2].to_list()
test_labels = sentiment_test_data[1].to_list()

In [6]:
# shuffle data
train_sentences, train_labels = shuffle_lists_together(train_sentences, train_labels)
test_sentences, test_labels = shuffle_lists_together(test_sentences, test_labels)

In [7]:
# create train embeddings
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
embeddings_s = create_sentence_embeddings(model=student_model, tokenizer=tokenizer, sentences=train_sentences, max_length=64)
embeddings_t = create_sentence_embeddings(model=teacher_model, tokenizer=tokenizer, sentences=train_sentences, max_length=64)

In [None]:
print('Average CosSim for these embeddings: ',np.diag(cosine_similarity(embeddings_t, embeddings_s)).mean())

In [8]:
# convert test data to embeddings
test_embeddings_s = create_sentence_embeddings(model=student_model, tokenizer=tokenizer, sentences=test_sentences, max_length=64)
test_embeddings_t = create_sentence_embeddings(model=teacher_model, tokenizer=tokenizer, sentences=test_sentences, max_length=64)

In [9]:
# train model with CV - LaBSE
sentiment_model_labse = sklearn.linear_model.LogisticRegressionCV(cv=5, max_iter=10000)
sentiment_model_labse.fit(embeddings_t, train_labels)

LogisticRegressionCV(cv=5, max_iter=10000)

In [None]:
# train model with CV - DistilLaBSE
sentiment_model_student = sklearn.linear_model.LogisticRegressionCV(cv=5, max_iter=10000)
sentiment_model_student.fit(embeddings_s, train_labels)

In [10]:
# make predictions
predictions_labse = sentiment_model_labse.predict(test_embeddings_t)
#predictions_student = sentiment_model_student.predict(test_embeddings_s)

In [None]:
predictions_labse

In [11]:
# eval
from sklearn.metrics import classification_report
print(classification_report(test_labels, predictions_labse))
print(classification_report(test_labels, predictions_student))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        29
     neutral       0.59      0.34      0.43        96
    positive       0.47      0.91      0.62        75

    accuracy                           0.51       200
   macro avg       0.35      0.42      0.35       200
weighted avg       0.46      0.51      0.44       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# create csv files for latex report
dict_report_labse = classification_report(test_labels, predictions_labse)
dict_report_distil_labse = classification_report(test_labels, predictions_student)

df = pd.DataFrame.from_dict(dict_report_labse).T.round(2)
df.to_csv('classification_report_sentiment_{}.csv'.format('labse'), index = True)
df = pd.DataFrame.from_dict(dict_report_distil_labse).T.round(2)
df.to_csv('classification_report_sentiment_{}.csv'.format('distil_labse_2L_40E_96D'), index = True)