## Banking77 Task
This script trains a topic classifier for the Banking77 task. The task is to assign each customer query to the correct query category, as given by the labels.

In [None]:
# Installs

# !pip install transformers
!pip uninstall scikit-learn -y
!pip install -U scikit-learn

In [None]:
# Imports
import pandas as pd
import sklearn
from transformers import AutoModel, AutoTokenizer
from os import environ
from psutil import cpu_count
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from tqdm import tqdm
from pathlib import Path
import joblib
import torch
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegressionCV
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from distil_funcs import *
from utils import load_csv, read_torch

# Dataset locations, relative to the notebook's working directory.
train_path = "data/banking77/train.csv"
test_path = "data/banking77/test.csv"

# Tokenizer for the LaBSE checkpoint.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

# Training split: shuffle every row with a fixed seed. Texts and labels are
# both read from the shuffled frame, so they stay aligned row by row.
en_train_data = pd.read_csv(train_path)
en_train_data = en_train_data.sample(frac=1, random_state=1)
en_train_labels = en_train_data["category"].to_list()
en_train_tokenized = tokenizer(
    en_train_data["text"].to_list(),
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt",
)

# Test split: kept in file order, tokenized with the same settings.
en_test_data = pd.read_csv(test_path)
en_test_labels = en_test_data["category"].to_list()
en_test_tokenized = tokenizer(
    en_test_data["text"].to_list(),
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt",
)


In [None]:
# Create functions for pipeline

# Creates sentence embeddings from a given input text using the given model
def get_embeddings_torch(model, input_text, batch_size=64):
    """Embed tokenized text with ``model`` and return normalized pooled outputs.

    The inputs are fed to the model in batches with gradient tracking
    disabled. Each batch's ``'pooler_output'`` is normalized with
    ``torch.nn.functional.normalize`` (default arguments) and the batches are
    concatenated in input order.

    Args:
        model: Callable accepting ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword tensors and returning a mapping with a
            ``'pooler_output'`` entry. This function does not switch the
            model to eval mode; the caller is responsible for that.
        input_text: Mapping holding the ``'input_ids'``, ``'token_type_ids'``
            and ``'attention_mask'`` tensors, all with the same first
            dimension (one row per sentence).
        batch_size: Number of rows per forward pass. Defaults to 64, the
            value that was previously hard-coded.

    Returns:
        A tensor with one embedding row per input row.
    """
    dataset = torch.utils.data.TensorDataset(
        input_text['input_ids'],
        input_text['token_type_ids'],
        input_text['attention_mask'],
    )
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    # Keep one tensor per batch and concatenate once at the end; growing a
    # Python list of per-row tensors re-copies the whole list on every batch.
    batch_embeddings = []
    for input_ids, token_type_ids, attention_mask in tqdm(data_loader, position=0, leave=True):
        with torch.no_grad():
            model_output = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
            )
        pooled = model_output['pooler_output']
        batch_embeddings.append(torch.nn.functional.normalize(pooled))

    # Concatenating (batch, dim) tensors along dim 0 yields the same
    # (rows, dim) result as stacking the individual rows.
    return torch.cat(batch_embeddings)

# Performs the benchmark test - fits a LogisticRegressionCV model, predicts the labels, 
# prints classification report and saves the result to a csv file
def benchmark_banking77(train_embeddings, test_embeddings, model_name,
                        train_labels=None, test_labels=None):
    """Fit a classifier on embeddings and report Banking77 test performance.

    Trains a ``LogisticRegressionCV`` (4-fold CV, up to 10000 iterations) on
    the training embeddings, predicts the test labels, prints the
    classification report and writes the same report, rounded to two
    decimals, to ``classification_report_<model_name>.csv`` in the current
    working directory.

    Args:
        train_embeddings: Feature matrix for the training sentences.
        test_embeddings: Feature matrix for the test sentences.
        model_name: Suffix used in the name of the CSV report file.
        train_labels: Labels aligned with ``train_embeddings``. When omitted,
            the notebook-level ``en_train_labels`` is used, as before.
        test_labels: Labels aligned with ``test_embeddings``. When omitted,
            the notebook-level ``en_test_labels`` is used, as before.

    Returns:
        None. Results are printed and saved to disk.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working. Passing labels explicitly avoids silently picking up a global
    # that another cell has rebound in the meantime.
    if train_labels is None:
        train_labels = en_train_labels
    if test_labels is None:
        test_labels = en_test_labels

    # build classifier based on those embeddings
    classifier_model = LogisticRegressionCV(cv=4, max_iter=10000)
    classifier_model.fit(X=train_embeddings, y=train_labels)
    print("Classifier model built...")

    # generate predictions
    en_predictions = classifier_model.predict(test_embeddings)
    print("Predictions generated...")

    # print classification results
    print("Banking77 Results: \n")
    report = classification_report(test_labels, en_predictions)
    print(report)

    # Same metrics again in dict form so they can be tabulated and saved.
    dict_report = classification_report(test_labels, en_predictions, output_dict=True)
    df = pd.DataFrame.from_dict(dict_report).T.round(2)
    df.to_csv(f'classification_report_{model_name}.csv', index=True)
    print('Classification report saved!')
    
# Status message for the notebook user. NOTE(review): this runs at cell
# execution time, before the save/load helpers below it are defined.
print("Banking77 loaded!")

# Save any generated embeddings using joblib
def save_embeddings(embeddings, labels, name):
    """Write embeddings and their labels to two joblib files.

    The files are ``data/banking77/embeddings_<name>.joblib`` and
    ``data/banking77/embeddings_<name>_labels.joblib``.

    Args:
        embeddings: Object to store in the embeddings file.
        labels: Object to store in the labels file.
        name: String appended to the ``embeddings_`` file prefix.
    """
    filename = "data/banking77/embeddings_" + name
    targets = (
        (embeddings, Path(filename + ".joblib")),
        (labels, Path(filename + "_labels.joblib")),
    )
    # Embeddings are written first, then labels.
    for payload, target in targets:
        joblib.dump(payload, target)
    print(f"Files saved at prefix: {filename}...")

# Load any previous embeddings
def load_embeddings(name):
    """Read back an embeddings/labels pair stored as joblib files.

    Reads ``data/banking77/embeddings_<name>.joblib`` and
    ``data/banking77/embeddings_<name>_labels.joblib``.

    NOTE: joblib files are pickle-based, so only load files from a trusted
    source.

    Args:
        name: String appended to the ``embeddings_`` file prefix.

    Returns:
        Tuple ``(embeddings, labels)`` as stored in the two files.
    """
    prefix = "data/banking77/embeddings_" + name
    embeddings_path = Path(prefix + ".joblib")
    labels_path = Path(prefix + "_labels.joblib")
    # Embeddings are loaded first, then labels.
    return joblib.load(embeddings_path), joblib.load(labels_path)

In [None]:
# Load Teacher Model for Evaluation
# Everything in this cell is placed on / mapped to the CPU.
DEVICE = torch.device('cpu')
# NOTE(review): load_teacher is not defined in this notebook; presumably it
# comes from `from distil_funcs import *` - confirm.
teacher_model = load_teacher(DEVICE)

# Hyper-parameters used to construct the student model below.
# NOTE(review): these presumably have to match the architecture of the
# checkpoint loaded further down (its path mentions LAY2 / EXP40 / D_FF96,
# which agrees with n_layers / n_experts / d_ff here) - confirm before editing.
student_config = {
    'd_model': 768, # hidden dim of model
    'heads': 12, # attention heads
    'dropout':0.1, # dropout in network except ffn
    'dropout_ffn':0.4, # dropout in ffn
    'd_ff': 96, # num features in FFN hidden layer
    'n_layers': 2, # num of transformer layers
    'n_experts': 40, # number of FFN experts
    'load_balancing_loss_ceof': 0.01, # load balancing co-eff, encourages expert diversity (key is spelled 'ceof'; do not rename here without changing whatever reads it)
    'is_scale_prob': True, # whether to scale the selected expert outputs by routing probability
    'drop_tokens': False, # whether to drop tokens
    'capacity_factor':1.25, # capacity factor - seemed to work best in Switch Transformer
}

# Create student model.
# The teacher's input embedding module is deep-copied so the teacher itself
# is not modified, then passed through word_embedding_compression together
# with the student's hidden size.
# NOTE(review): deepcopy, word_embedding_compression and LaBSE_Switch are not
# imported explicitly in this notebook; presumably they come from the
# `distil_funcs` star import - confirm.
word_embeddings = deepcopy(teacher_model.get_input_embeddings())
compressed_word_embeddings = word_embedding_compression(word_embeddings, student_config['d_model'])
student_model = LaBSE_Switch(config=student_config, word_embeddings_module=compressed_word_embeddings)

# Load state_dict() of trained student.
# read_torch comes from the project's `utils` module; its return value is
# passed straight to load_state_dict.
path = 's3://eu1-sagemaker-bucket/borisbubla/experiments/10000.0k/switch/LR0.0005LAY2EXP40D_FF96TEMP9TIME-20210609-174240/Distil_LaBSE_2L_40E_96D'
file = read_torch(path)
student_model.load_state_dict(file)
# Switch the student to evaluation mode before it is used for embeddings.
student_model.eval()
# Alternative (disabled): load the student weights from a local training
# checkpoint instead of S3.
# path = '/home/ec2-user/SageMaker/models/switch/time-20210611-133301/model_200.pkl'
# checkpoint = torch.load(path, map_location = torch.device('cpu'))
# student_model.load_state_dict(checkpoint['model_state_dict'])
# student_model.eval()

### Banking77

#### 1. Create or Load Embeddings - teacher

In [None]:
# Create the teacher embeddings and save them to disk.
# Run EITHER this cell (compute from scratch) OR the following cell (load a
# previously saved copy).

# create: embed the tokenized train and test sets with the teacher model
labse_embeddings_train = get_embeddings_torch(model=teacher_model, input_text=en_train_tokenized)
labse_embeddings_test = get_embeddings_torch(model=teacher_model, input_text=en_test_tokenized)

# save: the labels are stored next to the embeddings so that both can be
# restored together later
save_embeddings(labse_embeddings_train, en_train_labels, 'banking77_train')
save_embeddings(labse_embeddings_test, en_test_labels, 'banking77_test')

In [4]:
# Load previously saved teacher embeddings.
# This also rebinds the global en_train_labels / en_test_labels to the labels
# stored with the embeddings; benchmark_banking77 reads those globals.
labse_embeddings_train, en_train_labels = load_embeddings('banking77_train')
labse_embeddings_test, en_test_labels = load_embeddings('banking77_test')

#### 2. Do benchmark - teacher

In [None]:
# Benchmark the teacher embeddings; the report is written to
# classification_report_labse_model.csv.
benchmark_banking77(labse_embeddings_train, labse_embeddings_test, 'labse_model')

#### 3. Create or load embeddings - student

In [None]:
# Create the student embeddings and save them to disk.
# Run EITHER this cell (compute from scratch) OR the following cell (load a
# previously saved copy).

# create: embed the same tokenized train and test sets with the student model
distil_labse_embeddings_train = get_embeddings_torch(model=student_model, input_text=en_train_tokenized)
distil_labse_embeddings_test = get_embeddings_torch(model=student_model, input_text=en_test_tokenized)

# save: the labels are stored next to the embeddings so that both can be
# restored together later
save_embeddings(distil_labse_embeddings_train, en_train_labels, 'distil_40E96D_banking77_train')
save_embeddings(distil_labse_embeddings_test, en_test_labels, 'distil_40E96D_banking77_test')

In [8]:
# Load previously saved student embeddings.
# This also rebinds the global en_train_labels / en_test_labels to the labels
# stored with the embeddings; benchmark_banking77 reads those globals.
distil_labse_embeddings_train, en_train_labels = load_embeddings('distil_40E96D_banking77_train')
distil_labse_embeddings_test, en_test_labels = load_embeddings('distil_40E96D_banking77_test')

#### 4. Do benchmark - student

In [None]:
# Benchmark the student embeddings; the report is written to
# classification_report_distil_labse_2L_40E_96D.csv.
benchmark_banking77(distil_labse_embeddings_train, distil_labse_embeddings_test, 'distil_labse_2L_40E_96D')

#### 5. Sanity check - print cosine_similarity scores of embeddings

In [None]:
# Sanity check: mean cosine similarity between each teacher embedding and the
# student embedding of the same training sentence (row i against row i).
# Only these paired similarities are needed, so they are computed row by row
# rather than building the full N x N similarity matrix and taking its
# diagonal.
teacher_rows = np.asarray(labse_embeddings_train)
student_rows = np.asarray(distil_labse_embeddings_train)

paired_dots = (teacher_rows * student_rows).sum(axis=1)
paired_norms = np.linalg.norm(teacher_rows, axis=1) * np.linalg.norm(student_rows, axis=1)
# A zero-length vector gets similarity 0 instead of a division by zero, which
# is also how scikit-learn's cosine_similarity treats zero rows.
paired_norms[paired_norms == 0] = 1.0

print('Average CosSim for these embeddings: ', (paired_dots / paired_norms).mean())