## Base Model

1. Prepare the data for training and validation, here we only use lines with more than 5 vocabularies, also lower case all characters and eliminate all punctuations. 
2. Use the unsupervised training method of fasttext with **skipgram** to train the embeddings of vocabularies.
3. Try different methods to aggregate multiple tokens' embeddings to one sentence embedding, using the mean aggregation at last
4. Train two seperate logistic regression classifier for two different tasks (predict the current speaker/next speaker)
5. The results are shown in the last two cells.

In [None]:
import pandas as pd
import string
import fasttext
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, classification_report
from utils import word2text

In [2]:
df = pd.read_csv("data/processed_lines.csv")
train_document = "./data/Fasttext/fasttext_unsupervised_train.txt"
labels = {
    "Sheldon" : 0,
    "Penny" : 1,
    "Leonard" : 2,
    "Raj" : 3,
    "Howard" : 4,
    "Amy" : 5,
    "Bernadette" : 6,
    "Secondary" : 7,
    "End" : 8
}

In [3]:
def unsupervised_fasttext_data_prep(df):
    '''
    Prepare data for fasttext unsupervised training
    '''
    labels_cur_speaker = []
    labels_next_speaker = []
    with open(train_document, "w") as f:
        for index, row in df.iterrows():
            line = str(row.raw_line).lower().strip()
            line = line.translate(str.maketrans({key:None for key in string.punctuation}))
            if len(line.split(" ")) >= 6:
                f.write(line + '\n')
                labels_cur_speaker.append(row.cur_speaker_label)
                labels_next_speaker.append(row.next_speaker_label)
    return labels_cur_speaker, labels_next_speaker


In [4]:
labels_cur_speaker, labels_next_speaker = unsupervised_fasttext_data_prep(df)

In [5]:
# get the embeddings for tokens
embedding_model = fasttext.train_unsupervised(train_document, model = "skipgram", dim = 150, epoch = 300, lr = 0.1, ws=4)

In [6]:
# get the embedding for each line from the tokens' embedding

vocabulary = embedding_model.words
word_embeddings = np.array([embedding_model[word] for word in vocabulary])
vector_dict = dict(zip(vocabulary, word_embeddings))
with open(train_document) as f:
    content = f.readlines()
documents = np.array([x.strip() for x in content])
aggregated_doc_vectors = word2text(documents, vector_dict, word_embeddings.shape[1], 'mean')
print(aggregated_doc_vectors.shape)

(34406, 150)


In [7]:
# Get the weight of each class, they are not used in this notebook, because logistic regression api offers automatic weights calculation, but these data are used in the bert notebook

y_cur_speaker = np.array([labels[x] for x in labels_cur_speaker])
y_next_speaker = np.array([labels[x] for x in labels_next_speaker])
cur_speaker_weight = sklearn.utils.class_weight.compute_class_weight(class_weight = "balanced", classes= np.unique(y_cur_speaker), y= y_cur_speaker)
next_speaker_weight = sklearn.utils.class_weight.compute_class_weight(class_weight = "balanced", classes= np.unique(y_next_speaker), y= y_next_speaker)
print(cur_speaker_weight)
print(next_speaker_weight)

[0.48333895 0.90484957 0.71429165 1.27128289 1.06559713 1.87805677
 2.52688014 1.29423714]
[0.5671942  0.75431904 0.59435462 1.2725995  1.03293404 1.61986817
 2.12974311 1.1452633  1.94252484]


In [8]:
cur_speaker_data = np.concatenate((y_cur_speaker.reshape(len(documents), 1), aggregated_doc_vectors), axis = 1)
next_speaker_data = np.concatenate((y_next_speaker.reshape(len(documents), 1), aggregated_doc_vectors), axis = 1)

In [25]:
def train_classfier(data, class_weight):
    np.random.shuffle(data)
    data_train = data[ : int(0.9*len(data))]
    data_val = data[int(0.9*len(data)): ]
    train_y = data_train[:,0]
    train_x = data_train[:, 1:]
    val_y = data_val[:,0]
    val_x = data_val[:, 1:]

    model = LogisticRegression(multi_class = "ovr", max_iter = 2500, class_weight = "balanced", solver = 'liblinear', C=5.0)
    model.fit(train_x, train_y)

    predict_val = model.predict(val_x)
    t = classification_report(val_y, predict_val, target_names = ["Sheldon", "Penny", "Leonard", "Raj","Howard","Amy","Bernadette","Secondary","End"])
    print(t)
    return model

In [26]:
cur_model = train_classfier(cur_speaker_data, cur_speaker_weight)

              precision    recall  f1-score   support

     Sheldon       0.49      0.55      0.52       891
       Penny       0.27      0.37      0.31       478
     Leonard       0.24      0.18      0.21       601
         Raj       0.25      0.20      0.22       347
      Howard       0.20      0.12      0.15       389
         Amy       0.11      0.09      0.10       243
  Bernadette       0.11      0.22      0.15       190
   Secondary       0.17      0.14      0.15       302

    accuracy                           0.29      3441
   macro avg       0.23      0.23      0.23      3441
weighted avg       0.28      0.29      0.28      3441



In [23]:
next_model = train_classfier(next_speaker_data, next_speaker_weight)

              precision    recall  f1-score   support

     Sheldon       0.30      0.32      0.31       697
       Penny       0.15      0.10      0.12       508
     Leonard       0.26      0.17      0.20       630
         Raj       0.08      0.09      0.09       288
      Howard       0.12      0.09      0.10       352
         Amy       0.11      0.13      0.12       231
  Bernadette       0.10      0.22      0.13       198
   Secondary       0.16      0.17      0.16       343
         End       0.09      0.15      0.11       194

    accuracy                           0.17      3441
   macro avg       0.15      0.16      0.15      3441
weighted avg       0.18      0.17      0.17      3441

