# 95-891 Homework 4 – Natural Language Processing

## Blaine Perry
## Andrew ID: blainep
Due March 31st, 2022


In [1]:
# Importing the required packages

import os
import re

import pandas as pd
import numpy as np

import gensim
import nltk
import torch

nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


from itertools import compress
import collections

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\blain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\blain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### 1: Data ETL

In [2]:
# first step is to preprocess the data, since this data contains odd formatting, I cant just read it in directly and need a helper function
def read_data(name, categories, columns):
    data = open(name, encoding = 'cp850' ).readlines()
    data_array = []
    for i, row in enumerate (data):
        cols = row.split(sep=',')
        if cols[2] in categories:
            cols[5] = re.sub('_comma_' , '' , cols[5])  #because this is a csv, all commas are replaced by _comma_, we remove these here
            data_array.append([cols[5], cols[2]])

    return pd.DataFrame(data_array, columns = columns)

columns = ['utterance', 'context']
categories = ['sad', 'jealous', 'joyful', 'terrified']

# Importing the datasets
train = read_data("train.csv", categories, columns)
valid = read_data("valid.csv", categories, columns)
test = read_data("test.csv", categories, columns)


train_x = train.utterance
train_y = train.context

test_x = test.utterance
test_y = test.context

valid_x = valid.utterance
valid_y = valid.context


# Getting the train labels; this will be used for SGD classifier
train_labels_unique = list(train['context'].unique())
label_mapper = {}
num = 0
for label in train_labels_unique:
    label_mapper[label] = num
    num += 1


train_labels = list(train['context'])
train_labels_encoded = []
for label in train_labels:
    train_labels_encoded.append(label_mapper[label])

# Getting test labels
labels_test = list(test['context'])
labels_encoded_test = []
for label in labels_test:
    labels_encoded_test.append(label_mapper[label])
labels_encoded_test = np.array(labels_encoded_test)

### 2: Bag of Words

In [3]:
# use sklearn should be good
train_count_vectorizer = CountVectorizer()
X = train_count_vectorizer.fit_transform(train_x)
encoding = X.toarray()

In [4]:
encoding.shape

(10686, 6832)

### 3: The shortcomings with the previous representation are ?
###### There are many words such as 'a', 'the', etc. which are filler and do not contribute to the meaning of the sentence.  there are also many different variations on words which should be considered the same word, such as can't and cannot.

In [5]:
# remove stop words.
# Getting the list of stopwords and appending additional words to it
stopwords_list = list(set(stopwords.words('english')))
stopwords_list.extend(['comma', '', '_comma_'])

train_data_stop_removed = train_x.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_list)]))  #here we use a lambda function to go through each word and see if it is a stop word, if it is, we do not add it back to the sentence
test_data_stop_removed = test_x.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_list)]))


# Creating the bag of words encoding again
train_count_vectorizer = CountVectorizer()
X_train = train_count_vectorizer.fit_transform(train_data_stop_removed)

train_one_hot_encoding = X_train.toarray()

### 4. Normalization
# Normalizing the training data using tfidf transformer

In [6]:
# A helper function to show results
def show_results(train_test, label, preds, model_type = 'SGD'):
    print(f'{train_test} accuracy :', np.mean(label == preds))

    f1_score_vector = f1_score(label, preds, average=None)

    print('F1 score :', np.mean(label == preds))

    print('Confusion matrix :\n', confusion_matrix(label, preds))

    print(f'f1 score using {model_type} classifier is :', np.mean(f1_score_vector))

In [7]:
# Normalization
train_tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
train_embedding_tfidf_transformer = train_tfidf_transformer.fit_transform(X_train)


### 5. Building an SGD Classifier

In [8]:
X_train = train_embedding_tfidf_transformer
y_train = np.array(train_labels_encoded)

In [9]:
from sklearn.preprocessing import StandardScaler

clf = SGDClassifier(loss = 'log', max_iter = 500)
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html


clf.fit(X_train, y_train)
train_predicted_labels = clf.predict(X_train)
show_results('test', y_train, train_predicted_labels)

misclassified = y_train != train_predicted_labels #find the misclassifications


test accuracy : 0.7730675650383679
F1 score : 0.7730675650383679
Confusion matrix :
 [[2154  166  237  151]
 [ 148 2026  203  178]
 [ 236  214 2188  245]
 [ 147  245  255 1893]]
f1 score using SGD classifier is : 0.7730547599885254


In [10]:
labels = y_train[misclassified]
utterances = train_x[misclassified]
predictions = train_predicted_labels[misclassified]

together = pd.DataFrame(zip(utterances, labels, predictions), columns=['utterance', 'true_label', 'predicted_label'])
print(label_mapper)
together.head()

{'terrified': 0, 'joyful': 1, 'sad': 2, 'jealous': 3}


Unnamed: 0,utterance,true_label,predicted_label
0,I feel like getting prepared and then having a...,0,1
1,It's hard to stay clam. How do you do it?,0,2
2,Well pleased. You should be having brainsman!T...,1,2
3,During christmas a few years ago I did not get...,2,1
4,Since that day christmas has not been a good t...,2,1


###### Looking at the few misclassifications above, they seem reasonable.  For example, the first is "I feel like getting prepared and then having a curve ball thrown at you throws you off.", with truth label of terrified and prediction of joyful.  This could be that the classifier takes in curve ball and getting prepared and thinks that the user is going to a baseball game.  A joyful experience.
###### Another misclassification was number 3 above, which states "During christmas a few years ago I did not get any presents.".  The truth classification was sad and the prediction was joyful, however the classifier most likely honed in on Christmas and assumes that everyone who talks about Christmas is happy.

In [11]:
# Using training data vocabulary on test data so that the features are consistent
test_count_vectorizer = CountVectorizer(vocabulary = train_count_vectorizer.get_feature_names())
X_test = test_count_vectorizer.fit_transform(test_data_stop_removed)

test_one_hot_encoding = X_test.toarray()

# Normalizing the test data
test_tfidf_transformer = TfidfTransformer(smooth_idf=False,use_idf=True)
test_embedding_tfidf_transformer = test_tfidf_transformer.fit_transform(test_one_hot_encoding)

# Getting predictions on test data
test_predicted_labels = clf.predict(test_embedding_tfidf_transformer)
show_results('test', labels_encoded_test, test_predicted_labels)

test accuracy : 0.6193921852387844
F1 score : 0.6193921852387844
Confusion matrix :
 [[211  27  31  29]
 [ 35 216  57  48]
 [ 51  45 237  41]
 [ 37  61  64 192]]
f1 score using SGD classifier is : 0.6200081771190856


  idf = np.log(n_samples / df) + 1


### 6. Classifier using pretrained embeddings

In [12]:
# Tokenizing the data
train_tokens = [nltk.word_tokenize(sentences) for sentences in train_data_stop_removed]
train_y = np.array(train_labels_encoded)

test_tokens = [nltk.word_tokenize(sentences) for sentences in test_data_stop_removed]
test_y = np.array(labels_encoded_test)

# Loading the pretrained word2vec model from Google
# download the model here: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [13]:
#with some help from https://towardsdatascience.com/using-word2vec-to-analyze-news-headlines-and-predict-article-success-cdeda5f14751

# Create a list of strings, one for each title
utterance_list = [utterance for utterance in train_data_stop_removed]

# Collapse the list of strings into a single long string for processing
utterance_string = ' '.join(utterance_list)

from nltk.tokenize import word_tokenize

# Tokenize the string into words
tokens = word_tokenize(utterance_string)

# Remove non-alphabetic tokens, such as punctuation
words = [word.lower() for word in tokens if word.isalpha()]

# Filter the list of vectors to include only those that Word2Vec has a vector for
vector_list = [model[word] for word in words if word in model.vocab]

# Create a list of the words corresponding to these vectors
words_filtered = [word for word in words if word in model.vocab]

# Zip the words together with their vector representations
word_vec_zip = zip(words_filtered, vector_list)

# Cast to a dict so we can turn it into a DataFrame
word_vec_dict = dict(word_vec_zip)

In [14]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

# Our earlier preprocessing was done when we were dealing only with word vectors
# Here, we need each document to remain a document
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stopwords_list]
    doc = [word for word in doc if word.isalpha()]
    return doc

# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

# Filter out documents
def filter_docs(corpus, texts, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts)

In [15]:

def create_w2v_sets(data, labels, model):
    # Preprocess the corpus
    corpus = [preprocess(word) for word in data]

    train_df = pd.DataFrame(zip(corpus, labels)) #create a training dataframe so that we drop the appropriate labels later on

    x = []
    y = []
    for _, doc in train_df.iterrows(): # append the vector for each document

        if has_vector_representation(model, doc[0]):  #ensure that the document has vectors which exist in the model
            x.append(document_vector(model, doc[0]))
            y.append(doc[1])
    x = np.array(x) # list to array
    y = np.array(y) # list to array
    return x,y

train_X_w2v, train_Y_w2v =  create_w2v_sets(train_data_stop_removed, y_train, model)
test_X_w2v, test_Y_w2v = create_w2v_sets(test_data_stop_removed, test_y, model)

In [16]:
MLP = MLPClassifier(early_stopping = True).fit(train_X_w2v, train_Y_w2v)
train_predict = MLP.predict(train_X_w2v)
show_results('MLP train', train_Y_w2v, train_predict, model_type='MLP')

test_predict = MLP.predict(test_X_w2v)
show_results('MLP test', test_Y_w2v, test_predict, model_type='MLP')

MLP train accuracy : 0.6734482433832533
F1 score : 0.6734482433832533
Confusion matrix :
 [[1931  210  262  288]
 [ 190 1767  193  388]
 [ 320  387 1648  514]
 [ 153  315  247 1804]]
f1 score using MLP classifier is : 0.6735349807788276
MLP test accuracy : 0.6329941860465116
F1 score : 0.6329941860465116
Confusion matrix :
 [[222  28  24  24]
 [ 38 223  39  55]
 [ 47  44 216  65]
 [ 29  75  37 210]]
f1 score using MLP classifier is : 0.6347848822004006


### 7. Classifier using pretrained BERT

In [17]:
# referenced a helpful guide to BERT: https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb
from transformers import DistilBertTokenizer, DistilBertModel
import torch
# load the tokenizer and the model of distilbert-base-uncased
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
BERT_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# tokenize the text, then input the tokens (and masks) into the model to get the output.
# Set up this helper function so that I can apply it to the dataframe
def get_BERT_embeddings(data):

    encoded_input = tokenizer(data, return_tensors='pt')
    with torch.no_grad():
        output = BERT_model(**encoded_input)
    return output['last_hidden_state'][:,0,:][0].numpy()  #take the last hidden state of the bert model

print('Started training embeddings. Please be patient, this takes forever.')
x_train_BERT = train_data_stop_removed.apply(get_BERT_embeddings)
print('Completed training embeddings\n')

print('Started test embeddings. Please be patient, this takes forever.')
x_test_BERT = test_data_stop_removed.apply(get_BERT_embeddings)
print('Completed test embeddings')

#convert from pandas series to dataframe
x_train_BERT = pd.DataFrame(dict(zip(x_train_BERT.index, x_train_BERT.values))).T
x_test_BERT = pd.DataFrame(dict(zip(x_test_BERT.index, x_test_BERT.values))).T


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Started training embeddings. Please be patient, this takes forever.
Completed training embeddings

Started test embeddings. Please be patient, this takes forever.
Completed test embeddings


### Explain how you use the BERT output. Specifically, which token(s) output you use?
###### from the output, we need to take the first position in the "last_hidden_states" dict.  This contains the tokens for classification.  It can be thought of as an embedding for the utterance.

In [18]:
# use the BERT output to train a MLP classifier
Bert_MLP = MLPClassifier(early_stopping = True, max_iter=300).fit(x_train_BERT, train_y)
train_predict = Bert_MLP.predict(x_train_BERT)
show_results('Bert_MLP train', train_y, train_predict, model_type='MLP')

test_predict = Bert_MLP.predict(x_test_BERT)
show_results('Bert_MLP test', test_y, test_predict, model_type='MLP')

Bert_MLP train accuracy : 0.6991390604529291
F1 score : 0.6991390604529291
Confusion matrix :
 [[1931  220  291  266]
 [ 138 1878  222  317]
 [ 276  335 1760  512]
 [ 107  282  249 1902]]
f1 score using MLP classifier is : 0.6998566393531345
Bert_MLP test accuracy : 0.6157742402315485
F1 score : 0.6157742402315485
Confusion matrix :
 [[209  24  29  36]
 [ 27 220  38  71]
 [ 47  41 225  61]
 [ 30  77  50 197]]
f1 score using MLP classifier is : 0.6183531717475763


### 8: Read the paper and answer the following questions:
#### 1) (0.5 points) What does this paper mean by "fine-tuning" results? How might you use such fine-tuning in building an empathetic chatbot?

###### The paper means that the pre-trained models were further trained on the EmpatheticDialogues training data.  This was done to reduce training time by taking the knowledge gained in the pre-trained model and extending it to respond in a more empathetic way, by using the EmpatheticDialogues training set.  In this paper, the authors froze the Transformer encoder and classifiers.  We might use fine-tuning to build an empathetic chatbot in the same way.  If we were to freeze the encoder layers, we could train the decoder layers to more accurately respond with empathy.  This could be accomplished by adding an "empathy score" and maximizing it in the loss function.

#### 2) (0.5 points) What properties of the transformer architecture make it well suited for this application?

###### Transformers are based on the attention model which would allow the model to find the key phrases which indicate emotional state, and can then predict responses that are emotionally correct.

#### 3) (0.5 points) Explain the metrics used to evaluate performance in Table 1 (P@1,100, AVG-BLEU, and PPL).

###### P@1,100 is the precision retrieving the correct test candidate out of 100 test candidates.  It shows how often the model can select the correct response from 100 randomly selected samples.
###### AVG-BLEU is the average of the BLEU 1-4 scores.  BLEU, or Bi-Lingual Evaluation Understudy, computes the similarity between the predicted text and human generated texts.  If given a number of human generated texts, it checks the predicted text to see how many of the words in the prediction are in the human generated texts.  In other words, it calculates precision of the words within the prediction.
###### PPL is simply perplexity, or the reciprocal of probability of a sequence being in a corpus normalized by the sequence length.  It is used to evaluate the likelihood of a sequence of words.

#### 4) (0.5 points) Which of the metrics do you think provides the best measure of performance of empathic systems and why?

###### I believe BLEU would be the best metric in this case.  Assuming that humans are empathetic in their responses, we would want the similarity between the generated and human texts to be high.  When looking at PPL alone, we may find answers with low PPL which are not empathetic.

#### 5) (0.5 points) Based on Tables 1 and 2, and your reading of the paper, what do you think would help the system get to human-level performance?

###### Larger models with topics, and emotions prepended and then fine-tuned seemed to increase empathetic responses.  This indicates that giving the model some explicit understanding of the topic and emotion of an utterance's (à la the Elcor in Mass Effect) is helpful in increasing the model toward human level performance. It also shows that larger models may be needed to accurately predict empathetic responses.  In order to help the system reach human-level performance, I would try to use larger datasets with explicit topic and emotion labeling trained on large and complex models.