# Intro to AI: NLP Sentiment Classification

## Audrey Zhang
## October 2020

### Setup

In [1]:
# Importing the required packages

import os
import re

import pandas as pd
import numpy as np

import gensim
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid
import parfit.parfit as pf


from itertools import compress
import collections
from contractions import contractions_dict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\audre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\audre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Preprocess the data

In [2]:
cols=['conv_id','utterance_idx','context','prompt','speaker_idx','utterance','selfeval','tags']
# Importing the datasets
train = pd.read_csv('./empatheticdialogues/train.csv', usecols=cols)
valid = pd.read_csv('./empatheticdialogues/valid.csv', usecols=cols)
test = pd.read_csv('./empatheticdialogues/test.csv', usecols=cols)

In [3]:
# only select {'sad', 'jealous', 'joyful', 'terrified'} categories

unique_labels=['sad', 'jealous', 'joyful', 'terrified']
train=train.loc[train.context.isin(unique_labels)]
valid=valid.loc[valid.context.isin(unique_labels)]
test=test.loc[test.context.isin(unique_labels)]

In [4]:
# select "utterance" and "context" as  X and y

X_train = train.utterance.copy()
X_test = test.utterance.copy()
X_valid = valid.utterance.copy()

In [37]:
# Getting the train labels and map for SGD classifier
label_mapper = {}
num = 0
for label in unique_labels:
    label_mapper[label] = num
    num += 1


labels_train = list(train['context'])
labels_encoded_train = []
for label in labels_train:
    labels_encoded_train.append(label_mapper[label])


# Getting test labels
labels_test = list(test['context'])
labels_encoded_test = []
for label in labels_test:
    labels_encoded_test.append(label_mapper[label])
labels_encoded_test = np.array(labels_encoded_test)


# get validation labels

labels_valid=list(valid['context'])
labels_encoded_valid=[]
for label in labels_valid:
    labels_encoded_valid.append(label_mapper[label])
labels_encoded_valid=np.array(labels_encoded_valid)

In [6]:
# remove punctuations

def rm_punctuations(ds):
    dataset=ds.copy()
    dataset=dataset.apply(lambda x: x.lower())
    dataset=dataset.apply(lambda x: re.sub(r'\W', ' ', x))
    dataset=dataset.apply(lambda x: re.sub(r'\s+', ' ', x))
    dataset=dataset.apply(lambda x: re.sub('_comma_', '', x))
    dataset=dataset.apply(lambda x: re.sub(r'\d+', '', x))
    return dataset

train_features_cleaned=rm_punctuations(X_train)
test_features_cleaned=rm_punctuations(X_test)
valid_features_cleaned=rm_punctuations(X_valid)


In [7]:
# convert to BOW model 

train_count_vectorizer = CountVectorizer()
X = train_count_vectorizer.fit_transform(train_features_cleaned)
encoding = X.toarray()

In [8]:
#print(train_count_vectorizer.get_feature_names())

In [9]:
# Converting counts to binary result for sparse encoding
for arr in encoding:
    arr[arr > 0] = 1

print(pd.DataFrame(data=encoding, 
                   columns=train_count_vectorizer.get_feature_names()).sum(axis=0).sort_values(ascending=False).head(20))

to      2862
it      2661
you     2510
that    2479
the     2249
my      2165
and     1795
was     1697
is      1520
of      1369
so      1358
have    1167
for     1069
in      1068
but      879
me       877
be       838
am       805
are      754
just     740
dtype: int64


In [10]:
# Getting the list of stopwords and appending additional words to it
stopwords_list = list(set(stopwords.words('english')))
stopwords_list.extend(['comma'])
stopwords_list.remove('why') # removing this one because there are observations with only 'why' as the single word in the utterance

lemmatizer = WordNetLemmatizer() 

In [16]:
# remove the tokens in the stopwords list from utterance

train_data_stop_removed = train_features_cleaned.apply(lambda x: ' '.join(
     lemmatizer.lemmatize(i) for i in x.split() if i not in stopwords_list))

test_data_stop_removed = test_features_cleaned.apply(lambda x: ' '.join(
     lemmatizer.lemmatize(i) for i in x.split() if i not in stopwords_list))

valid_data_stop_removed = valid_features_cleaned.apply(lambda x: ' '.join(
    lemmatizer.lemmatize(i) for i in x.split() if i not in stopwords_list))

In [12]:
# also expand contractions  

def expand_contractions(text):
    for i in text.split():
        if i in contractions_dict.keys():
            text=re.sub(i, contractions_dict[i], text)
    return text
    
train_data_stop_removed=train_data_stop_removed.apply(lambda x: expand_contractions(x)).reset_index(drop=True)
test_data_stop_removed=test_data_stop_removed.apply(lambda x: expand_contractions(x))
valid_data_stop_removed=valid_data_stop_removed.apply(lambda x: expand_contractions(x))

In [13]:
#empty_idx=[]
#for i in range(len(train_data_stop_removed)):
    #if len(train_data_stop_removed[i])==0:
        #empty_idx.append(i)

In [14]:
#train_data_stop_removed.drop(empty_idx, inplace=True)

In [15]:
#for i in empty_idx:
    #labels_encoded_train.pop(i)

In [16]:
# confirm the train x and y are same size
#print(len(labels_encoded_train))
#print(len(train_data_stop_removed))

9754
9754


In [17]:
# Creating the bag of words encoding again  
train_count_vectorizer = CountVectorizer()
X_vect = train_count_vectorizer.fit_transform(train_data_stop_removed)

In [18]:
train_one_hot_encoding = X_vect.toarray()

for arr in train_one_hot_encoding:
    arr[arr > 0] = 1

In [19]:
# check that this worked

print(pd.DataFrame(data=train_one_hot_encoding, 
                   columns=train_count_vectorizer.get_feature_names()).sum(axis=0).sort_values(ascending=False).head(20))


get       686
oh        619
really    591
time      570
like      568
friend    491
good      473
one       468
got       452
know      421
sorry     378
happy     374
well      359
go        359
going     358
year      356
would     354
day       349
hope      347
think     345
dtype: int64


### Normalization

In [20]:
# Normalizing the training data using tfidf transformer 

train_tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
train_embedding_tfidf_transformer = train_tfidf_transformer.fit_transform(train_one_hot_encoding)

# verify
df_idf=pd.DataFrame(train_tfidf_transformer.idf_, index=train_count_vectorizer.get_feature_names(), columns=['idf_weights'])
print(df_idf.sort_values(by=['idf_weights']))

           idf_weights
get           3.653201
oh            3.755816
really        3.802029
time          3.838146
like          3.841655
...                ...
pinscher      9.492388
pipe          9.492388
piper         9.492388
giggling      9.492388
nightlife     9.492388

[6226 rows x 1 columns]


### Building an SGD Classifier

In [21]:
#assign X_train and y_train

X_train = train_embedding_tfidf_transformer
y_train = np.array(labels_encoded_train)

In [22]:
#sgd classifier 

clf = SGDClassifier()
clf.fit(X_train, y_train)

SGDClassifier()

In [23]:
# evaluate the training accuracy 
y_pred_train=clf.predict(X_train)
#print(f1_score(y_train, y_pred_train, average='weighted'))
print(accuracy_score(y_train, y_pred_train))

0.8143325815050236


In [24]:
# test on validation dataset for baseline
valid_count_vectorizer=CountVectorizer(vocabulary=train_count_vectorizer.get_feature_names())
X_valid=valid_count_vectorizer.fit_transform(valid_data_stop_removed)
y_valid=np.array(labels_encoded_valid)

In [25]:
valid_one_hot_encoding=X_valid.toarray()
for arr in valid_one_hot_encoding:
    arr[arr > 0] = 1

In [26]:
valid_tfidf_transformer = TfidfTransformer(smooth_idf=False,use_idf=True)
valid_embedding_tfidf_transformer = valid_tfidf_transformer.fit_transform(valid_one_hot_encoding)

  idf = np.log(n_samples / df) + 1


In [27]:
# Getting predictions on test data
y_pred_valid = clf.predict(valid_embedding_tfidf_transformer)
# SGD_report=classification_report(y_valid, y_pred_valid, target_names=unique_labels)
# print(SGD_report)
#print(f1_score(y_valid, y_pred_valid, average='weighted'))
print(accuracy_score(y_valid, y_pred_valid))


0.6312741312741312


In [28]:
grid={
      'loss':['hinge', 'log', 'modified_huber', 'perceptron'],
      'penalty':['l1', 'l2', 'elasticnet'],
      'alpha': [0.01, 0.001, 0.0001, 0.00001],
      'max_iter': [200, 500, 1000, 5000],
      'learning_rate': ['optimal', 'invscaling', 'adaptive'],
      'eta0': [0.1, 0.5]}


In [29]:
paramGrid=ParameterGrid(grid)
bestModel, bestScore, allModels, allScores=pf.bestFit(SGDClassifier, paramGrid,
                                                      X_train, y_train, X_valid, y_valid,
                                                      metric=accuracy_score, greater_is_better=True, scoreLabel='accuracy')

-------------FITTING MODELS-------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1999s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 266 tas

-------------SCORING MODELS-------------


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s


Too many dimensions to plot.


[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed:    3.2s finished


In [30]:
print(bestModel, bestScore)

SGDClassifier(alpha=1e-05, eta0=0.1, learning_rate='adaptive', loss='log',
              max_iter=200) 0.6428571428571429


In [31]:
# best accuracy score on validation dataset achieved is 64.2% 
# with model parameters alpha=1e-05, eta0=0.1, learning_rate='adaptive', loss='log'

#%%
# re-train model on test set with best model params identified above
clf1 = SGDClassifier(alpha=0.0001, eta0=0.1, learning_rate='adaptive', loss='log')
clf1.fit(X_train, y_train)

SGDClassifier(eta0=0.1, learning_rate='adaptive', loss='log')

In [32]:
# evaluate on test data 

test_count_vectorizer = CountVectorizer(vocabulary = train_count_vectorizer.get_feature_names())
X_test = test_count_vectorizer.fit_transform(test_data_stop_removed)
y_test=labels_encoded_test
test_one_hot_encoding = X_test.toarray()

for arr in test_one_hot_encoding:
    arr[arr > 0] = 1

In [33]:
# Normalizing the test data  
test_tfidf_transformer = TfidfTransformer(smooth_idf=False,use_idf=True)
test_embedding_tfidf_transformer = test_tfidf_transformer.fit_transform(test_one_hot_encoding)

  idf = np.log(n_samples / df) + 1


In [34]:
# Getting predictions on test data
y_pred_test = clf1.predict(test_embedding_tfidf_transformer)

In [35]:
# do some evaluation on the test set
print('Test accuracy :', np.mean(y_test == y_pred_test))

Test accuracy : 0.6143790849673203


In [36]:
f1_score_vector = f1_score(y_test, y_pred_test, average=None)
print('F1 score :', np.mean(y_test == y_pred_test))

F1 score : 0.6143790849673203


In [37]:
conf_mat=pd.DataFrame(confusion_matrix(y_test, y_pred_test), 
                      index=unique_labels, columns=['pred_'+i for i in unique_labels])
print('Confusion matrix :\n', conf_mat)

print('f1 score using SGD classifier is :', np.mean(f1_score_vector))

Confusion matrix :
            pred_sad  pred_jealous  pred_joyful  pred_terrified
sad             230            44           50              50
jealous          57           181           62              50
joyful           63            40          220              32
terrified        35            28           20             215
f1 score using SGD classifier is : 0.6145153062295787


In [38]:
conf_mat_norm=conf_mat.div(conf_mat.sum(axis=1), axis=0)
print('Confusion matrix normalized:\n', conf_mat_norm)

Confusion matrix normalized:
            pred_sad  pred_jealous  pred_joyful  pred_terrified
sad        0.614973      0.117647     0.133690        0.133690
jealous    0.162857      0.517143     0.177143        0.142857
joyful     0.177465      0.112676     0.619718        0.090141
terrified  0.117450      0.093960     0.067114        0.721477


In [39]:
# analyze some misclassified samples

misclassified=list(np.where(y_test!=y_pred_test)[0])
y_label_test=np.array([unique_labels[i] for i in y_pred_test]).transpose()
df_y_pred=pd.DataFrame(y_label_test, columns=['pred_context'])
df_x=test[['utterance', 'context']].copy().reset_index(drop=True)
inaccurates=df_x.merge(df_y_pred, left_index=True, right_index=True).iloc[misclassified]
print(inaccurates.head(10))

                                            utterance  context pred_context
1   Ugh_comma_ those articles always get me too......      sad      jealous
4   yes! And i do believe in God and prayers but g...      sad    terrified
6                   3 years is a long time. How come?   joyful          sad
8            Oh I see. They must miss you_comma_ too.   joyful          sad
14  She was around 11_comma_ so she took it very h...      sad    terrified
16  One of the saddest things to me is when people...      sad      jealous
18  That's perfectly natural. You sound like the k...      sad       joyful
20  I met up with an old flame recently_comma_ did...  jealous          sad
21               Oh ya? What happened?? I'm intrigued  jealous    terrified
23  Woah plot twist. She brought him along to meet...  jealous       joyful


In [41]:
inaccurates.to_csv('inaccurate_classifications.csv')

### Classifier using pretrained embeddings

In [38]:
# Tokenizing the data
train_tokens = [nltk.word_tokenize(sentences) for sentences in train_data_stop_removed]
train_y = np.array(labels_encoded_train)

test_tokens = [nltk.word_tokenize(sentences) for sentences in test_data_stop_removed]
test_y = np.array(labels_encoded_test)

valid_tokens=[nltk.word_tokenize(sentences) for sentences in valid_data_stop_removed]
valid_y=np.array(labels_encoded_valid)

In [19]:
# Loading the pretrained word2vec model from Google
# download the model here: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [20]:
def document_vector(w2v_model, sentence):
    sentence = [word for word in sentence if word in w2v_model.vocab]
    return np.mean(w2v_model[sentence], axis=0)

In [22]:
# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, sentence):
    return not all(word not in word2vec_model.vocab for word in sentence)

In [39]:
# takes input of tokens and output fixed length numerical vector, with sentence-level mbedding averaged
x=[]
unmatched_idx=[] #keep track of samples with no matching words in w2v model
for i in range(len(train_tokens)):
    sent=train_tokens[i]
    if has_vector_representation(model, sent):
        x.append(list(document_vector(model, sent)))
    else:
        unmatched_idx.append(i)

In [41]:
for i in unmatched_idx:
    train_y=np.delete(train_y, i)

In [42]:
print(len(train_y))
print(len(x))

9746
9746


### MLPClassifier

In [45]:
# transform test data for MLP model prediction
test_x=[]
unmatched_idx_test=[] #keep track of samples with no matching words in w2v model
for i in range(len(test_tokens)):
    sent=test_tokens[i]
    if has_vector_representation(model, sent):
        test_x.append(list(document_vector(model, sent)))
    else:
        unmatched_idx_test.append(i)
        
for i in unmatched_idx_test:
    test_y=np.delete(test_y, i)
    
print(len(test_x))
print(len(test_y))

1372
1372


In [46]:
# transform validation data for MLP model prediction
valid_x=[]
unmatched_idx_val=[] #keep track of samples with no matching words in w2v model
for i in range(len(valid_tokens)):
    sent=valid_tokens[i]
    if has_vector_representation(model, sent):
        valid_x.append(list(document_vector(model, sent)))
    else:
        unmatched_idx_val.append(i)
        
for i in unmatched_idx_val:
    valid_y=np.delete(valid_y, i)
    
print(len(valid_x))
print(len(valid_y))

1544
1544


In [106]:
mlp_clf=MLPClassifier()

In [107]:
mlp_clf.fit(x, train_y)
val_y_pred=mlp_clf.predict(valid_x)
accuracy_score(valid_y, val_y_pred)

0.5680051813471503

In [110]:
# adjust hyperparams for better performance 
mlp_clf=MLPClassifier(hidden_layer_sizes=(200, ), max_iter=400, activation='tanh', solver='sgd')

In [111]:
mlp_clf.fit(x, train_y)

MLPClassifier(activation='tanh', hidden_layer_sizes=(200,), max_iter=400,
              solver='sgd')

In [112]:
val_y_pred=mlp_clf.predict(valid_x)

In [113]:
accuracy_score(valid_y, val_y_pred)
# this model performs slightly better 

0.6262953367875648

In [116]:
test_y_pred=mlp_clf.predict(test_x)
print("the test accuracy score is: ", accuracy_score(test_y, test_y_pred))

the test accuracy score is:  0.6297376093294461


In [121]:
conf_mat=pd.DataFrame(confusion_matrix(test_y, test_y_pred), 
                      index=unique_labels, columns=['pred_'+i for i in unique_labels])
print('Confusion matrix :\n', conf_mat)

f1_score_vector=f1_score(test_y, test_y_pred, average=None)
print('f1 score using SGD classifier is :', np.mean(f1_score_vector))

Confusion matrix :
            pred_sad  pred_jealous  pred_joyful  pred_terrified
sad             235            56           40              41
jealous          55           196           66              31
joyful           58            49          212              35
terrified        31            21           25             221
f1 score using SGD classifier is : 0.6313246288596899
