In [1]:
import torch
import transformers as ppb # pytorch transformers

In [2]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix,classification_report
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
from nltk.corpus import stopwords
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import words
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

Using TensorFlow backend.


In [3]:
import warnings
warnings.filterwarnings('ignore')
import tqdm
from ipywidgets import IntProgress

In [4]:
# Read the labelled training tweets from disk and preview the first rows.
df = pd.read_csv('sa-emotions/train_data.csv', sep=',')
df.head(n=5)

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# Checkpoint selection: DistilBERT is the active choice.
# To run full BERT instead, swap in:
#   ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [6]:
# Build the encoder and its tokenizer from the same checkpoint name so that
# the vocabulary ids produced by the tokenizer match the model's embeddings.
model = model_class.from_pretrained(pretrained_weights)
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [7]:
def model_report(train_data,test_data,train_labels,test_labels):
    """Fit a fixed set of classifiers and print accuracy plus a per-class report.

    Args:
        train_data: Feature matrix used to fit every classifier.
        test_data: Feature matrix the fitted classifiers are evaluated on.
        train_labels: Class labels aligned with ``train_data``.
        test_labels: Class labels aligned with ``test_data``.

    Returns:
        None. Results are printed, one accuracy line and one
        ``classification_report`` per classifier.
    """
    # Labels are encoded to integers before fitting: recent XGBoost releases
    # reject string class labels, while the scikit-learn estimators behave the
    # same either way. Predictions are decoded again so the printed report
    # still shows the original label names.
    encoder = LabelEncoder().fit(train_labels)
    encoded_train_labels = encoder.transform(train_labels)

    classifiers = {
        'LinearSVC': LinearSVC(random_state=0, tol=1e-5),
        # Seeded like the other estimators so repeated runs are comparable.
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=0),
        'ExtraTrees': ExtraTreesClassifier(n_estimators=100, random_state=0),
        'XGBoost': XGBClassifier(),
        'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=0),
    }
    for model_name, classifier in classifiers.items():
        classifier.fit(train_data, encoded_train_labels)
        # Predict once and reuse the result for both metrics.
        predicted = encoder.inverse_transform(classifier.predict(test_data))
        print("Accuracy for {} is {}".format(model_name,accuracy_score(test_labels, predicted)))
        print(classification_report(test_labels,predicted))

### Binary Classification

In [8]:
# Keep only the two classes used for the binary task. The explicit .copy()
# makes df_bin an independent frame, so assigning to its columns later (the
# text cleaning step does this) does not raise SettingWithCopyWarning.
df_bin = df.loc[df['sentiment'].isin(['sadness', 'happiness'])].copy()

In [9]:
# Preview the first rows of the two-class subset.
df_bin.head(n=5)

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?


In [10]:
def cleaner(tes):
    """Normalise the ``content`` column of a DataFrame, in place.

    The column is transformed in this order: punctuation characters are
    removed, ``@mention`` tokens are removed, digits are removed, underscores
    are removed, and the text is lower-cased. Every removal substitutes the
    empty string, so neighbouring characters are joined.

    Args:
        tes: DataFrame with a string ``content`` column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Raw strings keep "\[", "\S" and "\d" as regex escapes instead of relying
    # on Python passing unknown string escapes through unchanged.
    REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
    REPLACE_U_NAME = re.compile(r"@[\S]+")
    REPLACE_DIGITS = re.compile(r"\d")
    # NOTE(review): the name suggests "replace with a space", but underscores
    # are replaced with the empty string here — confirm the intent.
    REPLACE_W_SPACE = re.compile("_")

    content = tes["content"]
    # Order matters: punctuation is stripped before mentions are matched.
    for pattern in (REPLACE_NO_SPACE, REPLACE_U_NAME, REPLACE_DIGITS, REPLACE_W_SPACE):
        # regex=True is required for compiled patterns; with the regex=False
        # default of newer pandas a compiled pattern raises ValueError.
        content = content.str.replace(pattern, '', regex=True)
    tes["content"] = content.str.lower()
    return tes

In [11]:
# Normalise the tweet text, then look at the first cleaned rows.
df_bin = cleaner(df_bin)
df_bin.head(n=5)

Unnamed: 0,sentiment,content
1,sadness,layin n bed with a headache ughhhhwaitin on y...
2,sadness,funeral ceremonygloomy friday
6,sadness,i should be sleep but im not thinking about an...
8,sadness,charlene my love i miss you
9,sadness,im sorry at least its friday


In [12]:
# Turn every cleaned tweet into a list of vocabulary ids; add_special_tokens=True
# asks the tokenizer to include its special tokens in each sequence.
tokenized = df_bin['content'].map(
    lambda text: tokenizer.encode(text, add_special_tokens=True)
)

In [13]:
# Hold out 33% of the tokenized tweets for evaluation; the fixed seed keeps
# the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    tokenized,
    df_bin["sentiment"],
    test_size=0.33,
    random_state=42,
)

In [14]:
def ret_ipID(tokenized, max_len=None):
    """Right-pad variable-length token-id lists with zeros into a 2-D array.

    Args:
        tokenized: pandas Series whose elements are lists of integer token ids.
        max_len: Optional width of the returned array. When ``None`` (the
            default) the length of the longest sequence is used. When given,
            shorter sequences are zero-padded to this width and longer ones
            are cut off at it (tokens past ``max_len`` are dropped).

    Returns:
        ``numpy.ndarray`` of dtype int64 and shape ``(len(tokenized), max_len)``.
        An empty Series yields an array of shape ``(0, 0)`` unless ``max_len``
        is given.
    """
    sequences = list(tokenized.values)
    print("Shape of tokenized is {}".format(tokenized.shape[0]))

    if max_len is None:
        # default=0 keeps an empty Series from raising in max().
        max_len = max((len(seq) for seq in sequences), default=0)

    # Start from an all-zero matrix so only the real tokens need to be written.
    padded = np.zeros((len(sequences), max_len), dtype=np.int64)
    for row, seq in enumerate(sequences):
        kept = seq[:max_len]
        padded[row, :len(kept)] = kept

    print("Shape of padded is {}".format(padded.shape))
    return padded



In [15]:
def ret_attnmask(padded):
    """Build an attention mask tensor from a zero-padded id array.

    Args:
        padded: Array of token ids in which the value 0 marks padding.

    Returns:
        ``torch.Tensor`` with the same shape as ``padded``: 0 where the input
        is 0, and 1 everywhere else.
    """
    mask = np.where(padded == 0, 0, 1)
    print("Shape of Attention mask is {}".format(mask.shape))
    return torch.tensor(mask)


In [16]:
# Pad each split to a rectangular id matrix (train first, then test).
X_train_input_ids, X_test_input_ids = map(ret_ipID, (X_train, X_test))

Shape of tokenized is 5235
Shape of padded is (5235, 51)
Shape of tokenized is 2579
Shape of padded is (2579, 45)


In [17]:
# Derive the attention mask of each split from its padded id matrix.
X_train_attentionMask, X_test_attentionMask = (
    ret_attnmask(ids) for ids in (X_train_input_ids, X_test_input_ids)
)

Shape of Attention mask is (5235, 51)
Shape of Attention mask is (2579, 45)


In [18]:
# Convert the padded id matrices to int64 tensors for the model.
X_train_input_ids = torch.tensor(X_train_input_ids, dtype=torch.int64)
X_test_input_ids = torch.tensor(X_test_input_ids, dtype=torch.int64)

In [20]:
# Encode the whole training split in one forward pass. Gradient tracking is
# switched off because the model is only used as a feature extractor here.
with torch.no_grad():
    last_hidden_states_X_train = model(
        input_ids=X_train_input_ids,
        attention_mask=X_train_attentionMask,
    )

In [21]:
# Encode the whole test split in one forward pass, again without gradients.
with torch.no_grad():
    last_hidden_states_X_test = model(
        input_ids=X_test_input_ids,
        attention_mask=X_test_attentionMask,
    )

In [22]:
# From element 0 of each model output, keep the vector at sequence position 0
# for every row and convert it to NumPy for scikit-learn.
# (presumably the special token the tokenizer puts first — TODO confirm)
train_features, test_features = (
    outputs[0][:, 0, :].numpy()
    for outputs in (last_hidden_states_X_train, last_hidden_states_X_test)
)

In [25]:
# Candidate values for C: 100 evenly spaced points from 1 down to 0.0001,
# both ends included. np.linspace's fourth positional parameter is `endpoint`
# (a boolean), not a step or count, so only start, stop and num are passed.
parameters = {'C': np.linspace(1, 0.0001, 100)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, y_train)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 0.3637}
best scrores:  0.8045845272206303


In [27]:
# Take C from the fitted grid search rather than a hand-copied literal, so
# this cell stays correct when the search is re-run on different data.
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
lr_clf.fit(train_features, y_train)

LogisticRegression(C=0.3637, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
# Accuracy of the logistic-regression classifier on the held-out split.
accuracy_score(y_test, lr_clf.predict(test_features))

0.8131058549825514

In [29]:
# Compare the remaining classifiers on the same extracted features.
model_report(
    train_data=train_features,
    test_data=test_features,
    train_labels=y_train,
    test_labels=y_test,
)

Accuracy for LinearSVC is 0.7863512989530826
              precision    recall  f1-score   support

   happiness       0.71      0.71      0.71       954
     sadness       0.83      0.83      0.83      1625

    accuracy                           0.79      2579
   macro avg       0.77      0.77      0.77      2579
weighted avg       0.79      0.79      0.79      2579

Accuracy for RandomForest is 0.7584335013571152
              precision    recall  f1-score   support

   happiness       0.73      0.55      0.63       954
     sadness       0.77      0.88      0.82      1625

    accuracy                           0.76      2579
   macro avg       0.75      0.72      0.72      2579
weighted avg       0.75      0.76      0.75      2579

Accuracy for ExtraTrees is 0.7576580069794494
              precision    recall  f1-score   support

   happiness       0.74      0.53      0.62       954
     sadness       0.76      0.89      0.82      1625

    accuracy                           0.76

In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [31]:
# Three-hidden-layer perceptron trained with plain SGD at a constant learning
# rate. verbose=10 makes fit() print the loss for each iteration.
clf = MLPClassifier(
    hidden_layer_sizes=(300, 100, 200),
    solver='sgd',
    learning_rate='constant',
    alpha=0.0001,
    max_iter=500,
    tol=1e-9,
    random_state=21,
    verbose=10,
)

In [32]:
# Train the perceptron on the training features and label the test set.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = clf.fit(train_features, y_train).predict(test_features)

Iteration 1, loss = 0.67819170
Iteration 2, loss = 0.66678044
Iteration 3, loss = 0.66184893
Iteration 4, loss = 0.65734052
Iteration 5, loss = 0.65299706
Iteration 6, loss = 0.64859521
Iteration 7, loss = 0.64357400
Iteration 8, loss = 0.63787338
Iteration 9, loss = 0.63246560
Iteration 10, loss = 0.62634204
Iteration 11, loss = 0.61832177
Iteration 12, loss = 0.61021003
Iteration 13, loss = 0.60185079
Iteration 14, loss = 0.59243138
Iteration 15, loss = 0.58318839
Iteration 16, loss = 0.57379004
Iteration 17, loss = 0.56466785
Iteration 18, loss = 0.55549149
Iteration 19, loss = 0.54653817
Iteration 20, loss = 0.53799373
Iteration 21, loss = 0.53100231
Iteration 22, loss = 0.52342365
Iteration 23, loss = 0.51670107
Iteration 24, loss = 0.51027516
Iteration 25, loss = 0.50426131
Iteration 26, loss = 0.49891881
Iteration 27, loss = 0.49501747
Iteration 28, loss = 0.48979155
Iteration 29, loss = 0.48606166
Iteration 30, loss = 0.48147815
Iteration 31, loss = 0.47812362
Iteration 32, los

Iteration 253, loss = 0.32589435
Iteration 254, loss = 0.31889664
Iteration 255, loss = 0.32988044
Training loss did not improve more than tol=0.000000 for 10 consecutive epochs. Stopping.


In [33]:
# Accuracy of the perceptron's predictions on the held-out split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7464133385032958