In [1]:
# Importing Libraries
import re
import string
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.utils import shuffle 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
df_nonaug = pd.read_csv('emotion_dataset.csv')

In [9]:
df_nonaug.drop(['Unnamed: 0', 'Unnamed: 3'], axis=1, inplace=True)

In [10]:
df_nonaug.drop_duplicates(inplace=True)

In [11]:
df_nonaug.shape

(31161, 2)

In [12]:
# Data Cleaning on non augmented dataset
# some text cleaning functions
def convert_to_lower(text):
    return text.lower()

def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

In [13]:
df_nonaug['Text'] = df_nonaug['Text'].apply(lambda x: convert_to_lower(x))
df_nonaug['Text'] = df_nonaug['Text'].apply(lambda x: remove_numbers(x))
df_nonaug['Text'] = df_nonaug['Text'].apply(lambda x: remove_punctuation(x))
df_nonaug['Text'] = df_nonaug['Text'].apply(lambda x: remove_stopwords(x))
df_nonaug['Text'] = df_nonaug['Text'].apply(lambda x: remove_extra_white_spaces(x))
df_nonaug['Text'] = df_nonaug['Text'].apply(lambda x: lemmatizing(x))

In [14]:
df_nonaug.head()

Unnamed: 0,Emotion,Text
0,neutral,
1,joy,sage act upgrade list tommorow
2,sadness,way homegirl baby funeral man hate funeral rea...
3,joy,eye true hazel eyeand brilliant regular featur...
4,joy,iluvmiasantos ugh babe hugggzzz babe naamazed ...


In [15]:
# converting string labels to int labels

label_map = {
    'joy': 0,
    'sadness': 1,
    'fear' : 2,
    'surprise' : 3,
    'anger' : 4,
    'neutral' : 5,
    'disgust' : 6,
    'shame' : 7
}

df_nonaug['Emotion'] = df_nonaug['Emotion'].map(label_map)
df_nonaug.head()

Unnamed: 0,Emotion,Text
0,5,
1,0,sage act upgrade list tommorow
2,1,way homegirl baby funeral man hate funeral rea...
3,0,eye true hazel eyeand brilliant regular featur...
4,0,iluvmiasantos ugh babe hugggzzz babe naamazed ...


#### Importing augmented dataset

In [16]:
df_aug = pd.read_csv('aug_df6.csv')

In [17]:
df_aug.head()

Unnamed: 0.1,Unnamed: 0,Emotion,Text
0,60520,1,every sport team cheer kick straight sack
1,33981,4,mad mad mad mad mad mad mad mad mad
2,73997,7,civil service servant watched barely concealed...
3,56999,3,to get home kids twitter app completely different
4,62764,2,left alone home man down forced upstairs front...


In [18]:
df_aug.drop(['Unnamed: 0'], axis=1, inplace=True)

In [19]:
df_aug.isnull().sum()

Emotion      0
Text       914
dtype: int64

In [20]:
df_aug.dropna(inplace=True)

In [21]:
df_aug.head()

Unnamed: 0,Emotion,Text
0,1,every sport team cheer kick straight sack
1,4,mad mad mad mad mad mad mad mad mad
2,7,civil service servant watched barely concealed...
3,3,to get home kids twitter app completely different
4,2,left alone home man down forced upstairs front...


In [22]:
df_aug.shape

(80548, 2)

In [23]:
df_aug['Emotion'].value_counts()

0    10458
1    10373
6    10355
2    10266
3    10215
4    10160
7     9644
5     9077
Name: Emotion, dtype: int64

In our augmented dataset, roughly same number of data points belong to each class. So the problem of class imbalance is somewhat solved.

## Train Test Split

In [None]:
# split the dataset before augmenting to avoid augmented data in valid set
X_train, _, y_train, _ = train_test_split(df_aug['Text'], df_aug['Emotion'].values, test_size=0.1, stratify=df_aug['Emotion'].values)
_, X_test, _, y_test = train_test_split(df_nonaug['Text'], df_nonaug['Emotion'].values, test_size=0.5,)

We dont use augmented dataset for testing purpose and we always test our model on original dataset. Therefore train test split is happened like this.

In [25]:
X_train.shape

(72493,)

In [31]:
X_test.shape

(15581,)

### Text Vectorization using tf-idf

In [27]:
tfidf = TfidfVectorizer()

In [28]:
tfidf = TfidfVectorizer()
X_train_tf = tfidf.fit_transform(X_train)

In [29]:
X_test_tf = tfidf.transform(X_test)

In [30]:
import pickle
pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))

# ML Models

## KNN

In [29]:
from sklearn.preprocessing import StandardScaler

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [34]:
knn_classifier.fit(X_train_tf, y_train)

KNeighborsClassifier()

In [35]:
y_preds_knn = knn_classifier.predict(X_test_tf)

In [36]:
print(confusion_matrix(y_test, y_preds_knn))
print(accuracy_score(y_test, y_preds_knn))
print(classification_report(y_test, y_preds_knn))

[[ 710  196   25   18   16 4212    5    1]
 [  30 1249   13    5   29 1775    2    3]
 [  10   34 1314    0    7  820    1    0]
 [  10   41    0 1106    0  867    0    0]
 [   6   35    5    2 1175  636    1    0]
 [   3    5    2    0    1  709    0    1]
 [   0    0    0    0    0    0  423    0]
 [   0    0    0    0    0    0    0   78]]
0.4341184776330146
              precision    recall  f1-score   support

           0       0.92      0.14      0.24      5183
           1       0.80      0.40      0.54      3106
           2       0.97      0.60      0.74      2186
           3       0.98      0.55      0.70      2024
           4       0.96      0.63      0.76      1860
           5       0.08      0.98      0.15       721
           6       0.98      1.00      0.99       423
           7       0.94      1.00      0.97        78

    accuracy                           0.43     15581
   macro avg       0.83      0.66      0.64     15581
weighted avg       0.88      0.43      0

## Multinomial Naive Bayes

In [117]:
nv_model = MultinomialNB()
nv_model.fit(X_train_tf, y_train)

MultinomialNB()

In [23]:
X_test_tf = tfidf.transform(X_test)


In [24]:
X_train_tf.shape, X_test_tf.shape

((72493, 38066), (15581, 38066))

In [120]:
y_preds = nv_model.predict(X_test_tf)

In [121]:
print(confusion_matrix(y_test, y_preds))
print(accuracy_score(y_test, y_preds))
print(classification_report(y_test, y_preds))


[[4110  269  193  144  182  145  161   38]
 [ 279 2249   94   61  178   65  115   24]
 [ 104  115 1684   34   84   20   63   20]
 [ 228   93   29 1456   65   33   76   13]
 [  89  112   47   20 1489   33   85   21]
 [ 123   34   29   13   22  461   40   25]
 [  11    3    1    3    3    0  419    0]
 [   0    0    0    0    0    0    0   74]]
0.7664463128168924
              precision    recall  f1-score   support

           0       0.83      0.78      0.81      5242
           1       0.78      0.73      0.76      3065
           2       0.81      0.79      0.80      2124
           3       0.84      0.73      0.78      1993
           4       0.74      0.79      0.76      1896
           5       0.61      0.62      0.61       747
           6       0.44      0.95      0.60       440
           7       0.34      1.00      0.51        74

    accuracy                           0.77     15581
   macro avg       0.67      0.80      0.70     15581
weighted avg       0.78      0.77      0

In [122]:
sample_text =['he hates running all day']

In [123]:
vect = tfidf.transform(sample_text).toarray()

In [124]:
nv_model.predict(vect)

array([5], dtype=int64)

## Logistic Regression

In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
lr = LogisticRegression()

In [127]:
lr.fit(X_train_tf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [128]:
y_preds1 = lr.predict(X_test_tf)

In [129]:
print(confusion_matrix(y_test, y_preds1))
print(accuracy_score(y_test, y_preds1))
print(classification_report(y_test, y_preds1))

[[4363  230  126  168  111  200   43    1]
 [ 395 2229   88   93  137   89   30    4]
 [ 175  120 1682   43   60   30   14    0]
 [ 303   97   24 1480   35   41   13    0]
 [ 177  127   51   38 1420   55   27    1]
 [ 125   32   14   10   13  548    5    0]
 [  11    7    2    2    3    0  415    0]
 [   0    0    0    0    0    0    0   74]]
0.7837109299788203
              precision    recall  f1-score   support

           0       0.79      0.83      0.81      5242
           1       0.78      0.73      0.75      3065
           2       0.85      0.79      0.82      2124
           3       0.81      0.74      0.77      1993
           4       0.80      0.75      0.77      1896
           5       0.57      0.73      0.64       747
           6       0.76      0.94      0.84       440
           7       0.93      1.00      0.96        74

    accuracy                           0.78     15581
   macro avg       0.78      0.81      0.80     15581
weighted avg       0.79      0.78      0

## XGBoost

In [31]:
from xgboost import XGBClassifier

In [32]:
xgb = XGBClassifier()

In [33]:
xgb.fit(X_train_tf, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [34]:
y_preds2 = xgb.predict(X_test_tf)

In [35]:
print(confusion_matrix(y_test, y_preds2))
print(accuracy_score(y_test, y_preds2))
print(classification_report(y_test, y_preds2))

[[4379  216   90  227   99  198   87    0]
 [1020 1612   65  120  100   96   87    1]
 [ 632   78 1221   66   39   50   53    0]
 [ 712   59   25 1061   36   27   35    0]
 [ 540  107   23   19 1069   66   63    0]
 [ 278   16    4    2    4  407    3    0]
 [  82   12    1    3    3    4  324    0]
 [   0    0    0    0    0    0    0   60]]
0.6503433669212503
              precision    recall  f1-score   support

           0       0.57      0.83      0.68      5296
           1       0.77      0.52      0.62      3101
           2       0.85      0.57      0.68      2139
           3       0.71      0.54      0.61      1955
           4       0.79      0.57      0.66      1887
           5       0.48      0.57      0.52       714
           6       0.50      0.76      0.60       429
           7       0.98      1.00      0.99        60

    accuracy                           0.65     15581
   macro avg       0.71      0.67      0.67     15581
weighted avg       0.69      0.65      0

## Decision Tree

In [135]:
from sklearn.tree import DecisionTreeClassifier

In [136]:
dt_classifier = DecisionTreeClassifier()

In [137]:
dt_classifier.fit(X_train_tf, y_train)

DecisionTreeClassifier()

In [138]:
y_preds3 = dt_classifier.predict(X_test_tf)

In [139]:
print(confusion_matrix(y_test, y_preds3))
print(accuracy_score(y_test, y_preds3))
print(classification_report(y_test, y_preds3))

[[5021   63   18   46   29   55   10    0]
 [  77 2886   19   17   35   27    4    0]
 [  40   19 2036    6   10    9    4    0]
 [  53   12    6 1905    8    4    5    0]
 [  35   27   12   11 1790   17    4    0]
 [   9    8    2    1    7  719    1    0]
 [   0    0    1    1    1    1  436    0]
 [   0    0    0    0    0    0    0   74]]
0.9541749566780052
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      5242
           1       0.96      0.94      0.95      3065
           2       0.97      0.96      0.97      2124
           3       0.96      0.96      0.96      1993
           4       0.95      0.94      0.95      1896
           5       0.86      0.96      0.91       747
           6       0.94      0.99      0.96       440
           7       1.00      1.00      1.00        74

    accuracy                           0.95     15581
   macro avg       0.95      0.96      0.96     15581
weighted avg       0.95      0.95      0

In [44]:
rf_classifier.predict(vect)

array([5], dtype=int64)

In [43]:
sample_text =['i will fight you']
vect = tfidf.transform(sample_text).toarray()

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf_classifier = RandomForestClassifier()

In [27]:
rf_classifier.fit(X_train_tf, y_train)

RandomForestClassifier()

In [29]:
y_preds4 = rf_classifier.predict(X_test_tf)

In [30]:
print(confusion_matrix(y_test, y_preds4))
print(accuracy_score(y_test, y_preds4))
print(classification_report(y_test, y_preds4))

[[5146   34    7   10   18   77    4    0]
 [  75 2969    8    7   19   22    1    0]
 [  26   12 2074    6    5   16    0    0]
 [  44    5    4 1882    4   15    1    0]
 [  27   15    2    3 1816   24    0    0]
 [   6    2    2    0    3  701    0    0]
 [   1    0    0    0    0    0  428    0]
 [   0    0    0    0    0    0    0   60]]
0.967588729863295
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5296
           1       0.98      0.96      0.97      3101
           2       0.99      0.97      0.98      2139
           3       0.99      0.96      0.97      1955
           4       0.97      0.96      0.97      1887
           5       0.82      0.98      0.89       714
           6       0.99      1.00      0.99       429
           7       1.00      1.00      1.00        60

    accuracy                           0.97     15581
   macro avg       0.96      0.98      0.97     15581
weighted avg       0.97      0.97      0.

### Hyperparameter Optimization for Random Forest using RandomSearch CV

The main parameters used by a Random Forest Classifier are:

+ criterion = the function used to evaluate the quality of a split.
+ max_depth = maximum number of levels allowed in each tree.
+ max_features = maximum number of features considered when splitting a node.
+ min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
+ min_samples_split = minimum number of samples necessary in a node to cause node splitting.
+ n_estimators = number of trees in the ensamble.

In [45]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [46]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train_tf, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


 0.4718387  0.60717586 0.6521043  0.76426687 0.59480224 0.57831785
 0.66333297 0.62072198 0.54107287 0.57606937 0.46861074 0.60447213
 0.60492737 0.57938004 0.48012909 0.77346776 0.5782351  0.59542298
 0.69206678 0.5789386  0.7929455  0.56679951 0.68765254 0.78682079
 0.67025778 0.56078517 0.77755091 0.62109443 0.37305658 0.60481701
 0.57840063 0.87771232 0.65146975        nan 0.49526159 0.57184826
 0.56038514 0.58724287 0.8008635  0.62138411 0.64700034        nan
 0.57837303 0.36151081 0.59524366 0.58779464 0.76596359 0.80050484
 0.65425625 0.67271327 0.56406832 0.65206293 0.58724287 0.46410006
 0.57888344 0.60440317 0.8695736  0.87884346 0.76607396 0.48463987
 0.42773787        nan 0.60405832 0.56358548 0.56141972 0.87713297
 0.78501369 0.76992257 0.67476856 0.87128411 0.59531265 0.59521607
 0.57706255 0.86718716 0.44953292 0.67103026 0.66759545 0.37164966
 0.73553307 0.47185241 0.6035065  0.51982961 0.62157724 0.48425362
 0.60746554 0.60314784 0.59287102 0.76706713 0.80914016 0.6214

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [47]:
rf_randomcv.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 340,
 'criterion': 'gini'}

In [48]:
rf_randomcv.best_estimator_

RandomForestClassifier(max_depth=340, max_features='log2', n_estimators=2000)

In [49]:
best_random_grid = rf_randomcv.best_estimator_

In [59]:
rf_classifier_tuned = RandomForestClassifier(n_estimators=2000, min_samples_split=2, min_samples_leaf=1, max_features='log2', 
                                             max_depth = 340, criterion= 'gini')

In [60]:
rf_classifier_tuned.fit(X_train_tf, y_train)

RandomForestClassifier(max_depth=340, max_features='log2', n_estimators=2000)

In [61]:
y_preds_tuned = rf_classifier_tuned.predict(X_test_tf)

In [63]:
print(confusion_matrix(y_test, y_preds_tuned))
print(accuracy_score(y_test, y_preds_tuned))
print(classification_report(y_test, y_preds_tuned))

[[5196   31    8    6    7   46    2    0]
 [  76 2987    6    8    7   15    2    0]
 [  36    2 2090    3    2    6    0    0]
 [  52    3    2 1890    2    6    0    0]
 [  23    8    2    2 1839   13    0    0]
 [  88    3    2    0    1  620    0    0]
 [   0    0    0    0    0    0  429    0]
 [   0    0    0    0    0    0    0   60]]
0.969835055516334
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      5296
           1       0.98      0.96      0.97      3101
           2       0.99      0.98      0.98      2139
           3       0.99      0.97      0.98      1955
           4       0.99      0.97      0.98      1887
           5       0.88      0.87      0.87       714
           6       0.99      1.00      1.00       429
           7       1.00      1.00      1.00        60

    accuracy                           0.97     15581
   macro avg       0.97      0.97      0.97     15581
weighted avg       0.97      0.97      0.

- Not much improvement in accuracy (increased from 96.75% to 96.98%).

In [78]:
sample_text =['i want to fight you']
vect = tfidf.transform(sample_text).toarray()

In [79]:
rf_classifier_tuned.predict(vect)

array([4], dtype=int64)

#### Our best performing model is Random Forest Classifier after performing hyperparameter optimization.

In [81]:
import joblib

In [82]:
file = open("emotion_detection_model.pkl", "wb")

In [83]:
joblib.dump(rf_classifier_tuned, file)

In [84]:
file.close()