In [1]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv("emotion.csv")

In [4]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [5]:
df.label.value_counts()

1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: label, dtype: int64

In [7]:
df.isna().sum()

text     0
label    0
dtype: int64

## Text preprocessing 

In [9]:
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## filtering stopwords

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\MRUH
[nltk_data]     4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# filter stopwords

from nltk.corpus import stopwords

stop = stopwords.words('english')

df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [12]:
df

Unnamed: 0,text,label
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,3
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,3
...,...,...
15995,brief time beanbag said anna feel like beaten,0
15996,turning feel pathetic still waiting tables sub...,0
15997,feel strong good overall,1
15998,feel like rude comment im glad,3


In [15]:
# Do lemmatization

from nltk.stem import WordNetLemmatizer

from textblob import Word

df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

df['text'].head()


0                                didnt feel humiliated
1    go feeling hopeless damned hopeful around some...
2            im grabbing minute post feel greedy wrong
3    ever feeling nostalgic fireplace know still pr...
4                                      feeling grouchy
Name: text, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X = tfidf.fit_transform(df['text'])

X = X.toarray()

y = df.label.values

## Model Building 

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, \

                                                test_size=0.2, shuffle=True)

In [18]:
from sklearn.naive_bayes import GaussianNB

 

#Initialize GaussianNB classifier

model = GaussianNB()

#Fit the model on the train dataset

model = model.fit(X_train, y_train)

#Make predictions on the test dataset

pred = model.predict(X_test)




from sklearn.metrics import accuracy_score

 

print("Accuracy:", accuracy_score(y_test, pred)*100, "%")

Accuracy: 35.0 %


In [19]:
from sklearn.ensemble import RandomForestClassifier

#Create a random forest classifier, 100 trees

clf_rf=RandomForestClassifier()

 

#Train the model using the training sets

clf_rf.fit(X_train,y_train)

 

rf_pred=clf_rf.predict(X_test).astype(int)

print("Accuracy:",accuracy_score(y_test, rf_pred))

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test,rf_pred))  

print(classification_report(y_test,rf_pred))







Accuracy: 0.8784375
[[871  27   5  25  17   1]
 [ 31 946  27   7   8   2]
 [  3  78 206   5   4   0]
 [ 16  23   3 368  16   1]
 [ 11  14   1  17 344  10]
 [  4   7   1   0  25  76]]
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       946
           1       0.86      0.93      0.89      1021
           2       0.85      0.70      0.76       296
           3       0.87      0.86      0.87       427
           4       0.83      0.87      0.85       397
           5       0.84      0.67      0.75       113

    accuracy                           0.88      3200
   macro avg       0.86      0.82      0.84      3200
weighted avg       0.88      0.88      0.88      3200



In [20]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(class_weight='balanced') 

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print(confusion_matrix(y_test, y_pred))  

print(classification_report(y_test, y_pred))

[[845  29  12  32  22   6]
 [ 20 910  70   6   6   9]
 [  0  19 268   3   4   2]
 [ 12  11   3 385  13   3]
 [ 13  12   2  17 328  25]
 [  3   1   0   0  12  97]]
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       946
           1       0.93      0.89      0.91      1021
           2       0.75      0.91      0.82       296
           3       0.87      0.90      0.89       427
           4       0.85      0.83      0.84       397
           5       0.68      0.86      0.76       113

    accuracy                           0.89      3200
   macro avg       0.84      0.88      0.86      3200
weighted avg       0.89      0.89      0.89      3200

