In [1]:
import os

import numpy as np
import pandas as pd
import spacy

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Read the dataset.
# The CSV location can be overridden with the EMOTION_DATA_PATH environment
# variable so the notebook is not tied to a single machine; the original
# local path is kept as the fallback.
DATA_PATH = os.environ.get(
    "EMOTION_DATA_PATH",
    "C:/Users/aansh/Downloads/Emotion_classify_Data.csv",
)
df = pd.read_csv(DATA_PATH)

# Print the shape of dataframe
print(df.shape)

# Print top 10 rows
df.head(10)

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
5,i feel jealous becasue i wanted that kind of l...,anger
6,when a friend of mine keeps telling me morbid ...,anger
7,i finally fell asleep feeling angry useless an...,anger
8,i feel a bit annoyed and antsy in a good way,anger
9,i feel like i ve regained another vital part o...,joy


In [5]:
# Count how many rows carry each emotion label (class balance check)
df['Emotion'].value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [6]:
# Show one labelled sample: the comment at index label 0 and its emotion
print(f"{df['Comment'][0]} -> {df['Emotion'][0]}")

i seriously hate one subject to death but now i feel reluctant to drop it -> fear


# preprocessing

In [7]:
# load english language model and create nlp object from it
# Only lemmas and lexical token flags (is_stop, is_punct) are used in this
# notebook, so the dependency parser and named-entity recogniser are left out
# of the pipeline to make per-comment processing faster.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [8]:
# Take the comment at index label 3 as the running example
txt = df.loc[3, 'Comment']
txt

'ive been really angry with r and i feel like an idiot for trusting him in the first place'

In [9]:
# Tokenization: run the sample through the spaCy pipeline; the resulting
# `doc` is iterated token by token in the following cells
doc = nlp(txt)

# word tokenization

In [10]:
# Iterating the doc yields one token at a time; print them one per line
for tok in doc:
    print(tok.text)

i
ve
been
really
angry
with
r
and
i
feel
like
an
idiot
for
trusting
him
in
the
first
place


# lemmatization (spaCy has no stemmer, so only lemmas are shown)

In [11]:
# Show every token next to its lemma
for tok in doc:
    print(f"Word: {tok} | -> {tok.lemma_}")

Word: i | -> I
Word: ve | -> ve
Word: been | -> be
Word: really | -> really
Word: angry | -> angry
Word: with | -> with
Word: r | -> r
Word: and | -> and
Word: i | -> I
Word: feel | -> feel
Word: like | -> like
Word: an | -> an
Word: idiot | -> idiot
Word: for | -> for
Word: trusting | -> trust
Word: him | -> he
Word: in | -> in
Word: the | -> the
Word: first | -> first
Word: place | -> place


# stop words and punctuation

In [12]:
# List the tokens flagged as stop words or punctuation
for tok in doc:
    if not (tok.is_stop or tok.is_punct):
        continue
    print(tok)

i
been
really
with
and
i
an
for
him
in
the
first


In [13]:
# use this utility function to get the preprocessed text data
def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The text is run through the notebook-level spaCy pipeline ``nlp``.
    Stop words, punctuation and whitespace-only tokens are dropped, and each
    remaining token is replaced by its lemma.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces; an empty string
        when no token is kept.
    """
    doc = nlp(text)
    # Whitespace tokens (emitted for runs of spaces or newlines) are neither
    # stop words nor punctuation, so they have to be filtered explicitly or
    # they end up as stray blanks in the joined string.
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [14]:
# Compare the raw sample with its preprocessed form
procces_txt = preprocess(txt)
print(txt, procces_txt, sep="\n")

ive been really angry with r and i feel like an idiot for trusting him in the first place
ve angry r feel like idiot trust place


In [15]:
# Run preprocess over every comment and keep the result in a new column
df['preprocessed_comment'] = df['Comment'].map(preprocess)

In [16]:
# Inspect the frame, now including the preprocessed_comment column
df

Unnamed: 0,Comment,Emotion,preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,feel suspicious outside like rapture happen
...,...,...,...
5932,i begun to feel distressed for you,fear,begin feel distressed
5933,i left feeling annoyed and angry thinking that...,anger,leave feel annoyed angry thinking center stupi...
5934,i were to ever get married i d have everything...,joy,marry d ready offer ve get club perfect good l...
5935,i feel reluctant in applying there because i w...,fear,feel reluctant apply want able find company kn...


# encoding target column

In [17]:
# Integer codes for the three emotion labels
emotion_codes = {'joy': 0, 'fear': 1, 'anger': 2}
df['Emotion_num'] = df['Emotion'].map(emotion_codes)

# Series.map turns any label missing from the dict into NaN; fail here rather
# than letting NaN targets reach the train/test split.
unmapped = df.loc[df['Emotion_num'].isna(), 'Emotion'].unique()
assert len(unmapped) == 0, f"unmapped emotion labels: {list(unmapped)}"

df.head(5)

Unnamed: 0,Comment,Emotion,preprocessed_comment,Emotion_num
0,i seriously hate one subject to death but now ...,fear,seriously hate subject death feel reluctant drop,1
1,im so full of life i feel appalled,anger,m life feel appalled,2
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feeling think afraid accep...,1
3,ive been really angry with r and i feel like a...,joy,ve angry r feel like idiot trust place,0
4,i feel suspicious if there is no one outside l...,fear,feel suspicious outside like rapture happen,1


# split data

In [18]:
# Hold out 20% of the rows for testing; the split is seeded and stratified
# on the encoded label
features, labels = df['preprocessed_comment'], df['Emotion_num']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [19]:
# Report the size of both splits
for label, split in (("Shape of X_train: ", X_train), ("Shape of X_test: ", X_test)):
    print(label, split.shape)

Shape of X_train:  (4749,)
Shape of X_test:  (1188,)


# converting text to TF-IDF feature vectors

In [20]:
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
v = TfidfVectorizer()

X_train_cv = v.fit_transform(X_train)
X_test_cv = v.transform(X_test)

# Printing the whole vocabulary_ dict emits one entry per distinct term and
# floods the notebook; report its size and a short alphabetical sample instead.
print(f"Vocabulary size: {len(v.vocabulary_)}")
print(sorted(v.vocabulary_)[:10])



# applying random forest

In [21]:
# Seed the forest (same seed as the train/test split): tree construction is
# randomised, so without random_state the scores below change on every run.
RFC_model = RandomForestClassifier(random_state=42)

RFC_model.fit(X_train_cv, y_train)

In [22]:
# Predict the encoded emotion for every vectorised test comment and store
# the result in y_pred
y_pred = RFC_model.predict(X_test_cv)

In [23]:
# Print the accuracy of y_pred against the held-out labels y_test
print(accuracy_score(y_test, y_pred))

0.9276094276094277


In [24]:
# Print the classification report.
# Class ids follow the Emotion_num encoding: 0 = joy, 1 = fear, 2 = anger
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       400
           1       0.93      0.93      0.93       388
           2       0.94      0.90      0.92       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



# testing the model on a single comment

In [25]:
# Pick the comment at index label 2000 to push through the whole pipeline.
# NOTE(review): this row comes from df itself, so it may have been part of
# the training split - confirm before treating it as an unseen example.
test_text = df.loc[2000, 'Comment']
test_text

'im looking good and feeling good other than this crappy cold im dealing with'

In [26]:
# Preprocess the sample and wrap it in a one-element list, since it is passed
# to the vectorizer's transform() as a collection of documents
test_text_processed = [preprocess(test_text)]
test_text_processed

['m look good feel good crappy cold m deal']

In [27]:
# Convert to a TF-IDF vector with the already-fitted vectorizer `v`
test_text_vc = v.transform(test_text_processed)

In [28]:
#get prediction
test_text = RFC_model.predict(test_text_vc)

In [29]:
print(f"{df['Emotion'][2000]} -> {df['Emotion_num'][2000]}")
print(test_text)

joy -> 0
[0]
