In [6]:
#import pandas library
import pandas as pd


#read the dataset with name "Emotion_classify_Data.csv" and store it in a variable df
df = pd.read_csv('Emotion_Classify_Data.csv')

#print the shape of dataframe
print(df.shape)


#print top 5 rows
print(df.head())

(5937, 2)
                                             Comment Emotion
0  i seriously hate one subject to death but now ...    fear
1                 im so full of life i feel appalled   anger
2  i sit here to write i start to dig out my feel...    fear
3  ive been really angry with r and i feel like a...     joy
4  i feel suspicious if there is no one outside l...    fear


In [8]:
#check the distribution of Emotion
df.Emotion.value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [10]:
#Add the new column "Emotion_num" which gives a unique number to each of these Emotions
#joy --> 0, fear --> 1, anger --> 2

target= {'joy': 0, 'fear': 1, 'anger': 2}
df['Emotion_num']= df.Emotion.map(target)


#checking the results by printing top 5 rows
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1


# Modelling without Pre-processing Text data

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Comment, df.Emotion_num, 
                                                    test_size=0.2,  stratify=df.Emotion_num , 
                                                    random_state= 2000)

In [15]:
print(X_test.shape)
print(X_train.shape)

(1188,)
(4749,)


In [16]:
y_test.value_counts()

Emotion_num
2    400
0    400
1    388
Name: count, dtype: int64

In [17]:
y_train.value_counts()

Emotion_num
2    1600
0    1600
1    1549
Name: count, dtype: int64

In [22]:
#import CountVectorizer, RandomForest, pipeline, classification_report from sklearn 
#using CountVectorizer with only trigrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(3,3))], 
               ['RF', RandomForestClassifier()]])

# fit with X_train and y_train
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       0.50      0.15      0.23       400
           1       0.37      0.79      0.50       388
           2       0.47      0.28      0.35       400

    accuracy                           0.40      1188
   macro avg       0.45      0.41      0.36      1188
weighted avg       0.45      0.40      0.36      1188



In [23]:
# use Multinomial Naive Bayes as the classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer()], 
               ['nb', MultinomialNB()]])

# fit with X_train and y_train
clf.fit(X_test, y_test)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       400
           1       0.98      0.98      0.98       388
           2       0.97      0.98      0.98       400

    accuracy                           0.98      1188
   macro avg       0.98      0.98      0.98      1188
weighted avg       0.98      0.98      0.98      1188



In [24]:
#using CountVectorizer with both unigram and Bigrams . use RandomForest as the classifier.
# create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,3))], 
               ['RF', RandomForestClassifier()]])

# fit with X_train and y_train
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.90       400
           1       0.94      0.85      0.89       388
           2       0.92      0.86      0.89       400

    accuracy                           0.89      1188
   macro avg       0.90      0.89      0.89      1188
weighted avg       0.90      0.89      0.89      1188



In [25]:
# using TF-IDF vectorizer for Pre-processing the text
# use RandomForest as the classifier

#import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer


#1. create a pipeline object
clf = Pipeline([['Vectorization', CountVectorizer()], 
               ['RF', RandomForestClassifier()]])



#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred= clf.predict(X_test)

#4. print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       400
           1       0.94      0.90      0.92       388
           2       0.92      0.91      0.91       400

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [26]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [27]:
# create a new column "preprocessed_comment" and use the utility function above to get the clean data
# this will take some time, please be patient

# Build a model with pre processed text

In [31]:
# import train-test-split from sklearn
import sklearn
from sklearn.model_selection import train_test_split


# Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test= train_test_split(df.Comment, df.Emotion_num, 
                                                   test_size=0.2, random_state= 20, 
                                                   stratify= df.Emotion_num)

In [32]:
#using CountVectorizer with both unigram and Bigrams . use RandomForest as the classifier.
# create a pipeline object

clf = Pipeline([['Vectorization', CountVectorizer(ngram_range=(1,3))], 
               ['RF', RandomForestClassifier()]])

# fit with X_train and y_train
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# print the classfication report
print(classification_report(y_test,
    y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       400
           1       0.91      0.87      0.89       388
           2       0.93      0.85      0.89       400

    accuracy                           0.89      1188
   macro avg       0.89      0.89      0.89      1188
weighted avg       0.89      0.89      0.89      1188

