In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emotions-dataset-for-nlp/val.txt
/kaggle/input/emotions-dataset-for-nlp/test.txt
/kaggle/input/emotions-dataset-for-nlp/train.txt


In [38]:
df_train = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/train.txt")
df_train

Unnamed: 0,i didnt feel humiliated;sadness
0,i can go from feeling so hopeless to so damned...
1,im grabbing a minute to post i feel greedy wro...
2,i am ever feeling nostalgic about the fireplac...
3,i am feeling grouchy;anger
4,ive been feeling a little burdened lately wasn...
...,...
15994,i just had a very brief time in the beanbag an...
15995,i am now turning and i feel pathetic that i am...
15996,i feel strong and good overall;joy
15997,i feel like this was such a rude comment and i...


About Data: Emotion Detection
Credits: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

This data consists of two columns. - Comment - Emotion

Comment are the statements or messages regarding to a particular event/situation.

Emotion feature tells whether the given comment is fear 😨, Anger 😡, Joy 😂.

As there are only 3 classes, this problem comes under the Multi-Class Classification.

In [39]:
df_train = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/train.txt", delimiter=';', header=None, names=['sentence','label'])
df_test = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/test.txt", delimiter=';', header=None, names=['sentence','label'])
df_val = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/val.txt", delimiter=';', header=None, names=['sentence','label'])


#concat the data

In [40]:
df= pd.concat([df_train, df_test, df_val])

In [41]:
df.head()

Unnamed: 0,sentence,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [42]:
df.isna().value_counts()

sentence  label
False     False    20000
Name: count, dtype: int64

In [43]:
df.label.value_counts()

label
joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: count, dtype: int64

As seen above the data is imbalance , so we have to solve the data imbalance. 

In [44]:
#setting  a sample to have the data balance , this is not the best practice though 
#because I am going to lose data but the sake of this mini project , I will do it this way

min_samples= 719

df_joy=df[df.label=='joy'].sample(min_samples, random_state=234)
df_sadness=df[df.label=='sadness'].sample(min_samples, random_state=234)
df_anger=df[df.label=='anger'].sample(min_samples, random_state=234)
df_fear=df[df.label=='fear'].sample(min_samples, random_state=234)
df_love=df[df.label=='love'].sample(min_samples, random_state=234)
df_surprise=df[df.label=='surprise'].sample(min_samples, random_state=234)

In [45]:
df_balanced = pd.concat([df_joy,df_sadness,df_anger,df_fear,df_love,df_surprise])

In [46]:
df_balanced.sample(5)

Unnamed: 0,sentence,label
4171,i do enjoy large bold prints and i suppose its...,fear
941,i expected but it did feel hopeful and it defi...,joy
8685,i hope to use this blog site to put my feeling...,joy
692,im more comfortable in a relationship because ...,love
4832,i was feeling particularly bitchy and i dont t...,anger


In [47]:
df_balanced['label'].value_counts()

label
joy         719
sadness     719
anger       719
fear        719
love        719
surprise    719
Name: count, dtype: int64

In [48]:
target = {'joy':0, 'sadness':1, 'anger':2 , 'fear' : 3, 'love' : 4 , 'surprise' :5}
df_balanced['label_num']=df_balanced.label.map(target)

In [49]:
df_balanced.sample(10)

Unnamed: 0,sentence,label,label_num
1266,i do things according to my own feelings intui...,sadness,1
353,im trying to wein off them with doctors guidan...,fear,3
10335,im happy to report that im not feeling too pet...,anger,2
1382,i cannot even begin to express in words the de...,surprise,5
4803,i feel like if you can t admit that you ve alw...,surprise,5
9557,i feel that so many might be far too eager to ...,joy,0
8874,i don t have the longevity or experience in th...,surprise,5
2586,i know you feel tortured reading this,anger,2
1850,i believe a lot of people can feel this way no...,love,4
12013,i always feel as though it is my role in life ...,love,4


Modelling without Pre-processing Text data


Importing the neccessary model

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(
df_balanced['sentence'],
    df_balanced['label_num'],
    test_size=0.2,
    stratify= df_balanced['label_num'] 
)


Attempt 1 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with only trigrams.
use RandomForest as the classifier.
print the classification report.

In [51]:
from sklearn.pipeline import Pipeline 
from sklearn.metrics import classification_report 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier


clf= Pipeline([
    ('CountVectorizer', CountVectorizer(ngram_range=(3,3))),
    ('Model', RandomForestClassifier())
])


#model fit
clf.fit(X_train,y_train)

y_preds= clf.predict(X_test)




In [52]:
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.34      0.07      0.12       144
           1       0.21      0.05      0.08       143
           2       0.36      0.12      0.19       144
           3       0.18      0.86      0.30       144
           4       0.52      0.08      0.14       144
           5       0.63      0.15      0.25       144

    accuracy                           0.22       863
   macro avg       0.37      0.22      0.18       863
weighted avg       0.37      0.22      0.18       863



Attempt 2 :

using the sklearn pipeline module create a classification pipeline to classify the data.
Note:

using CountVectorizer with both unigram and bigrams.
use Multinomial Naive Bayes as the classifier.
print the classification report.

In [53]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('Vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('Model',MultinomialNB() )
])

#train the model 

clf.fit(X_train,y_train)

#predcit 
y_preds=clf.predict(X_test)

#print classification_report 
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.66      0.55      0.60       144
           1       0.64      0.55      0.59       143
           2       0.78      0.67      0.72       144
           3       0.70      0.63      0.66       144
           4       0.60      0.82      0.69       144
           5       0.72      0.85      0.78       144

    accuracy                           0.68       863
   macro avg       0.68      0.68      0.67       863
weighted avg       0.68      0.68      0.67       863



Attempt 3 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with both unigram and Bigrams.
use RandomForest as the classifier.
print the classification report.

In [56]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

clf= Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('Model',RandomForestClassifier())
])

#train the model
clf.fit(X_train, y_train)

#predict
y_preds = clf.predict(X_test)

#report 
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.77      0.74      0.76       144
           1       0.73      0.75      0.74       143
           2       0.85      0.85      0.85       144
           3       0.85      0.73      0.78       144
           4       0.86      0.92      0.89       144
           5       0.87      0.95      0.91       144

    accuracy                           0.82       863
   macro avg       0.82      0.82      0.82       863
weighted avg       0.82      0.82      0.82       863



Attempt 4 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using TF-IDF vectorizer for Pre-processing the text.
use RandomForest as the classifier.
print the classification report.

In [60]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


clf = Pipeline([
    ('Vectorizer',TfidfVectorizer()),
    ('Model', RandomForestClassifier())
])


#training the model

clf.fit(X_train, y_train)

#predict 
y_preds = clf.predict(X_test)

#report

print(classification_report(y_test,y_preds))


              precision    recall  f1-score   support

           0       0.62      0.66      0.64       144
           1       0.65      0.62      0.64       143
           2       0.83      0.83      0.83       144
           3       0.90      0.74      0.81       144
           4       0.87      0.90      0.88       144
           5       0.88      0.98      0.92       144

    accuracy                           0.79       863
   macro avg       0.79      0.79      0.79       863
weighted avg       0.79      0.79      0.79       863



In [None]:
#####

Using text pre-processing to remove stop words, punctuations and apply lemmatization


In [65]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    doc= nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)
        
    


In [66]:
preprocess('Hey, AD, I love you so much.')

'hey AD love'

In [69]:
df_balanced['sentence_preprocessed']= df_balanced.sentence.apply(preprocess)


Unnamed: 0,sentence,label,label_num,sentence_preprocessed
7123,i feel relieved to have the big moving of furn...,joy,0,feel relieved big moving furniture
840,i feel as if i am naturally talented in though...,joy,0,feel naturally talente know need improve
9880,i continue to feel amazing and feel zero alarm...,joy,0,continue feel amazing feel zero alarm prospect...
4887,i worried over the feeling of supposed to bein...,joy,0,worry feeling suppose church rich dr
1783,i feel so respected now,joy,0,feel respected


In [71]:
df_balanced.sample(10)

Unnamed: 0,sentence,label,label_num,sentence_preprocessed
7544,i feel like i should be hated and that everyth...,anger,2,feel like hate happen deserve
10300,i reshaped the workout slightly because my lef...,love,4,reshape workout slightly left upper arm feel t...
879,i feel more amazed and more thankful for havin...,surprise,5,feel amazed thankful have e life
1835,i am now in cyprus seeing my timeline so visib...,sadness,1,cyprus see timeline visibly ask feel stress ho...
3179,i feel strange with the judge passing sentence...,surprise,5,feel strange judge pass sentence manner
15560,i am glad that the exhibition closed during sp...,joy,0,glad exhibition close spring time new leave co...
13527,i finished blogging i was feeling shaky and ch...,fear,3,finish blogge feel shaky check level
15703,i think we all feel a bit disappointed to miss...,sadness,1,think feel bit disappointed miss point today
5132,i still feel a bit overwhelmed,fear,3,feel bit overwhelmed
957,i feel anxious and off,fear,3,feel anxious


In [72]:
X_train, X_test, y_train, y_test = train_test_split(
df_balanced.sentence_preprocessed,
    df_balanced.label_num,
    test_size=0.2,
    stratify= df_balanced.label_num    
)

Let's check the scores with our best model till now

Random Forest
Attempt1 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with both unigrams and bigrams.
use RandomForest as the classifier.
print the classification report.

In [75]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

clf= Pipeline([
    ('Vectorizer',CountVectorizer(ngram_range=(1,2))),
     ('Model',  RandomForestClassifier())
])

#train
clf.fit(X_train, y_train)

#test
y_preds=clf.predict(X_test)

#result

print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.78      0.62      0.69       144
           1       0.59      0.80      0.68       143
           2       0.87      0.76      0.81       144
           3       0.90      0.76      0.82       144
           4       0.82      0.84      0.83       144
           5       0.87      0.97      0.92       144

    accuracy                           0.79       863
   macro avg       0.81      0.79      0.79       863
weighted avg       0.81      0.79      0.79       863



In [None]:
Attempt 2 :

using the sklearn pipeline module create a classification pipeline to classify the data.
Note:

using TF-IDF vectorizer for pre-processing the text.
use RandomForest as the classifier.
print the classification report.

In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report 

clf= Pipeline([
    ('Vectorizer',TfidfVectorizer()),
    ('Model', RandomForestClassifier())
])


#train
clf.fit(X_train,y_train)

#test
y_preds= clf.predict(X_test)

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.60      0.69      0.64       144
           1       0.72      0.66      0.69       143
           2       0.89      0.78      0.83       144
           3       0.92      0.75      0.82       144
           4       0.79      0.87      0.83       144
           5       0.85      0.97      0.91       144

    accuracy                           0.79       863
   macro avg       0.79      0.79      0.79       863
weighted avg       0.79      0.79      0.79       863

