In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Emotion_classify_Data.csv")

In [12]:
df.shape

(5937, 3)

In [3]:
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [4]:
df.Emotion.value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [5]:
df['Emotion_num'] = df.Emotion.map({
    "joy":0,
    "fear":1,
    "anger":2
})

In [6]:
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1


## Modelling without Pre-processing Text data

In [8]:
from sklearn.model_selection import train_test_split

# X is your feature matrix (input data)
# y is your target variable (labels or outputs)

# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(df.Comment, df.Emotion_num, test_size=0.2, random_state=2022, stratify=df.Emotion_num)


In [9]:
X_train.shape

(4749,)

In [11]:
X_test.shape

(1188,)

# RandomForest

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Create the classification pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.60      0.26      0.36       400
           1       0.37      0.82      0.51       388
           2       0.54      0.21      0.31       400

    accuracy                           0.43      1188
   macro avg       0.50      0.43      0.39      1188
weighted avg       0.51      0.43      0.39      1188



In [15]:

# Create the classification pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90       400
           1       0.95      0.88      0.91       388
           2       0.92      0.86      0.89       400

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188



In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Create the classification pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.94      0.90       400
           1       0.92      0.89      0.91       388
           2       0.92      0.86      0.89       400

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



# MultinomialNB

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Create the classification pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.86      0.87       400
           1       0.87      0.83      0.85       388
           2       0.83      0.88      0.85       400

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



### Use text pre-processing to remove stop words, punctuations and apply lemmatization


In [17]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [18]:
df['preprocessed_txt'] = df['Comment'].apply(preprocess)

In [19]:
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num,preprocessed_txt
0,i seriously hate one subject to death but now ...,fear,1,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,2,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,1,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,0,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,1,feel suspicious outside like rapture happen


## Build a model with pre processed text



In [20]:
X_train, X_test, y_train, y_test = train_test_split(df.preprocessed_txt, df.Emotion_num, test_size=0.2, random_state=2022, stratify=df.Emotion_num)


In [21]:
# Create the classification pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       400
           1       0.94      0.91      0.92       388
           2       0.92      0.94      0.93       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [22]:

# Create the classification pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       400
           1       0.93      0.91      0.92       388
           2       0.94      0.91      0.92       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [23]:
# Final Observations
As part of this exercise we have trained the data with algorithms like Multinomial Naive Bayes and Random Forest which are most used and provide good results for text related problems.

As Machine learning algorithms do not work on text data directly, we need to convert them into numeric vectors and feed that into models while training. For this purpose, we have used Bag of words(unigrams, bigrams, n-grams) and TF-IDF text representation techniques.

Key Findings

As the n_gram range keeps increasing, there's drastic fall of improvement in performance metrics.

There's seen a significant improvement in results before pre-processing and after pre-processing the data.

TF-IDF and Bag of words both performed equally well in performance metrics like Recall and F1-score.

Random Forest performed quite well when compared to Multinomial Naive Bayes.

SyntaxError: unterminated string literal (detected at line 8) (2903283625.py, line 8)