In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/codebasics/nlp-tutorials/main/12_tf_idf/Emotion_classify_Data.csv')

In [3]:
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [4]:
df.isnull().sum()

Comment    0
Emotion    0
dtype: int64

In [5]:
df.Emotion.value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

encoder=  LabelEncoder()

df['emotion_num'] = encoder.fit_transform(df.Emotion)

In [7]:
df.head()

Unnamed: 0,Comment,Emotion,emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,2
4,i feel suspicious if there is no one outside l...,fear,1


In [9]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [10]:
def preprocess(text):
    doc = nlp(text)
    
    filtered_tokens = [token.lemma_ for token in doc if not token.is_stop or not token.is_punct]
    return ' '.join(filtered_tokens)

df['text'] = df['Comment'].apply(preprocess)

In [12]:
df.head()

Unnamed: 0,Comment,Emotion,emotion_num,text
0,i seriously hate one subject to death but now ...,fear,1,I seriously hate one subject to death but now ...
1,im so full of life i feel appalled,anger,0,I m so full of life I feel appal
2,i sit here to write i start to dig out my feel...,fear,1,I sit here to write I start to dig out my feel...
3,ive been really angry with r and i feel like a...,joy,2,I ve be really angry with r and I feel like an...
4,i feel suspicious if there is no one outside l...,fear,1,I feel suspicious if there be no one outside l...


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text, df.emotion_num, test_size=0.2)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', MultinomialNB())
])

clf.fit(X_train, y_train)

In [15]:
from sklearn.metrics import classification_report

print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       404
           1       0.91      0.89      0.90       395
           2       0.91      0.89      0.90       389

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



In [21]:
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', RandomForestClassifier(n_estimators=100))
])

clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90       404
           1       0.93      0.89      0.91       395
           2       0.87      0.96      0.91       389

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [23]:
from sklearn.feature_extraction.text import CountVectorizer


clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', RandomForestClassifier(n_estimators=200))
])

clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       404
           1       0.95      0.90      0.92       395
           2       0.89      0.97      0.93       389

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.92      1188
weighted avg       0.93      0.93      0.92      1188



In [20]:
from sklearn.neighbors import KNeighborsClassifier

clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', KNeighborsClassifier())
])

clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.66      0.79      0.72       404
           1       0.71      0.73      0.72       395
           2       0.83      0.64      0.72       389

    accuracy                           0.72      1188
   macro avg       0.74      0.72      0.72      1188
weighted avg       0.73      0.72      0.72      1188

