## Import libraries

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub

## Import dataset

In [28]:
path = kagglehub.dataset_download("rahmasleam/emotions-rml-dataset")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/emotions-rml-dataset


In [29]:
dataset = pd.read_csv(path + '/Emotion_classify_Data.csv')
len(dataset)

5937

In [30]:
dataset.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [31]:
print(dataset['Emotion'].unique())

['fear' 'anger' 'joy']


## Clean text

In [32]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []

for i in range(0, 5937):
  comment = re.sub('^[a-zA-z]', ' ', dataset['Comment'][i])
  comment = comment.lower()
  comment = comment.split()

  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  all_stopwords.remove('no')

  comment = [ps.stem(word) for word in comment if not word in set(all_stopwords)]
  comment = ' '.join(comment)
  corpus.append(comment)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
print(np.array(corpus[0:5]).reshape(-1, 1))

[['serious hate one subject death feel reluct drop']
 ['full life feel appal']
 ['sit write start dig feel think afraid accept possibl might not make']
 ['realli angri r feel like idiot trust first place']
 ['feel suspici no one outsid like raptur happen someth']]


## Create Bag of Words model

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

In [35]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = dataset.iloc[:, -1].values
y = le.fit_transform(y)
print(y[0:5])

[1 0 1 2 1]


In [36]:
print(corpus[0], (y[0]))

serious hate one subject death feel reluct drop 1


## Split dataset into training and test set

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Train and compare models

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

models = [LogisticRegression(),
          MultinomialNB(),
          RandomForestClassifier(n_estimators=150)]

for model in models:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(f"{model.__class__.__name__}")
  cm = confusion_matrix(y_test, y_pred)
  print(cm)
  acc = accuracy_score(y_test, y_pred)
  print(acc)

LogisticRegression
[[367  27  22]
 [ 12 351  10]
 [  3   7 389]]
0.9318181818181818
MultinomialNB
[[354  35  27]
 [ 19 335  19]
 [ 22  27 350]]
0.8745791245791246
RandomForestClassifier
[[383  26   7]
 [ 13 358   2]
 [  8  13 378]]
0.9419191919191919
