In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


In [2]:
dff = pd.read_csv('emotions.csv')

dff["label"].replace(0, "sadness", inplace=True)
dff["label"].replace(1, "joy", inplace=True)
dff["label"].replace(2, "love", inplace=True)
dff["label"].replace(3, "anger", inplace=True)
dff["label"].replace(4, "fear", inplace=True)
dff["label"].replace(5, "surprise", inplace=True)

# Display first few rows
print(dff.head())

df =  dff[:5000]


                                                text    label
0      i just feel really helpless and heavy hearted     fear
1  ive enjoyed being able to slouch about relax a...  sadness
2  i gave up my internship with the dmrg and am f...     fear
3                         i dont know i feel so lost  sadness
4  i am a kindergarten teacher and i am thoroughl...     fear


In [3]:
# Check the size and shape of the dataset
print(f"Dataset Shape: {df.shape}")

# Check for missing values
print(df.isnull().sum())

# Print the distribution of the target variable (emotions)
print(df['label'].value_counts())

# Show data types and summary statistics
print(df.info())
print(df.describe())


Dataset Shape: (5000, 2)
text     0
label    0
dtype: int64
label
joy         1704
sadness     1434
anger        696
fear         596
love         382
surprise     188
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB
None
                                                     text label
count                                                5000  5000
unique                                               4999     6
top     i feel like if i continue this i will turn int...   joy
freq                                                    2  1704


In [4]:
# Initialize the Spacy NLP model
nlp = spacy.load('en_core_web_sm')


In [5]:
# Preprocessing function to clean text
def preprocess_text(text):
    doc = nlp(text.lower())  # Convert to lowercase
    tokens = [token.lemma_ for token in doc if token.text not in STOP_WORDS and token.is_alpha]  # Remove stopwords and keep only alphabetic tokens
    return ' '.join(tokens)


In [6]:
# Apply preprocessing to the text data
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Display the cleaned text
print(df['cleaned_text'].head())

0                          feel helpless heavy hearted
1    ve enjoy able slouch relax unwind frankly need...
2                 give internship dmrg feel distraught
3                                   not know feel lost
4    kindergarten teacher thoroughly weary job havi...
Name: cleaned_text, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['text'].apply(preprocess_text)


In [7]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Transform the cleaned text data into features
X = tfidf_vectorizer.fit_transform(df['cleaned_text']).toarray()

# Target labels (emotions)
y = df['label']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of training and testing sets
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (4000, 1000)
Testing data shape: (1000, 1000)


In [8]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train, y_train)

# Predict emotions on the test set
y_pred = nb_classifier.predict(X_test)


In [9]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification report (precision, recall, F1-score for each emotion)
print(classification_report(y_test, y_pred))


Accuracy: 70.20%
              precision    recall  f1-score   support

       anger       0.95      0.51      0.66       144
        fear       0.87      0.50      0.64       117
         joy       0.64      0.93      0.75       336
        love       0.93      0.16      0.28        86
     sadness       0.70      0.87      0.77       282
    surprise       1.00      0.03      0.06        35

    accuracy                           0.70      1000
   macro avg       0.85      0.50      0.53      1000
weighted avg       0.76      0.70      0.67      1000



In [14]:
# Example prediction
new_text = ["He seemed mesmerized"]

# Preprocess the new text
new_text_processed = preprocess_text(new_text[0])

# Transform the text using the trained TF-IDF Vectorizer
new_text_features = tfidf_vectorizer.transform([new_text_processed])

# Predict the emotion
predicted_emotion = nb_classifier.predict(new_text_features)
print(f"Predicted Emotion: {predicted_emotion[0]}")


Predicted Emotion: joy
