In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
from google.colab import files
# Open Colab's file-upload widget and keep whatever it returns in `uploaded`.
# NOTE(review): presumably a mapping keyed by uploaded file name - confirm against the Colab docs.
uploaded = files.upload()


Saving archive (8).zip to archive (8).zip


In [None]:
# Step 2: Extract the ZIP file
import zipfile
import os

# Name used when no freshly uploaded archive can be identified.
FALLBACK_ZIP = "archive (8).zip"
EXTRACT_DIR = "dataset"

# The literal "archive (8).zip" only matches one particular browser download,
# so prefer the archive name reported by the upload cell and fall back to the
# historical hard-coded name otherwise.
try:
    zip_candidates = [name for name in uploaded if name.lower().endswith(".zip")]
except NameError:
    # The upload cell was not run in this session; rely on a file already on disk.
    zip_candidates = []

zip_path = zip_candidates[0] if zip_candidates else FALLBACK_ZIP

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_DIR)

# See what files are inside
os.listdir(EXTRACT_DIR)


['test.txt', 'val.txt', 'train.txt']

In [None]:
import os

# Contents of the notebook's current working directory
root_entries = os.listdir()
print("Root folder:", root_entries)

# Contents of the 'dataset' folder the ZIP was extracted into
dataset_entries = os.listdir("dataset")
print("\nInside dataset folder:", dataset_entries)


Root folder: ['.config', 'archive (8).zip', 'dataset', 'sample_data']

Inside dataset folder: ['test.txt', 'val.txt', 'train.txt']


In [None]:
import pandas as pd

# Load the text files (they are inside the "dataset" folder).
# Each line has the form "<text>;<emotion>", so the field delimiter is ";".
# Reading with a tab delimiter put the whole line into `text` and left every
# `emotion` value as NaN.
train_df = pd.read_csv("dataset/train.txt", sep=";", names=["text", "emotion"])
test_df  = pd.read_csv("dataset/test.txt", sep=";", names=["text", "emotion"])
val_df   = pd.read_csv("dataset/val.txt", sep=";", names=["text", "emotion"])

# Fail fast if a label column came back completely empty: that means the
# delimiter did not match the file format and everything downstream is useless.
for split_name, split_df in [("train", train_df), ("test", test_df), ("val", val_df)]:
    assert split_df["emotion"].notna().any(), (
        f"{split_name}: no emotion labels were parsed - check the delimiter"
    )

# Preview data
print("Train Data Sample:")
print(train_df.head())

print("\nShapes:")
print("Train:", train_df.shape)
print("Test:", test_df.shape)
print("Validation:", val_df.shape)


Train Data Sample:
                                                text  emotion
0                    i didnt feel humiliated;sadness      NaN
1  i can go from feeling so hopeless to so damned...      NaN
2  im grabbing a minute to post i feel greedy wro...      NaN
3  i am ever feeling nostalgic about the fireplac...      NaN
4                         i am feeling grouchy;anger      NaN

Shapes:
Train: (16000, 2)
Test: (2000, 2)
Validation: (2000, 2)


In [None]:
# Stack the train, test and validation frames into one frame with a fresh index
frames_to_stack = [train_df, test_df, val_df]
data = pd.concat(frames_to_stack, ignore_index=True)

# Preview
print(data.head())
print(data.shape)


                                                text  emotion
0                    i didnt feel humiliated;sadness      NaN
1  i can go from feeling so hopeless to so damned...      NaN
2  im grabbing a minute to post i feel greedy wro...      NaN
3  i am ever feeling nostalgic about the fireplac...      NaN
4                         i am feeling grouchy;anger      NaN
(20000, 2)


In [None]:
import re

def clean_text(text):
    """Normalise one piece of text before vectorisation.

    Every character that is neither an ASCII letter nor whitespace is removed
    (not replaced by a space); the result is then lower-cased and stripped of
    leading and trailing whitespace.

    Args:
        text: The value to clean. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # Without this guard str(None) / str(nan) would turn a missing value into
    # the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))   # keep only letters and spaces
    return letters_only.lower().strip()

# Add a cleaned copy of every text as a new column, then show the first rows
data['clean_text'] = data['text'].map(clean_text)
data.head(5)


Unnamed: 0,text,emotion,clean_text
0,i didnt feel humiliated;sadness,,i didnt feel humiliatedsadness
1,i can go from feeling so hopeless to so damned...,,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wro...,,im grabbing a minute to post i feel greedy wro...
3,i am ever feeling nostalgic about the fireplac...,,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy;anger,,i am feeling grouchyanger


In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts; targets are the emotion labels
X, y = data['clean_text'], data['emotion']

# Hold out 20% of the rows for testing, using a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser limited to 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training texts, then transform the test texts with the fitted vectoriser
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [None]:
# Keep only the rows that have an emotion label and renumber them from 0
data = data.dropna(subset=['emotion']).reset_index(drop=True)

# Report remaining missing values per column and the resulting size
missing_per_column = data.isnull().sum()
print(missing_per_column)
print("âœ… Cleaned data shape:", data.shape)


text          0
emotion       0
clean_text    0
dtype: int64
âœ… Cleaned data shape: (0, 3)


In [None]:
import pandas as pd

# Read all data (without splitting first)
def _read_raw_lines(path):
    """Return a one-column frame (`raw`) holding each non-blank line of `path`.

    The lines are read verbatim rather than through a CSV parser, so commas or
    quote characters inside a sentence cannot be mistaken for field delimiters.
    """
    with open(path, encoding="utf-8") as fh:
        lines = [line.rstrip("\r\n") for line in fh if line.strip()]
    return pd.DataFrame({"raw": lines})

train_df = _read_raw_lines("dataset/train.txt")
test_df  = _read_raw_lines("dataset/test.txt")
val_df   = _read_raw_lines("dataset/val.txt")

# Combine
data = pd.concat([train_df, test_df, val_df], ignore_index=True)

# Split on the LAST semicolon: the label is the final field, so a semicolon
# inside the sentence must stay with the text (some rows may not have one).
data[['text', 'emotion']] = data['raw'].str.rsplit(";", n=1, expand=True)

# Drop rows where emotion is missing
data = data.dropna(subset=['emotion']).reset_index(drop=True)

# Clean text
import re
def clean_text(text):
    """Normalise one piece of text before vectorisation.

    Every character that is neither an ASCII letter nor whitespace is removed
    (not replaced by a space); the result is then lower-cased and stripped of
    leading and trailing whitespace.

    Args:
        text: The value to clean. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # Without this guard str(None) / str(nan) would turn a missing value into
    # the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return letters_only.lower().strip()

# Store the cleaned version of every text in its own column
data['clean_text'] = data['text'].map(clean_text)

# Quick look at the result: first rows, label distribution, overall size
print(data.head())
emotion_counts = data['emotion'].value_counts()
print(emotion_counts)
print("âœ… Final shape:", data.shape)



                                                 raw  \
0                    i didnt feel humiliated;sadness   
1  i can go from feeling so hopeless to so damned...   
2  im grabbing a minute to post i feel greedy wro...   
3  i am ever feeling nostalgic about the fireplac...   
4                         i am feeling grouchy;anger   

                                                text  emotion  \
0                            i didnt feel humiliated  sadness   
1  i can go from feeling so hopeless to so damned...  sadness   
2   im grabbing a minute to post i feel greedy wrong    anger   
3  i am ever feeling nostalgic about the fireplac...     love   
4                               i am feeling grouchy    anger   

                                          clean_text  
0                            i didnt feel humiliated  
1  i can go from feeling so hopeless to so damned...  
2   im grabbing a minute to post i feel greedy wrong  
3  i am ever feeling nostalgic about the fireplac...

In [None]:
from sklearn.model_selection import train_test_split
# Model input: cleaned text. Model target: emotion label.
X = data['clean_text']
y = data['emotion']

# 80/20 split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [None]:
from sklearn.model_selection import train_test_split

# Cleaned texts are the features, emotion labels the targets
X, y = data['clean_text'], data['emotion']

# Split once, then unpack; this reassigns X_train / X_test / y_train / y_test
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser with at most 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# fit_transform on the training texts first, transform on the test texts second
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier allowed up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)

# Train on the TF-IDF features of the training split; the call is the cell's last expression
model.fit(X=X_train_tfidf, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict a label for every held-out text
y_pred = model.predict(X_test_tfidf)

# Overall accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print("âœ… Accuracy:", test_accuracy)

# Per-class report
report_text = classification_report(y_test, y_pred)
print("\nðŸ“‹ Classification Report:\n", report_text)


âœ… Accuracy: 0.8625

ðŸ“‹ Classification Report:
               precision    recall  f1-score   support

       anger       0.91      0.82      0.86       536
        fear       0.83      0.74      0.79       458
         joy       0.83      0.95      0.89      1339
        love       0.86      0.64      0.74       335
     sadness       0.89      0.94      0.91      1173
    surprise       0.89      0.50      0.64       159

    accuracy                           0.86      4000
   macro avg       0.87      0.77      0.80      4000
weighted avg       0.86      0.86      0.86      4000



In [None]:
# Raw example sentence(s) to classify
sample = ["I feel so proud and happy today!"]

# The vectoriser was fitted on the `clean_text` column, so run the sample
# through the same `clean_text` normalisation before transforming it.
sample_clean = [clean_text(sentence) for sentence in sample]
sample_tfidf = vectorizer.transform(sample_clean)
print("Predicted Emotion:", model.predict(sample_tfidf)[0])


Predicted Emotion: joy


In [None]:
# Let the user type their own text
user_input = input("Enter a sentence to detect emotion: ")

# Clean the input the same way as training data.
# The `clean_text` function defined in the data-preparation cell is reused
# here instead of being defined again, so training and inference cannot drift
# apart.
cleaned_input = clean_text(user_input)

if not cleaned_input:
    # The input held no letters (empty, digits or punctuation only), so there
    # is nothing for the vectoriser to match; skip the prediction.
    predicted_emotion = None
    print("\nNo letters found in the input - nothing to classify.")
else:
    # Convert to TF-IDF form
    input_tfidf = vectorizer.transform([cleaned_input])

    # Predict emotion
    predicted_emotion = model.predict(input_tfidf)[0]

    print(f"\nðŸŽ¯ Predicted Emotion: {predicted_emotion}")


Enter a sentence to detect emotion: i am having a bad day

ðŸŽ¯ Predicted Emotion: sadness
