# Emotion Analysis Model Training
This notebook trains a TF-IDF + Logistic Regression model that classifies text into emotion labels.

In [1]:
import os, re, joblib
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# All paths hang off the absolute form of the current working directory.
PROJECT_ROOT = Path().resolve()
DATA_DIR = PROJECT_ROOT.joinpath('Dataset')   # the train/test/val text files are read from here
MODEL_DIR = PROJECT_ROOT.joinpath('Model')    # fitted artefacts are written here
MODEL_DIR.mkdir(exist_ok=True)                # tolerate an already-existing folder

# Cap on the TF-IDF vocabulary size (passed as max_features to the vectoriser).
MAX_FEAT = 12000

In [3]:
def clean_text(s):
    """Normalise one raw text value before negation handling / vectorisation.

    Anything that is not a ``str`` yields ``''``. For strings the following
    substitutions run in order, after which the result is trimmed and
    lower-cased:

    1. drop every ``http...`` run up to the next whitespace,
    2. turn the curly apostrophe into a straight one,
    3. replace every character that is not an ASCII letter, digit,
       whitespace, apostrophe or hyphen with a space,
    4. collapse each whitespace run into a single space.
    """
    if not isinstance(s, str):
        return ''

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'http\S+', ''),
        (r"’", "'"),
        (r"[^A-Za-z0-9\s'\-]", ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)

    return s.strip().lower()

def expand_contractions(text):
    """Expand common English contractions in ``text``.

    Matching is case-sensitive and every pattern is lower-case, so callers
    should pass lower-cased text (``clean_text`` returns lower-cased output).

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        ``text`` with the listed whole-word contractions replaced and every
        remaining ``...n't`` suffix rewritten as ``... not``
        (``"don't"`` -> ``"do not"``).
    """
    # Whole-word contractions. The irregular "can't"/"won't" must be rewritten
    # before the generic suffix rule below would turn them into "ca not"/"wo not".
    contractions = {"can't":"can not","won't":"will not",
                    "i'm":"i am","it's":"it is","that's":"that is",
                    "i've":"i have","i'd":"i would","you're":"you are"}
    for k,v in contractions.items():
        text = re.sub(r'\b'+re.escape(k)+r'\b', v, text)

    # Generic negative suffix. There is deliberately no leading \b: inside a
    # word such as "don't" the characters before and after the boundary
    # position ("o" and "n") are both word characters, so a \b-anchored
    # pattern can never match there and the contraction would stay unexpanded.
    text = re.sub(r"n't\b", " not", text)
    return text

def handle_negations(text):
    """Fold each negation word into the word that follows it.

    The text is first passed through ``expand_contractions`` and split on
    whitespace; every word is lower-cased. A *negator* is any word in
    ``neg_tokens`` or any word ending in ``n't``. A negator is removed and the
    next word is emitted as ``not_<word>``, so ``"not good"`` becomes
    ``"not_good"``. The word directly after a negator is always consumed as its
    target, even if it is itself a negator.

    A negator that is the last word has nothing to attach to; it is kept
    verbatim instead of being silently discarded, so the negation is still
    visible downstream.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The processed words joined by single spaces.
    """
    text = expand_contractions(text)
    neg_tokens = {'not','no','never','cannot','cant',"won't"}

    out = []
    pending = None  # negator still waiting for the word it applies to
    for w in text.split():
        lw = w.lower()
        if pending is not None:
            out.append('not_' + lw)
            pending = None
            continue
        if lw in neg_tokens or lw.endswith("n't"):
            pending = lw
            continue
        out.append(lw)

    if pending is not None:
        # Sentence ended on the negator (e.g. "i think not"): keep the word.
        out.append(pending)
    return ' '.join(out)

In [5]:
# Read the three ';'-separated split files; column names are supplied here
# rather than taken from a header row.
train, test, val = (
    pd.read_csv(DATA_DIR / fname, sep=';', names=['text', 'emotion'])
    for fname in ('train.txt', 'test.txt', 'val.txt')
)

# Pool all rows into a single frame with a fresh 0..n-1 index.
df = pd.concat([train, test, val], ignore_index=True)

# Normalise the text column: cast to str, clean, then tag negations.
df['text'] = df['text'].astype(str).apply(
    lambda raw: handle_negations(clean_text(raw))
)

# Encode the emotion names as integer class ids.
le = LabelEncoder()
df['label'] = le.fit_transform(df['emotion'])

In [6]:
# Stratified 80/20 hold-out split of the pooled data, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# TF-IDF over unigrams and bigrams, vocabulary capped at MAX_FEAT,
# with sublinear term-frequency scaling switched on.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=MAX_FEAT,
    sublinear_tf=True,
)

# Fit on the training texts only; the hold-out texts are merely transformed.
X_tr = vectorizer.fit_transform(X_train)
X_te = vectorizer.transform(X_test)

In [7]:
# Logistic regression on the TF-IDF features, with class weights balanced
# against the uneven emotion frequencies.
# The 'saga' solver shuffles the data while optimising, so random_state is
# pinned (same value as the train/test split) to make reruns repeatable.
model = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    solver='saga',
    random_state=42,
)
model.fit(X_tr, y_train)

# Evaluate on the hold-out split; report rows are labelled with emotion names.
pred = model.predict(X_te)
print('Accuracy:', accuracy_score(y_test, pred))
print(classification_report(y_test, pred, target_names=le.classes_))

Accuracy: 0.8585933630510153
              precision    recall  f1-score   support

       anger       0.82      0.87      0.84       545
        fear       0.86      0.79      0.83       482
         joy       0.90      0.87      0.88      1359
        love       0.71      0.86      0.78       335
     sadness       0.94      0.86      0.90      1166
    surprise       0.61      0.91      0.73       151

    accuracy                           0.86      4038
   macro avg       0.81      0.86      0.83      4038
weighted avg       0.87      0.86      0.86      4038





In [9]:
# Persist the three fitted objects needed at inference time: classifier,
# vectoriser and label encoder. The last call is left as a bare expression so
# the cell displays the path list it returns.
# NOTE: these files are pickle-based; only load them from a trusted source.
joblib.dump(value=model, filename=MODEL_DIR / 'emotion_model.pkl')
joblib.dump(value=vectorizer, filename=MODEL_DIR / 'tfidf_vectorizer.pkl')
joblib.dump(value=le, filename=MODEL_DIR / 'label_encoder.pkl')

['/content/Model/label_encoder.pkl']

In [11]:
# --- Demo Predictions ---
# A handful of hand-written sentences to sanity-check the fitted pipeline.
demo_sentences = [
    "I am extremely happy today!",
    "This is not good at all.",
    "I’m furious right now.",
    "Everything feels hopeless.",
    "I love spending time with you.",
    "I am terrified for tomorrow.",
    "That's insane, how can that happen?"
]

# Apply the same cleaning + negation tagging that the training text received.
processed_demo = [handle_negations(cleaned) for cleaned in map(clean_text, demo_sentences)]

# Vectorise once, then fetch both the predicted class ids and the per-class probabilities.
vec_demo = vectorizer.transform(processed_demo)
pred_indices = model.predict(vec_demo)
probabilities = model.predict_proba(vec_demo)

# Decode every class id back to its emotion name in a single call.
pred_labels = le.inverse_transform(pred_indices)

# Report each sentence with its label and the highest class probability as a percentage.
print("DEMO PREDICTIONS\n")
for sentence, label, prob_row in zip(demo_sentences, pred_labels, probabilities):
    confidence = round(prob_row.max() * 100, 2)
    print(f"Text: {sentence}")
    print(f"Predicted Emotion: {label} ({confidence}% confidence)")
    print("-" * 60)


DEMO PREDICTIONS

Text: I am extremely happy today!
Predicted Emotion: joy (62.71% confidence)
------------------------------------------------------------
Text: This is not good at all.
Predicted Emotion: surprise (52.91% confidence)
------------------------------------------------------------
Text: I’m furious right now.
Predicted Emotion: anger (70.22% confidence)
------------------------------------------------------------
Text: Everything feels hopeless.
Predicted Emotion: sadness (67.61% confidence)
------------------------------------------------------------
Text: I love spending time with you.
Predicted Emotion: love (35.86% confidence)
------------------------------------------------------------
Text: I am terrified for tomorrow.
Predicted Emotion: fear (78.43% confidence)
------------------------------------------------------------
Text: That's insane, how can that happen?
Predicted Emotion: surprise (84.86% confidence)
--------------------------------------------------------