In [25]:
!pip install -r requirements.txt




[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
# --- Imports ---
import pandas as pd

# Read the tab-separated, header-less dataset and name its three columns
df_intent = pd.read_csv("data/intent_emotion.csv", sep="\t", header=None).set_axis(
    ["text", "intent", "emotion"], axis=1
)

# Sanity check: preview the first rows, then the class balance of each label column
print(df_intent.head())
for label_column in ("intent", "emotion"):
    print(df_intent[label_column].value_counts())


                                    text    intent  emotion
0                               hi there  greeting    happy
1  can you tell me ways to manage stress       faq  neutral
2                     thanks for talking  farewell  neutral
3                 i feel so tense lately     other  anxious
4                           good morning  greeting    happy
intent
faq         119
other        87
greeting     69
farewell     59
Name: count, dtype: int64
emotion
neutral      80
suicidal     73
happy        61
depressed    49
anxious      43
angry        28
Name: count, dtype: int64


In [27]:
import re
from spellchecker import SpellChecker
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below (tokenizer models, stop-word list, WordNet)
for nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Shared, module-level helpers reused by preprocess_text
spell = SpellChecker()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatised content words.

    Pipeline: lowercase -> delete every character that is not a-z or
    whitespace -> NLTK word tokenisation -> spell-correct tokens that are not
    stop words -> drop stop words -> WordNet lemmatisation.

    Args:
        text: The raw input. Anything that is not a non-blank ``str`` yields "".

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    checked = []
    for token in nltk.word_tokenize(letters_only):
        # Only non-blank, non-stop-word tokens are sent to the spell checker
        if token not in stop_words and token.strip() != "":
            suggestion = spell.correction(token)
            # pyspellchecker may return None; keep the original token in that case
            if suggestion is not None:
                token = suggestion
        checked.append(token)

    # Stop words are filtered after correction, so a correction that turns a
    # token into a stop word is removed as well.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in checked if token not in stop_words
    )


# Add the cleaned text as a new column and preview it next to the raw text
df_intent['clean_text'] = df_intent['text'].map(preprocess_text)
print(df_intent.loc[:, ['text', 'clean_text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\limda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\limda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\limda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\limda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                    text              clean_text
0                               hi there                      hi
1  can you tell me ways to manage stress  tell way manage stress
2                     thanks for talking          thanks talking
3                 i feel so tense lately       feel tense lately
4                           good morning            good morning


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# ---------------------------
# 3: Shared training routine
# ---------------------------
def train_text_classifier(texts, labels, title, test_size=0.2, random_state=42, max_iter=200):
    """Fit and evaluate a TF-IDF + logistic-regression text classifier.

    The data is split with stratification on ``labels``, the pipeline is
    fitted on the training part, and a classification report for the held-out
    part is printed under a ``=== <title> ===`` banner.

    Args:
        texts: Sequence of (pre-processed) input strings.
        labels: Class label for each entry of ``texts``.
        title: Heading printed above the classification report.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed passed to ``train_test_split``.
        max_iter: Iteration cap passed to ``LogisticRegression``.

    Returns:
        A tuple ``(pipeline, (X_train, X_test, y_train, y_test), y_pred)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=max_iter)),
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    print(f"=== {title} ===")
    print(classification_report(y_test, y_pred))

    return pipeline, (X_train, X_test, y_train, y_test), y_pred


# ---------------------------
# 3a: Intent Classifier
# ---------------------------
X_intent = df_intent['clean_text']
y_intent = df_intent['intent']  # 'faq', 'greeting', 'farewell', 'other'

intent_pipeline, (X_train_i, X_test_i, y_train_i, y_test_i), y_pred_i = train_text_classifier(
    X_intent, y_intent, "Intent Classification"
)

# ---------------------------
# 3b: Emotion Classifier
# ---------------------------
X_emotion = df_intent['clean_text']
# 'neutral', 'suicidal', 'happy', 'depressed', 'anxious', 'angry'
y_emotion = df_intent['emotion']

emotion_pipeline, (X_train_e, X_test_e, y_train_e, y_test_e), y_pred_e = train_text_classifier(
    X_emotion, y_emotion, "Emotion Classification"
)


=== Intent Classification ===
              precision    recall  f1-score   support

         faq       0.83      0.83      0.83        24
    farewell       1.00      1.00      1.00        12
    greeting       1.00      0.93      0.96        14
       other       0.72      0.76      0.74        17

    accuracy                           0.87        67
   macro avg       0.89      0.88      0.88        67
weighted avg       0.87      0.87      0.87        67

=== Emotion Classification ===
              precision    recall  f1-score   support

       angry       1.00      0.60      0.75         5
     anxious       1.00      0.56      0.71         9
   depressed       0.38      0.30      0.33        10
       happy       0.62      0.67      0.64        12
     neutral       0.42      0.62      0.50        16
    suicidal       0.64      0.60      0.62        15

    accuracy                           0.57        67
   macro avg       0.67      0.56      0.59        67
weighted avg    

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the tab-separated, header-less FAQ file and name its two columns
df_faq = pd.read_csv("data/faq_qa.csv", sep="\t", header=None).set_axis(
    ["question", "answer"], axis=1
)

# Clean the FAQ questions with the same function used for the training text
df_faq['clean_question'] = df_faq['question'].map(preprocess_text)

# Fit a dedicated TF-IDF vocabulary on the FAQ questions and keep their vectors
faq_vectorizer = TfidfVectorizer()
faq_vectors = faq_vectorizer.fit_transform(df_faq['clean_question'])

# Opening sentence prepended to a reply, keyed by predicted emotion.
# Emotions without an entry (e.g. "neutral") get no prefix.
emotion_templates = dict(
    anxious="Take a deep breath and calm down. ",
    depressed="I understand you feel down. ",
    happy="Glad to hear from you! ",
    angry="I hear your frustration. ",
    suicidal="Please reach out to a professional immediately: ",
)

# Function to get chatbot response
def _top_prediction(pipeline, text):
    """Return ``(label, probability)`` of the most probable class for one text."""
    proba = pipeline.predict_proba([text])[0]
    top = proba.argmax()
    return pipeline.classes_[top], proba[top]


def get_response(user_input, intent_threshold=0.4, emotion_threshold=0.3):
    """Build the chatbot reply for a single user message.

    The message is cleaned with ``preprocess_text`` and classified by
    ``intent_pipeline`` and ``emotion_pipeline``. A low-confidence intent
    becomes "fallback"; a low-confidence emotion becomes "neutral". Except
    for the fallback reply, the answer is prefixed with the matching entry of
    ``emotion_templates`` (no prefix if the emotion has no entry).

    Args:
        user_input: Raw text typed by the user.
        intent_threshold: Minimum top-class probability for the predicted
            intent to be trusted.
        emotion_threshold: Minimum top-class probability for the predicted
            emotion to be trusted.

    Returns:
        The response string.
    """
    clean_input = preprocess_text(user_input)
    fallback_reply = "I'm not sure I understand. Can you rephrase your question?"

    # Predict intent
    intent, intent_conf = _top_prediction(intent_pipeline, clean_input)
    if intent_conf < intent_threshold:
        intent = "fallback"

    # Predict emotion
    emotion, emotion_conf = _top_prediction(emotion_pipeline, clean_input)
    if emotion_conf < emotion_threshold:
        emotion = "neutral"

    if intent == "fallback":
        # NOTE(review): the emotion prefix is skipped here, so a message
        # classified as "suicidal" with an uncertain intent receives only the
        # generic reply -- confirm this is the intended behaviour.
        return fallback_reply

    prefix = emotion_templates.get(emotion, '')

    if intent == "faq":
        user_vector = faq_vectorizer.transform([clean_input])
        # Compare against the FAQ vectors computed once at load time instead
        # of re-transforming the whole FAQ corpus on every call.
        similarities = cosine_similarity(user_vector, faq_vectors)[0]
        best_match = similarities.argmax()
        # A best score of 0 means no FAQ question shares a term with the
        # input; argmax would then arbitrarily select the first FAQ row.
        if similarities[best_match] <= 0:
            return fallback_reply
        answer = df_faq.iloc[best_match]['answer']
        return f"{prefix}{answer}"

    if intent == "greeting":
        return f"{prefix}Hello! How can I help you today?"

    if intent == "farewell":
        return f"{prefix}Goodbye! Take care."

    # Any remaining intent (e.g. "other")
    return f"{prefix}I'm here to listen. Tell me more."


In [30]:
import pickle
import os

# Make sure the output folder exists before writing into it
os.makedirs("model", exist_ok=True)

# Pickle each fitted object to its own file: intent pipeline, emotion
# pipeline, then the FAQ vectorizer
artifacts = {
    "model/intent_pipeline.pkl": intent_pipeline,
    "model/emotion_pipeline.pkl": emotion_pipeline,
    "model/faq_vectorizer.pkl": faq_vectorizer,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

# The preprocessed FAQ table is stored with pandas' own pickle writer
df_faq.to_pickle("model/faq_dataset.pkl")

print("All models and data saved successfully!")


All models and data saved successfully!
