In [2]:
# ## trial_improved.ipynb
# Cell 1: Setup & installs (run once)
# If you already have packages installed, skip the pip installs below.
# On macOS, prefer conda for heavy libs (lightgbm/xgboost/torch) but pip works for others.

# !pip install -U scikit-learn pandas numpy joblib tqdm nltk textblob vaderSentiment gensim sentence-transformers optuna
# Optional:
# !pip install empath                      # optional semantic lexicon (may fail)
# !pip install xgboost lightgbm            # optional, use conda if problems
# !pip install transformers datasets torch # optional: heavy, only if you want to fine-tune

# Cell 2: Imports & reproducibility
import os
import re
import joblib
import time
import string
import logging
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from typing import List
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
import nltk, ssl

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
logging.basicConfig(level=logging.INFO)

# Cell 3: NLTK setup (use ~/nltk_new_data as you requested)
NLTK_DIR = os.path.expanduser("~/nltk_new_data")
os.makedirs(NLTK_DIR, exist_ok=True)
nltk.data.path.append(NLTK_DIR)

# fix macOS SSL issues if any
try:
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
except Exception:
    pass

for pkg in ["stopwords", "punkt", "averaged_perceptron_tagger", "wordnet"]:
    try:
        nltk.data.find(f"corpora/{pkg}")
    except LookupError:
        print(f"Downloading {pkg} to {NLTK_DIR} ...")
        nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)

from nltk.corpus import stopwords, wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

STOPWORDS = set(stopwords.words("english"))
LEM = WordNetLemmatizer()

# Cell 4: Optional imports with graceful fallback
try:
    from empath import Empath
    LEX = Empath()
    print("Empath loaded")
except Exception:
    LEX = None
    print("Empath not available — continuing without it.")

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
SIA = SentimentIntensityAnalyzer()

# If xgboost/lightgbm are installed we'll include them later
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
    print("xgboost available:", xgb.__version__)
except Exception:
    XGBOOST_AVAILABLE = False

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
    print("lightgbm available:", lgb.__version__)
except Exception:
    LIGHTGBM_AVAILABLE = False

# Cell 5: Load dataset
# Replace with your actual CSV path; expects columns: 'clean_text' or 'text' and 'target' (labels 0-5)
DATA_PATH = "/Users/rishabhkapur/Documents/work/mental-health-analysis/docs/cleaned_data_after_posion.csv"  # change to your file
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"{DATA_PATH} not found — put your dataset here or change DATA_PATH")

df = pd.read_csv(DATA_PATH)
# Inspect columns; adapt if names differ
print("Columns:", df.columns.tolist())
# Assume there's a text column and a label 'target' (0..5)
TEXT_COL = "clean_text" if "clean_text" in df.columns else "text"
LABEL_COL = "target"
if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
    raise ValueError(f"Expected columns {TEXT_COL} and {LABEL_COL} in dataset")

df = df[[TEXT_COL, LABEL_COL]].dropna()
df[TEXT_COL] = df[TEXT_COL].astype(str)
print("Loaded samples:", len(df))
df.head()

# Cell 6: Preprocessing functions (same approach as your pipeline)
def basic_clean(text: str):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"[^a-z\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def simple_sentiment_clean(text: str):
    # minimal cleaning for sentiment models (train sentiment on lightly cleaned text)
    return basic_clean(text)

def advanced_clean(text: str):
    # tokenization + pos lemmatization (like you used)
    text = basic_clean(text)
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    out = []
    for w, tag in tags:
        if w in STOPWORDS:
            continue
        pos = wordnet.VERB if tag.startswith('V') else \
              wordnet.NOUN if tag.startswith('N') else \
              wordnet.ADJ if tag.startswith('J') else wordnet.ADV
        out.append(LEM.lemmatize(w, pos))
    return " ".join(out)

# quick apply (you can precompute to speed up)
tqdm.pandas()
df["text_mh"] = df[TEXT_COL].progress_apply(advanced_clean)
df["text_sent"] = df[TEXT_COL].progress_apply(simple_sentiment_clean)

# Cell 7: Train/test split
X = df["text_mh"].tolist()
X_sent = df["text_sent"].tolist()   # used for sentiment model (if separate)
y = df[LABEL_COL].astype(int).values

X_train, X_test, X_train_sent, X_test_sent, y_train, y_test = train_test_split(
    df["text_mh"].tolist(), df["text_sent"].tolist(), y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print("Train/test sizes:", len(X_train), len(X_test))

# Cell 8: Feature engineering helpers (dense features)
def text_stats_features(texts: List[str]):
    rows = []
    for t in texts:
        words = t.split()
        word_count = len(words)
        avg_len = np.mean([len(w) for w in words]) if words else 0
        punct_ratio = sum(ch in string.punctuation for ch in t) / max(len(t), 1)
        rows.append([word_count, avg_len, punct_ratio])
    return np.array(rows)

def sentiment_features(texts: List[str]):
    rows = []
    for t in texts:
        vs = SIA.polarity_scores(t)
        tb = TextBlob(t).sentiment
        rows.append([vs["compound"], tb.polarity, tb.subjectivity])
    return np.array(rows)

def pos_features(texts: List[str]):
    rows = []
    for t in texts:
        tags = pos_tag(t.split())
        nouns = sum(1 for _, p in tags if p.startswith("N"))
        verbs = sum(1 for _, p in tags if p.startswith("V"))
        adjs = sum(1 for _, p in tags if p.startswith("J"))
        advs = sum(1 for _, p in tags if p.startswith("R"))
        rows.append([nouns, verbs, adjs, advs])
    return np.array(rows)

def empath_features(texts: List[str], categories=40):
    if LEX is None:
        return np.zeros((len(texts), categories))
    cat_names = list(LEX.cats.keys())[:categories]
    out = []
    for t in texts:
        try:
            scores = LEX.analyze(t or "", normalize=True) or {}
            vals = [scores.get(cat, 0.0) for cat in cat_names]
        except Exception:
            vals = [0.0] * len(cat_names)
        out.append(vals)
    return np.array(out)

# Precompute dense features for train/test (for meta-model)
dense_train = np.hstack([
    text_stats_features(X_train),
    sentiment_features(X_train_sent),
    pos_features(X_train),
    empath_features(X_train)
])

dense_test = np.hstack([
    text_stats_features(X_test),
    sentiment_features(X_test_sent),
    pos_features(X_test),
    empath_features(X_test)
])

print("Dense shapes:", dense_train.shape, dense_test.shape)

# Cell 9: Sparse features (TF-IDF) + optional SVD to reduce dimensionality
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,2), sublinear_tf=True)
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print("TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

# Optionally reduce dimensionality in case of heavy models (uncomment if desired)
# svd = TruncatedSVD(n_components=200, random_state=RANDOM_STATE)
# X_train_tfidf = svd.fit_transform(X_train_tfidf)
# X_test_tfidf = svd.transform(X_test_tfidf)

# Save vectorizer
os.makedirs("models/mental_health_v2", exist_ok=True)
joblib.dump(tfidf, "models/mental_health_v2/tfidf_vectorizer.pkl")

# Cell 10: Base LR on TF-IDF (base model used for meta features)
base_lr = LogisticRegression(max_iter=2000, C=5, solver="saga", n_jobs=-1, random_state=RANDOM_STATE)
print("Training base LR on TF-IDF ...")
base_lr.fit(X_train_tfidf, y_train)
print("Base LR score (val):", base_lr.score(X_test_tfidf, y_test))
joblib.dump(base_lr, "models/mental_health_v2/base_lr_tfidf.pkl")

# We'll create meta features by concatenating base_lr.predict_proba(X_tfidf) + dense features

# Cell 11: Prepare meta-features
proba_train = base_lr.predict_proba(X_train_tfidf)
proba_test = base_lr.predict_proba(X_test_tfidf)
X_meta_train = np.hstack([proba_train, dense_train])
X_meta_test = np.hstack([proba_test, dense_test])
print("X_meta shapes:", X_meta_train.shape, X_meta_test.shape)

# Cell 12: Try multiple classical models on meta features (quick CV + grid)
models = {
    "logreg": LogisticRegression(max_iter=2000, C=1, solver="lbfgs", multi_class="auto", random_state=RANDOM_STATE),
    "rf": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
    "gb": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "svc": SVC(probability=True, random_state=RANDOM_STATE),
    "nb": MultinomialNB(),  # note: NB expects non-negative features (meta-features might include negatives)
}

if XGBOOST_AVAILABLE:
    models["xgb"] = xgb.XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=RANDOM_STATE)
if LIGHTGBM_AVAILABLE:
    models["lgbm"] = lgb.LGBMClassifier(random_state=RANDOM_STATE)

# Evaluate each model with 5-fold CV on X_meta_train
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
results = {}
for name, m in models.items():
    try:
        print(f"CV for {name} ...")
        scores = cross_val_score(m, X_meta_train, y_train, cv=cv, scoring="f1_weighted", n_jobs=-1)
        results[name] = (scores.mean(), scores.std())
        print(f"{name}: f1_weighted = {scores.mean():.4f} ± {scores.std():.4f}")
    except Exception as e:
        print(f"Skipping {name} due to error:", e)

# Cell 13: Fit top candidates and evaluate on test set
sorted_models = sorted(results.items(), key=lambda x: x[1][0], reverse=True)
sorted_models

# choose top 3 (or all) to fit and evaluate
fitted_models = {}
for name, (mean_score, std) in sorted_models[:5]:
    m = models[name]
    print("Fitting", name)
    m.fit(X_meta_train, y_train)
    ypred = m.predict(X_meta_test)
    print(name, "accuracy:", accuracy_score(y_test, ypred), "f1_weighted:", f1_score(y_test, ypred, average="weighted"))
    print(classification_report(y_test, ypred))
    fitted_models[name] = m
    joblib.dump(m, f"models/mental_health_v2/{name}_meta.pkl")

# Cell 14: Build a stacking classifier (meta = LogisticRegression) using base_lr probabilities + dense features as input
estimators = []
# choose a subset of fitted base estimators
for nm, m in fitted_models.items():
    estimators.append((nm, m))

if estimators:
    stack = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
        passthrough=False,
        n_jobs=-1
    )
    print("Fitting stacking classifier on meta features ...")
    stack.fit(X_meta_train, y_train)
    yp_stack = stack.predict(X_meta_test)
    print("Stack accuracy:", accuracy_score(y_test, yp_stack))
    print("Stack f1_weighted:", f1_score(y_test, yp_stack, average="weighted"))
    print(classification_report(y_test, yp_stack))
    joblib.dump(stack, "models/mental_health_v2/stack_meta.pkl")
else:
    print("No fitted base estimators to stack.")

# Cell 15: Save final chosen model and artifacts
# Choose the best performing model (evaluate results above and pick)
best_model = None
# heuristic: pick stacking if exists, else the best fitted model by test f1
if "stack" in locals():
    best_model = stack
else:
    # pick by f1 on test
    best_name, best_score = None, -1
    for name, m in fitted_models.items():
        ypred = m.predict(X_meta_test)
        s = f1_score(y_test, ypred, average="weighted")
        if s > best_score:
            best_score = s
            best_name = name
    if best_name:
        best_model = fitted_models[best_name]

if best_model is not None:
    joblib.dump(best_model, "models/mental_health_v2/final_meta_model.pkl")
    print("Saved final_meta_model.pkl")

# Cell 16: OPTIONAL — Use sentence-transformers embeddings + classical model
# This is often a powerful and simpler approach: get dense embeddings and train a light classifier.
try:
    from sentence_transformers import SentenceTransformer
    print("Loading sentence-transformer (may be large) ...")
    model_name = "all-MiniLM-L6-v2"  # small & fast
    sbert = SentenceTransformer(model_name)
    emb_train = sbert.encode(X_train, show_progress_bar=True)
    emb_test = sbert.encode(X_test, show_progress_bar=True)
    print("Emb shapes:", emb_train.shape, emb_test.shape)

    # train a simple LR on embeddings
    emb_clf = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE, n_jobs=-1)
    emb_clf.fit(emb_train, y_train)
    ypred_emb = emb_clf.predict(emb_test)
    print("SBERT+LR f1_weighted:", f1_score(y_test, ypred_emb, average="weighted"))
    joblib.dump(emb_clf, "models/mental_health_v2/sbert_lr.pkl")
except Exception as e:
    print("Sentence-transformers unavailable or failed:", e)

# Cell 17: OPTIONAL — Transformer fine-tuning (guarded)
# WARNING: heavy; only run if you have GPU or accept long CPU training times.
# This section uses the 'transformers' library + datasets Trainer to fine-tune a small model.
#
# If you want this, uncomment and proceed carefully.
#
# from datasets import Dataset
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
#
# model_nm = "distilbert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_nm)
# def tokenize_batch(ex):
#     return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=128)
#
# ds = Dataset.from_pandas(pd.DataFrame({'text': df[TEXT_COL].tolist(), 'label': df[LABEL_COL].tolist()}))
# ds = ds.train_test_split(test_size=0.2, stratify_by_column='label', seed=RANDOM_STATE)
# ds = ds.map(tokenize_batch, batched=True)
# model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=len(df[LABEL_COL].unique()))
# args = TrainingArguments(output_dir="transformer_out", per_device_train_batch_size=8, num_train_epochs=2, evaluation_strategy="epoch")
# trainer = Trainer(model=model, args=args, train_dataset=ds['train'], eval_dataset=ds['test'])
# trainer.train()
# trainer.save_model("models/transformer_finetuned")

# Cell 18: Final evaluation summary
print("Final evaluation summary:")
if best_model is not None:
    ypred = best_model.predict(X_meta_test)
    print("Final model f1_weighted:", f1_score(y_test, ypred, average="weighted"))
    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))
else:
    print("No final model saved.")

# End of notebook


Downloading punkt to /Users/rishabhkapur/nltk_new_data ...
Downloading averaged_perceptron_tagger to /Users/rishabhkapur/nltk_new_data ...
Downloading wordnet to /Users/rishabhkapur/nltk_new_data ...
Empath loaded
lightgbm available: 4.6.0
Columns: ['Unnamed: 0', 'clean_text', 'target']
Loaded samples: 6064


100%|██████████| 6064/6064 [00:14<00:00, 423.20it/s]
100%|██████████| 6064/6064 [00:00<00:00, 37598.95it/s]


Train/test sizes: 4851 1213
Dense shapes: (4851, 50) (1213, 50)
TF-IDF shapes: (4851, 15000) (1213, 15000)
Training base LR on TF-IDF ...
Base LR score (val): 0.8202802967848309
X_meta shapes: (4851, 56) (1213, 56)
CV for logreg ...




logreg: f1_weighted = 0.9893 ± 0.0017
CV for rf ...
rf: f1_weighted = 0.9885 ± 0.0020
CV for gb ...
gb: f1_weighted = 0.9889 ± 0.0014
CV for svc ...
svc: f1_weighted = 0.3299 ± 0.0284
CV for nb ...
Skipping nb due to error: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/mental_health/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/mental_health/lib/python3.10/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/envs/mental_health/lib/python3.10/site-packages/sklearn/naive_b













lgbm: f1_weighted = 0.9882 ± 0.0014
Fitting logreg
logreg accuracy: 0.8153338829348722 f1_weighted: 0.8164442919933993
              precision    recall  f1-score   support

           0       0.91      0.88      0.90       208
           1       0.74      0.76      0.75       193
           2       0.91      0.85      0.88       205
           3       0.72      0.77      0.74       205
           4       0.84      0.80      0.82       204
           5       0.78      0.82      0.80       198

    accuracy                           0.82      1213
   macro avg       0.82      0.81      0.82      1213
weighted avg       0.82      0.82      0.82      1213

Fitting gb
gb accuracy: 0.8178070898598516 f1_weighted: 0.8195156695952286
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       208
           1       0.74      0.78      0.76       193
           2       0.92      0.85      0.88       205
           3       0.71      0.79      0.75   



svc accuracy: 0.380873866446826 f1_weighted: 0.35133999316821846
              precision    recall  f1-score   support

           0       0.31      0.38      0.34       208
           1       0.37      0.23      0.28       193
           2       0.31      0.34      0.32       205
           3       0.29      0.16      0.21       205
           4       0.40      0.25      0.31       204
           5       0.49      0.93      0.65       198

    accuracy                           0.38      1213
   macro avg       0.36      0.38      0.35      1213
weighted avg       0.36      0.38      0.35      1213

Fitting stacking classifier on meta features ...




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10183
[LightGBM] [Info] Number of data points in the train set: 4851, number of used features: 56
[LightGBM] [Info] Start training from score -1.760707
[LightGBM] [Info] Start training from score -1.834077
[LightGBM] [Info] Start training from score -1.780078
[LightGBM] [Info] Start training from score -1.780078
[LightGBM] [Info] Start training from score -1.783752
[LightGBM] [Info] Start training from score -1.813642




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9777
[LightGBM] [Info] Number of data points in the train set: 3881, number of used features: 56
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019911 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9638
[LightGBM] [Info] Total Bins 9797
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9











Stack accuracy: 0.8211046990931574
Stack f1_weighted: 0.8219887503616159
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       208
           1       0.74      0.76      0.75       193
           2       0.92      0.86      0.89       205
           3       0.75      0.76      0.75       205
           4       0.85      0.79      0.82       204
           5       0.77      0.86      0.81       198

    accuracy                           0.82      1213
   macro avg       0.82      0.82      0.82      1213
weighted avg       0.82      0.82      0.82      1213

Saved final_meta_model.pkl


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading sentence-transformer (may be large) ...


Batches: 100%|██████████| 152/152 [00:14<00:00, 10.76it/s]
Batches: 100%|██████████| 38/38 [00:02<00:00, 17.65it/s]


Emb shapes: (4851, 384) (1213, 384)
SBERT+LR f1_weighted: 0.7464097177230942
Final evaluation summary:




Final model f1_weighted: 0.8219887503616159
[[185   3   0   9   2   9]
 [  5 147  10  15  10   6]
 [  0  14 176   3   4   8]
 [  4  14   1 155  10  21]
 [  8  10   4  12 162   8]
 [  1  11   0  13   2 171]]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       208
           1       0.74      0.76      0.75       193
           2       0.92      0.86      0.89       205
           3       0.75      0.76      0.75       205
           4       0.85      0.79      0.82       204
           5       0.77      0.86      0.81       198

    accuracy                           0.82      1213
   macro avg       0.82      0.82      0.82      1213
weighted avg       0.82      0.82      0.82      1213



In [5]:
# =====================================
# 🧠 Mental Health Classification v3
# Hybrid NLP Model: TF-IDF + Dense Features + Transformer Embeddings
# =====================================

# --- Imports ---
import os
import re
import string
import joblib
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# optional libraries
try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except:
    LIGHTGBM_AVAILABLE = False

try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except:
    XGBOOST_AVAILABLE = False

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except:
    CATBOOST_AVAILABLE = False

from sentence_transformers import SentenceTransformer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk, ssl
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# =====================================
# 1️⃣ NLTK setup
# =====================================
NLTK_DIR = os.path.expanduser("~/nltk_new_data")
os.makedirs(NLTK_DIR, exist_ok=True)
nltk.data.path.append(NLTK_DIR)

try:
    ssl._create_default_https_context = ssl._create_unverified_context
except Exception:
    pass

for pkg in ["stopwords", "punkt", "averaged_perceptron_tagger", "wordnet"]:
    try:
        nltk.data.find(f"corpora/{pkg}")
    except LookupError:
        nltk.download(pkg, download_dir=NLTK_DIR, quiet=True)

STOPWORDS = set(stopwords.words("english"))
LEM = WordNetLemmatizer()
SIA = SentimentIntensityAnalyzer()

# =====================================
# 2️⃣ Load Dataset
# =====================================
DATA_PATH = "/Users/rishabhkapur/Documents/work/mental-health-analysis/docs/cleaned_data_after_posion.csv"  # path to your final dataset
df = pd.read_csv(DATA_PATH)
df = df[['clean_text', 'target']].dropna()
print("Samples:", len(df))

# =====================================
# 3️⃣ Text Cleaning
# =====================================
def basic_clean(text: str):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"[^a-z\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def advanced_clean(text: str):
    text = basic_clean(text)
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    out = []
    for w, tag in tags:
        if w in STOPWORDS:
            continue
        pos = wordnet.VERB if tag.startswith('V') else \
              wordnet.NOUN if tag.startswith('N') else \
              wordnet.ADJ if tag.startswith('J') else wordnet.ADV
        out.append(LEM.lemmatize(w, pos))
    return " ".join(out)

tqdm.pandas()
df['clean_text'] = df['clean_text'].progress_apply(advanced_clean)

# =====================================
# 4️⃣ Feature Engineering Helpers
# =====================================
def text_stats_features(texts):
    rows = []
    for t in texts:
        words = t.split()
        word_count = len(words)
        avg_len = np.mean([len(w) for w in words]) if words else 0
        punct_ratio = sum(ch in string.punctuation for ch in t) / max(len(t), 1)
        rows.append([word_count, avg_len, punct_ratio])
    return np.array(rows)

def sentiment_features(texts):
    rows = []
    for t in texts:
        vs = SIA.polarity_scores(t)
        tb = TextBlob(t).sentiment
        rows.append([vs["compound"], tb.polarity, tb.subjectivity])
    return np.array(rows)

def pos_features(texts):
    rows = []
    for t in texts:
        tags = pos_tag(t.split())
        nouns = sum(1 for _, p in tags if p.startswith("N"))
        verbs = sum(1 for _, p in tags if p.startswith("V"))
        adjs = sum(1 for _, p in tags if p.startswith("J"))
        advs = sum(1 for _, p in tags if p.startswith("R"))
        rows.append([nouns, verbs, adjs, advs])
    return np.array(rows)

# =====================================
# 5️⃣ Split Data
# =====================================
X = df['clean_text'].values
y = df['target'].astype(int).values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Train/Test:", len(X_train), len(X_test))

# =====================================
# 6️⃣ TF-IDF Features
# =====================================
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), sublinear_tf=True)
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print("TF-IDF shape:", X_train_tfidf.shape)

# =====================================
# 7️⃣ Dense Features
# =====================================
dense_train = np.hstack([
    text_stats_features(X_train),
    sentiment_features(X_train),
    pos_features(X_train)
])
dense_test = np.hstack([
    text_stats_features(X_test),
    sentiment_features(X_test),
    pos_features(X_test)
])
print("Dense shape:", dense_train.shape)

# scale numeric features only
scaler = StandardScaler()
dense_train_scaled = scaler.fit_transform(dense_train)
dense_test_scaled = scaler.transform(dense_test)

# =====================================
# 8️⃣ Transformer Embeddings (SBERT)
# =====================================
print("Encoding transformer embeddings...")
model_name = "all-MiniLM-L6-v2"
sbert = SentenceTransformer(model_name)
emb_train = sbert.encode(X_train, show_progress_bar=True)
emb_test = sbert.encode(X_test, show_progress_bar=True)
print("SBERT Emb shape:", emb_train.shape)

# =====================================
# 9️⃣ Concatenate All Features
# =====================================
from scipy.sparse import hstack, csr_matrix

# stack sparse (TF-IDF) and dense parts
X_train_full = hstack([
    X_train_tfidf,
    csr_matrix(dense_train_scaled),
    csr_matrix(emb_train)
])
X_test_full = hstack([
    X_test_tfidf,
    csr_matrix(dense_test_scaled),
    csr_matrix(emb_test)
])

print("Final combined shape:", X_train_full.shape)

# =====================================
# 🔟 Try Multiple Models
# =====================================
models = {
    "LogisticRegression": LogisticRegression(max_iter=3000, solver='saga', n_jobs=-1, C=2),
    "RandomForest": RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "SVC": SVC(kernel='linear', probability=True, random_state=42)
}

if LIGHTGBM_AVAILABLE:
    models["LightGBM"] = LGBMClassifier(n_estimators=300, random_state=42)
if XGBOOST_AVAILABLE:
    models["XGBoost"] = XGBClassifier(n_estimators=300, use_label_encoder=False, eval_metric="mlogloss", random_state=42)
if CATBOOST_AVAILABLE:
    models["CatBoost"] = CatBoostClassifier(verbose=0, iterations=300, random_state=42)

results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"Training {name} ...")
    scores = cross_val_score(model, X_train_full, y_train, cv=cv, scoring="f1_weighted", n_jobs=-1)
    print(f"F1-weighted: {scores.mean():.4f} ± {scores.std():.4f}")
    results[name] = (scores.mean(), scores.std())


# =====================================
# 11️⃣ Fit Best Models & Evaluate
# =====================================
sorted_models = sorted(results.items(), key=lambda x: x[1][0], reverse=True)
print("\nTop Models:")
for name, (mean, std) in sorted_models[:5]:
    print(f"{name}: {mean:.4f}")

# Pick top 3 models for stacking
top_models = [m[0] for m in sorted_models[:3]]
print("Stacking:", top_models)

# Fit base models
fitted_models = []
for name in top_models:
    m = models[name]
    m.fit(X_train_full, y_train)
    fitted_models.append((name, m))
    preds = m.predict(X_test_full)
    print(f"{name} Test F1:", f1_score(y_test, preds, average='weighted'))

# =====================================
# 12️⃣ Stacking Classifier
# =====================================
stack = StackingClassifier(
    estimators=fitted_models,
    final_estimator=LogisticRegression(max_iter=2000, random_state=42),
    n_jobs=-1
)
print("Training stacking model ...")
stack.fit(X_train_full, y_train)
y_pred = stack.predict(X_test_full)
print("\nStacked Model F1:", f1_score(y_test, y_pred, average="weighted"))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# =====================================
# 13️⃣ Save Models
# =====================================
os.makedirs("models/mental_health_v3", exist_ok=True)
joblib.dump(tfidf, "models/mental_health_v3/tfidf_vectorizer.pkl")
joblib.dump(scaler, "models/mental_health_v3/scaler.pkl")
joblib.dump(stack, "models/mental_health_v3/final_hybrid_model.pkl")
print("✅ Saved final hybrid model!")

# =====================================
# 14️⃣ Summary
# =====================================
best_name = sorted_models[0][0]
print(f"\n🏆 Best Base Model: {best_name}")
print(f"Stacked Model (Hybrid) F1: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("✅ Training complete!")


Samples: 6064


100%|██████████| 6064/6064 [00:12<00:00, 466.70it/s]


Train/Test: 4851 1213
TF-IDF shape: (4851, 20000)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Dense shape: (4851, 10)
Encoding transformer embeddings...


Batches: 100%|██████████| 152/152 [00:07<00:00, 19.14it/s]
Batches: 100%|██████████| 38/38 [00:01<00:00, 20.08it/s]


SBERT Emb shape: (4851, 384)
Final combined shape: (4851, 20394)
Training LogisticRegression ...
F1-weighted: 0.7952 ± 0.0149
Training RandomForest ...
F1-weighted: 0.7805 ± 0.0159
Training GradientBoosting ...
F1-weighted: 0.7853 ± 0.0137
Training SVC ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)


F1-weighted: 0.8012 ± 0.0143
Training LightGBM ...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.117161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 142782
[LightGBM] [Info] Number of data points in the train set: 3881, number of used features: 1878
[LightGBM] [Info] Start training from score -1.761058
[LightGBM] [Info] Start training from score -1.834129
[LightGBM] [Info] Start training from score -1.780741
[LightGBM] [Info] Start training from score -1.779213
[LightGBM] [Info] Start training from score -1.783804
[LightGBM] [Info] Start training from score -1.813378
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.132406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.116994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM