In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import re
import nltk


In [None]:
from sklearn.metrics import classification_report,recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/mental_disorders_reddit.csv")

data.head()

Unnamed: 0,title,selftext,created_utc,over_18,subreddit
0,Life is so pointless without others,Does anyone else think the most important part...,1650356960,False,BPD
1,Cold rage?,Hello fellow friends 😄\n\nI'm on the BPD spect...,1650356660,False,BPD
2,I don’t know who I am,My [F20] bf [M20] told me today (after I said ...,1650355379,False,BPD
3,HELP! Opinions! Advice!,"Okay, I’m about to open up about many things I...",1650353430,False,BPD
4,help,[removed],1650350907,False,BPD


In [None]:
print(data.shape)
print(data['subreddit'].value_counts())

(701787, 5)
subreddit
BPD              241116
Anxiety          173990
depression       156972
mentalillness     53232
bipolar           51112
schizophrenia     25365
Name: count, dtype: int64


In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/text_mining/combined/mental_disorders_reddit.csv")

# Step 1: Keep only required columns
df = df[['title', 'selftext', 'subreddit']]

# Step 2: Combine 'title' and 'selftext' into 'text'
df['text'] = df['title'].fillna('') + ' ' + df['selftext'].fillna('')

# Step 3: Rename 'subreddit' to 'label'
df.rename(columns={'subreddit': 'label'}, inplace=True)

# Step 4: Remove rows where label == 'mentalillness'
df = df[df['label'] != 'mentalillness']

# Step 5: Keep only 10,000 samples per class
df_balanced = (
    df.groupby('label', group_keys=False)
    .apply(lambda x: x.sample(n=10000, random_state=42) if len(x) >= 10000 else x)
    .reset_index(drop=True)
)

# Step 6: Keep only 'text' and 'label' columns
df_balanced = df_balanced[['text', 'label']]

# Step 7: Save to CSV
output_path = "/content/drive/MyDrive/text_mining/combined/reddit_processed_balanced.csv"
df_balanced.to_csv(output_path, index=False)

print("Processed dataset saved to:", output_path)


  .apply(lambda x: x.sample(n=10000, random_state=42) if len(x) >= 10000 else x)


Processed dataset saved to: /content/drive/MyDrive/text_mining/combined/reddit_processed_balanced.csv


In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/reddit_processed_balanced.csv")

data.head()

Unnamed: 0,text,label
0,"Anyone else I personally have always, even as ...",Anxiety
1,Anxiety that I might be having a pulmonary emb...,Anxiety
2,Why are my feet sweating but are cold ?!!,Anxiety
3,Not able to eat outside Whenever I go out with...,Anxiety
4,How can I learn to live in the moment? It's cr...,Anxiety


In [None]:
print(data.shape)
print(data['label'].value_counts())

(50000, 2)
label
Anxiety          10000
BPD              10000
bipolar          10000
depression       10000
schizophrenia    10000
Name: count, dtype: int64


In [None]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning
data['clean_text_final'] = data['text'].apply(clean_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
data['clean_text_final'] = data['text'].apply(clean_text)

# Run all configurations
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BASIC | None | SVM ---

               precision    recall  f1-score   support

      Anxiety       0.82      0.82      0.82      2043
          BPD       0.77      0.73      0.75      1993
      bipolar       0.74      0.67      0.70      2016
   depression       0.66      0.72      0.69      1998
schizophrenia       0.71      0.75      0.73      1950

     accuracy                           0.74     10000
    macro avg       0.74      0.74      0.74     10000
 weighted avg       0.74      0.74      0.74     10000

Top 20 impactful words:
bpd: 38.1765
bipolar: 32.1850
schizophrenia: 23.6127
anxiety: 22.3500
fp: 19.8338
manic: 17.6655
schizophrenic: 17.6194
mania: 17.3558
dbt: 15.9098
schizoaffective: 15.7594
hypomania: 15.2374
hypomanic: 14.5350
depression: 14.4258
borderline: 12.7985
bp: 12.6854
anxious: 12.3248
delusion: 11.3779
dae: 11.1725
lamictal: 10.4671
sz: 9.8823

--- BASIC | None | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety      

mutual info + tf-idf

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
data['clean_text_final'] = data['text'].apply(clean_text)

# Run all configurations
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_mutual_info=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=True)



--- BASIC | None | SVM ---

               precision    recall  f1-score   support

      Anxiety       0.82      0.82      0.82      2043
          BPD       0.77      0.73      0.75      1993
      bipolar       0.74      0.67      0.70      2016
   depression       0.66      0.72      0.69      1998
schizophrenia       0.71      0.75      0.73      1950

     accuracy                           0.74     10000
    macro avg       0.74      0.74      0.74     10000
 weighted avg       0.74      0.74      0.74     10000

Top 20 impactful words:
bpd: 38.1765
bipolar: 32.1850
schizophrenia: 23.6127
anxiety: 22.3500
fp: 19.8338
manic: 17.6655
schizophrenic: 17.6194
mania: 17.3558
dbt: 15.9098
schizoaffective: 15.7594
hypomania: 15.2374
hypomanic: 14.5350
depression: 14.4258
borderline: 12.7985
bp: 12.6854
anxious: 12.3248
delusion: 11.3779
dae: 11.1725
lamictal: 10.4671
sz: 9.8823

--- BASIC | None | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety      

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m



--- NGRAM | Mutual Info | SVM ---

               precision    recall  f1-score   support

      Anxiety       0.81      0.81      0.81      2043
          BPD       0.76      0.72      0.74      1993
      bipolar       0.75      0.65      0.70      2016
   depression       0.66      0.73      0.69      1998
schizophrenia       0.69      0.76      0.72      1950

     accuracy                           0.73     10000
    macro avg       0.74      0.73      0.73     10000
 weighted avg       0.74      0.73      0.73     10000

Top 20 impactful words:
bpd: 38.4849
bipolar: 31.6136
anxiety: 24.6304
schizophrenia: 24.4322
fp: 21.2985
schizophrenic: 18.6157
manic: 18.0760
mania: 18.0538
dbt: 15.9910
hypomanic: 15.3693
hypomania: 15.2953
depression: 13.9498
bp: 13.6325
borderline: 13.0891
delusion: 12.5397
anxious: 12.3075
dae: 11.1641
lamictal: 11.0685
splitting: 10.3037
lithium: 9.5453

--- NGRAM | Mutual Info | AdaBoost ---

               precision    recall  f1-score   support

      

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BoW | BASIC | None | SVM ---





               precision    recall  f1-score   support

      Anxiety       0.80      0.77      0.79      2043
          BPD       0.76      0.70      0.73      1993
      bipolar       0.72      0.65      0.68      2016
   depression       0.62      0.66      0.64      1998
schizophrenia       0.66      0.76      0.71      1950

     accuracy                           0.71     10000
    macro avg       0.71      0.71      0.71     10000
 weighted avg       0.71      0.71      0.71     10000

Top 20 impactful words:
bpd: 8.9259
fp: 8.7382
schizoaffective: 8.4600
hypomanic: 7.6073
dbt: 7.4299
hypomania: 7.3873
schizophrenic: 7.3063
bipolar: 7.1725
sz: 7.1275
schizo: 6.9257
schizophrenia: 6.8997
bp: 6.7141
hypo: 6.3740
prodromal: 5.6203
bd: 5.4132
mania: 5.2856
pwbpd: 5.0887
manic: 4.9914
bpds: 4.9070
borderline: 4.8091

--- BoW | BASIC | None | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety       0.77      0.67      0.72      2043
          BPD      



               precision    recall  f1-score   support

      Anxiety       0.78      0.76      0.77      2043
          BPD       0.75      0.69      0.72      1993
      bipolar       0.69      0.64      0.66      2016
   depression       0.62      0.63      0.62      1998
schizophrenia       0.66      0.76      0.71      1950

     accuracy                           0.70     10000
    macro avg       0.70      0.70      0.70     10000
 weighted avg       0.70      0.70      0.70     10000

Top 20 impactful words:
fp: 10.5944
bpd: 10.4105
schizoaffective: 9.0633
dbt: 8.9278
hypomanic: 8.3105
schizophrenic: 8.2606
bp: 8.1654
schizophrenia: 7.8478
bipolar: 7.5605
hypomania: 7.3233
im bipolar: 6.9787
hypo: 6.5352
negative symptom: 6.4392
depakote: 5.9596
mania: 5.9514
mixed episode: 5.7095
borderline: 5.5091
manic: 5.4538
criterion: 5.2318
clozapine: 5.1664

--- BoW | NGRAM | None | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety       0.77      0.67 

mutual info + Bow

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Mutual Information selection (optional)
    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_mutual_info=False)

# Run BoW + n-gram
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=True)



--- BoW | BASIC | None | SVM ---





               precision    recall  f1-score   support

      Anxiety       0.80      0.77      0.79      2043
          BPD       0.76      0.70      0.73      1993
      bipolar       0.72      0.65      0.68      2016
   depression       0.62      0.66      0.64      1998
schizophrenia       0.66      0.76      0.71      1950

     accuracy                           0.71     10000
    macro avg       0.71      0.71      0.71     10000
 weighted avg       0.71      0.71      0.71     10000

Top 20 impactful words:
bpd: 8.9259
fp: 8.7382
schizoaffective: 8.4600
hypomanic: 7.6073
dbt: 7.4299
hypomania: 7.3873
schizophrenic: 7.3063
bipolar: 7.1725
sz: 7.1275
schizo: 6.9257
schizophrenia: 6.8997
bp: 6.7141
hypo: 6.3740
prodromal: 5.6203
bd: 5.4132
mania: 5.2856
pwbpd: 5.0887
manic: 4.9914
bpds: 4.9070
borderline: 4.8091

--- BoW | BASIC | None | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety       0.77      0.67      0.72      2043
          BPD      



               precision    recall  f1-score   support

      Anxiety       0.78      0.76      0.77      2043
          BPD       0.75      0.69      0.72      1993
      bipolar       0.69      0.64      0.66      2016
   depression       0.62      0.63      0.62      1998
schizophrenia       0.66      0.76      0.71      1950

     accuracy                           0.70     10000
    macro avg       0.70      0.70      0.70     10000
 weighted avg       0.70      0.70      0.70     10000

Top 20 impactful words:
fp: 10.5944
bpd: 10.4105
schizoaffective: 9.0633
dbt: 8.9278
hypomanic: 8.3105
schizophrenic: 8.2606
bp: 8.1654
schizophrenia: 7.8478
bipolar: 7.5605
hypomania: 7.3233
im bipolar: 6.9787
hypo: 6.5352
negative symptom: 6.4392
depakote: 5.9596
mania: 5.9514
mixed episode: 5.7095
borderline: 5.5091
manic: 5.4538
criterion: 5.2318
clozapine: 5.1664

--- BoW | NGRAM | None | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety       0.77      0.67 



               precision    recall  f1-score   support

      Anxiety       0.82      0.79      0.81      2043
          BPD       0.79      0.71      0.75      1993
      bipolar       0.78      0.63      0.70      2016
   depression       0.65      0.70      0.67      1998
schizophrenia       0.64      0.81      0.71      1950

     accuracy                           0.73     10000
    macro avg       0.74      0.73      0.73     10000
 weighted avg       0.74      0.73      0.73     10000

Top 20 impactful words:
hypomanic: 6.8101
bpd: 6.4049
fp: 6.1558
schizoaffective: 5.6897
hypo: 5.6525
hypomania: 5.5440
schizophrenic: 5.0756
mixed episode: 5.0177
bipolar: 4.9135
im bipolar: 4.8971
dbt: 4.8352
npd: 4.7857
schizophrenia: 4.7000
bp: 4.6585
negative symptom: 4.3282
mania: 3.9762
depakote: 3.6109
dae: 3.3770
manic: 3.3664
borderline personality: 3.3574

--- BoW | NGRAM | Mutual Info | AdaBoost ---

               precision    recall  f1-score   support

      Anxiety       0.77      

In [None]:
from sklearn.naive_bayes import MultinomialNB

# ----------------------------
# Use only Naïve Bayes
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate model
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Naïve Bayes does not offer meaningful feature importances like coef_ or feature_importances_
        if hasattr(model, 'feature_log_prob_'):
            importances = np.max(model.feature_log_prob_, axis=0)
            words_weights = list(zip(feature_names, importances))
            sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

            print("Top 20 impactful words:")
            for word, score in sorted_words[:20]:
                print(f"{word}: {score:.4f}")
        else:
            print("No importances available for this model.")

# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BoW | BASIC | None | NaiveBayes ---

               precision    recall  f1-score   support

      Anxiety       0.81      0.72      0.76      2043
          BPD       0.70      0.70      0.70      1993
      bipolar       0.68      0.64      0.66      2016
   depression       0.62      0.68      0.65      1998
schizophrenia       0.67      0.73      0.70      1950

     accuracy                           0.69     10000
    macro avg       0.70      0.69      0.69     10000
 weighted avg       0.70      0.69      0.69     10000

Top 20 impactful words:
im: -3.6137
anxiety: -3.9607
feel: -4.0810
like: -4.0993
dont: -4.1357
removed: -4.4722
life: -4.5322
know: -4.5418
want: -4.5637
get: -4.5721
bipolar: -4.6706
ive: -4.6709
wall: -4.6817
schizophrenia: -4.7554
bpd: -4.7664
time: -4.7940
even: -4.8124
cant: -4.8467
really: -4.9436
thing: -4.9579

--- BoW | NGRAM | None | NaiveBayes ---

               precision    recall  f1-score   support

      Anxiety       0.80      0.74      0.

In [None]:
# ----------------------------
# Use only Naïve Bayes
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main TF-IDF Pipeline
# ----------------------------
def run_tfidf_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate model
    for name, model in models.items():
        print(f"\n--- TF-IDF | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance from feature_log_prob_
        if hasattr(model, 'feature_log_prob_'):
            importances = np.max(model.feature_log_prob_, axis=0)
            words_weights = list(zip(feature_names, importances))
            sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

            print("Top 20 impactful words:")
            for word, score in sorted_words[:20]:
                print(f"{word}: {score:.4f}")
        else:
            print("No importances available for this model.")

# Assume `subset_data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run TF-IDF-only
run_tfidf_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)

# Run TF-IDF + n-gram
run_tfidf_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run TF-IDF + n-gram + Chi-2
run_tfidf_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)



--- TF-IDF | BASIC | None | NaiveBayes ---

               precision    recall  f1-score   support

      Anxiety       0.77      0.71      0.74      2043
          BPD       0.66      0.72      0.69      1993
      bipolar       0.69      0.61      0.64      2016
   depression       0.61      0.72      0.66      1998
schizophrenia       0.74      0.68      0.70      1950

     accuracy                           0.69     10000
    macro avg       0.69      0.69      0.69     10000
 weighted avg       0.69      0.69      0.69     10000

Top 20 impactful words:
removed: -3.5335
anxiety: -4.2971
wall: -4.3911
schizophrenia: -4.4656
im: -4.5208
bipolar: -4.6240
bpd: -4.7626
feel: -4.8180
dont: -4.8313
like: -4.9303
life: -5.0134
want: -5.0510
episode: -5.1278
manic: -5.1291
get: -5.2412
know: -5.2436
depression: -5.2506
selfie: -5.2801
med: -5.2992
voice: -5.3224

--- TF-IDF | NGRAM | None | NaiveBayes ---

               precision    recall  f1-score   support

      Anxiety       0.77  