In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import re
import nltk


In [2]:
from sklearn.metrics import classification_report,recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/combined_balanced_cleaned.csv")

data.head()

Unnamed: 0,statement,status,text
0,"I try so hard to be happy. For me, for my fami...",Depression,try hard happy family friends cannot jealous f...
1,If you do not love someone or cannot promise t...,Depression,love someone cannot promise love die get marri...
2,I have suffered from depression and anxiety fo...,Depression,suffered depression anxiety many years resulte...
3,i have been on prozac since maybe september la...,Depression,prozac since maybe september last year prozac ...
4,I ask myself daily will my extra efforts towar...,Depression,ask daily extra efforts towards better life im...


In [5]:
print(data.shape)
print(data['status'].value_counts())

(31956, 3)
status
Depression    10652
Normal        10652
Suicidal      10652
Name: count, dtype: int64


In [6]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

subset_data = (
    data.groupby('status', group_keys=False)
    .apply(lambda x: x.sample(n=10000, random_state=42))
    .reset_index(drop=True)
)

# Load stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
  .apply(lambda x: x.sample(n=10000, random_state=42))


In [7]:
print(subset_data.shape)
print(subset_data['status'].value_counts())

(30000, 4)
status
Depression    10000
Normal        10000
Suicidal      10000
Name: count, dtype: int64


In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run all configurations
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_chi2=False)
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=False)
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=True)
#############################################


--- BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.73      0.68      0.70      2002
      Normal       0.89      0.95      0.92      1996
    Suicidal       0.70      0.71      0.70      2002

    accuracy                           0.78      6000
   macro avg       0.77      0.78      0.78      6000
weighted avg       0.77      0.78      0.77      6000

Top 20 impactful words:
depression: 16.8994
suicide: 9.6104
wa: 9.0770
suicidal: 8.8724
depressed: 8.3585
od: 6.0645
ha: 5.6992
pression: 5.6467
url: 5.4629
die: 5.4136
le: 5.3447
depressive: 5.3267
ptsd: 5.3030
anymore: 5.1619
kill: 5.1436
doe: 5.0631
consent: 4.8811
killing: 4.7875
awareness: 4.7872
raped: 4.6861

--- BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.68      0.50      0.57      2002
      Normal       0.69      0.88      0.77      1996
    Suicidal       0.59      0.58      0.58      2002

    accuracy         

In [8]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
  "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [9]:
# Clean the data
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run all configurations
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_chi2=False)
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=False)
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=True)
*************************************************************************************************


--- BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.68      0.68      0.68      2002
      Normal       0.92      0.77      0.84      1996
    Suicidal       0.67      0.78      0.72      2002

    accuracy                           0.74      6000
   macro avg       0.75      0.74      0.74      6000
weighted avg       0.75      0.74      0.74      6000

No importances available for this model.

--- NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.68      0.67      0.68      2002
      Normal       0.90      0.81      0.85      1996
    Suicidal       0.68      0.76      0.72      2002

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

No importances available for this model.

--- NGRAM | Chi-2 | NaiveBayes ---

              precision    recall  f1-score   s

mutual information + tf-idf

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run all configurations
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_mutual_info=False)
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_mutual_info=False)
run_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_mutual_info=True)
***************************************************************************************************


--- BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.73      0.68      0.70      2002
      Normal       0.89      0.95      0.92      1996
    Suicidal       0.70      0.71      0.70      2002

    accuracy                           0.78      6000
   macro avg       0.77      0.78      0.78      6000
weighted avg       0.77      0.78      0.77      6000

Top 20 impactful words:
depression: 16.8994
suicide: 9.6104
wa: 9.0770
suicidal: 8.8724
depressed: 8.3585
od: 6.0645
ha: 5.6992
pression: 5.6467
url: 5.4629
die: 5.4136
le: 5.3447
depressive: 5.3267
ptsd: 5.3030
anymore: 5.1619
kill: 5.1436
doe: 5.0631
consent: 4.8811
killing: 4.7875
awareness: 4.7872
raped: 4.6861

--- BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.68      0.50      0.57      2002
      Normal       0.69      0.88      0.77      1996
    Suicidal       0.59      0.58      0.58      2002

    accuracy         

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m



--- NGRAM | Mutual Info | SVM ---

              precision    recall  f1-score   support

  Depression       0.74      0.66      0.70      2002
      Normal       0.87      0.95      0.91      1996
    Suicidal       0.70      0.72      0.71      2002

    accuracy                           0.77      6000
   macro avg       0.77      0.77      0.77      6000
weighted avg       0.77      0.77      0.77      6000

Top 20 impactful words:
depression: 17.7979
suicide: 10.4297
depressed: 9.5297
suicidal: 9.4918
wa: 9.1301
le: 7.8165
kill: 7.2634
ha: 6.1913
http co: 5.9786
depressive: 5.9783
die: 5.8016
killing: 5.5306
ptsd: 5.0961
doe: 5.0798
gun: 4.8949
anymore: 4.8910
dead: 4.8827
wasnt: 4.7264
tired living: 4.6733
goodbye: 4.6582

--- NGRAM | Mutual Info | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.68      0.50      0.58      2002
      Normal       0.69      0.88      0.77      1996
    Suicidal       0.59      0.58      0.59      2002

  

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=True)
#######################################################################


--- BoW | BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.70      0.64      0.67      2002
      Normal       0.89      0.94      0.91      1996
    Suicidal       0.67      0.69      0.68      2002

    accuracy                           0.76      6000
   macro avg       0.75      0.76      0.75      6000
weighted avg       0.75      0.76      0.75      6000

Top 20 impactful words:
pression: 5.4079
hearted: 4.7267
url: 4.1849
depression: 4.0513
com: 4.0144
surrounding: 3.9656
consent: 3.9644
awareness: 3.8522
policy: 3.7518
twisted: 3.7357
ordinary: 3.6288
od: 3.5130
chunk: 3.4933
console: 3.4908
privileged: 3.4611
internally: 3.3982
le: 3.3695
heroin: 3.2618
inconsiderate: 3.1772
fulfillment: 3.1606

--- BoW | BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.64      0.58      0.61      2002
      Normal       0.68      0.88      0.77      1996
    Suicidal       0.62      0.5

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [12]:
# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_chi2=True)
#*****************************************************************************************************************


--- BoW | BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.69      0.66      0.68      2002
      Normal       0.91      0.79      0.85      1996
    Suicidal       0.67      0.79      0.72      2002

    accuracy                           0.75      6000
   macro avg       0.76      0.75      0.75      6000
weighted avg       0.76      0.75      0.75      6000

No importances available for this model.

--- BoW | NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.69      0.69      0.69      2002
      Normal       0.90      0.82      0.86      1996
    Suicidal       0.69      0.75      0.72      2002

    accuracy                           0.75      6000
   macro avg       0.76      0.75      0.75      6000
weighted avg       0.76      0.75      0.75      6000

No importances available for this model.

--- BoW | NGRAM | Chi-2 | NaiveBayes ---

              precision    re

mutual information + BoW

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_mutual_info=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_mutual_info=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_mutual_info=True)
#################################################################################


--- BoW | BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.70      0.64      0.67      2002
      Normal       0.89      0.94      0.91      1996
    Suicidal       0.67      0.69      0.68      2002

    accuracy                           0.76      6000
   macro avg       0.75      0.76      0.75      6000
weighted avg       0.75      0.76      0.75      6000

Top 20 impactful words:
pression: 5.4079
hearted: 4.7267
url: 4.1849
depression: 4.0513
com: 4.0144
surrounding: 3.9656
consent: 3.9644
awareness: 3.8522
policy: 3.7518
twisted: 3.7357
ordinary: 3.6288
od: 3.5130
chunk: 3.4933
console: 3.4908
privileged: 3.4611
internally: 3.3982
le: 3.3695
heroin: 3.2618
inconsiderate: 3.1772
fulfillment: 3.1606

--- BoW | BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.64      0.58      0.61      2002
      Normal       0.68      0.88      0.77      1996
    Suicidal       0.62      0.5

In [13]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [14]:
# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='basic', apply_mutual_info=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_mutual_info=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['status'], vectorizer_type='ngram', apply_mutual_info=True)
#*****************************************************************************************************************


--- BoW | BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.69      0.66      0.68      2002
      Normal       0.91      0.79      0.85      1996
    Suicidal       0.67      0.79      0.72      2002

    accuracy                           0.75      6000
   macro avg       0.76      0.75      0.75      6000
weighted avg       0.76      0.75      0.75      6000

No importances available for this model.

--- BoW | NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.69      0.69      0.69      2002
      Normal       0.90      0.82      0.86      1996
    Suicidal       0.69      0.75      0.72      2002

    accuracy                           0.75      6000
   macro avg       0.76      0.75      0.75      6000
weighted avg       0.76      0.75      0.75      6000

No importances available for this model.

--- BoW | NGRAM | Mutual Info | NaiveBayes ---

              precision

In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/Stress.csv")

data.head()

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.8,1535935605
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.6,1516429555
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.8,1539809005


In [None]:
import pandas as pd

# Step 2: Load the dataset
stress_path = "/content/drive/MyDrive/text_mining/combined/Stress.csv"
stress_data = pd.read_csv(stress_path)

# Step 3: Filter only anxiety posts
anxiety_data = stress_data[stress_data['subreddit'] == 'anxiety']

# Step 4: Keep only the relevant columns and rename
anxiety_data = anxiety_data[['text', 'subreddit']]
anxiety_data.rename(columns={'subreddit': 'label'}, inplace=True)

# Step 6: Save to CSV
output_path = "/content/drive/MyDrive/text_mining/combined/anxiety.csv"
anxiety_data.to_csv(output_path, index=False)



In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/anxiety.csv")
data.head()

Unnamed: 0,text,label
0,It cleared up and I was okay but. On Monday ...,anxiety
1,Next week I’ll be flying for our family vacati...,anxiety
2,Everything sets me off and I'm almost having a...,anxiety
3,I’ve been taking 12.5 mgs zoloft for about 6 m...,anxiety
4,These past couple of months have been the wors...,anxiety


In [None]:
data.shape


(503, 2)

In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/healthanxiety_dataset.csv")
data.head()

Unnamed: 0,subreddit,author,date,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,healthanxiety,Nomis176,1/1/2018,"Final doctor appointment tomorrow, tired of co...",9.137909,7.282428,8.616061,72.474091,61.090909,11.345455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055919
1,healthanxiety,psychstudent317,1/1/2018,Anyone have bone or muscle pain that was stres...,4.399892,7.131386,3.665806,87.185376,74.16129,5.423656,...,0.0,0.0,0.0,0.0,0.290466,0.0,0.0,0.0,0.0,0.0
2,healthanxiety,bulk_barn,1/1/2018,Listening to your body? I'm curious how those ...,3.59066,4.930895,4.605802,85.640967,72.584906,7.564151,...,0.0,0.0,0.0,0.110757,0.0,0.0,0.0,0.332673,0.0,0.0
3,healthanxiety,AutoModerator,1/1/2018,Weekly /r/HealthAnxiety Challenge - Exercise A...,4.515682,7.033342,5.059091,78.383636,72.863636,8.945455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,healthanxiety,parthkhurana7,1/1/2018,This is killing me So i had a bacterial stomac...,3.034388,5.356252,3.977494,85.306958,76.721519,7.757637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd

# Step 1: Load the dataset
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/healthanxiety_dataset.csv")

# Step 2: Keep only relevant columns
data = data[['post', 'subreddit']]

# Step 3: Rename columns
data.rename(columns={'post': 'text', 'subreddit': 'label'}, inplace=True)

# Step 4: Modify label values
data['label'] = data['label'].replace('healthanxiety', 'anxiety')

# Step 5: Save to CSV
output_path = "/content/drive/MyDrive/text_mining/combined/healthanxiety_cleaned.csv"
data.to_csv(output_path, index=False)

print("Cleaned healthanxiety dataset saved to:", output_path)


Cleaned healthanxiety dataset saved to: /content/drive/MyDrive/text_mining/combined/healthanxiety_cleaned.csv


In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/healthanxiety_cleaned.csv")
data.head()

Unnamed: 0,text,label
0,"Final doctor appointment tomorrow, tired of co...",anxiety
1,Anyone have bone or muscle pain that was stres...,anxiety
2,Listening to your body? I'm curious how those ...,anxiety
3,Weekly /r/HealthAnxiety Challenge - Exercise A...,anxiety
4,This is killing me So i had a bacterial stomac...,anxiety


In [None]:
data.shape

(1967, 2)

In [None]:
import pandas as pd

# Load both datasets
df1 = pd.read_csv("/content/drive/MyDrive/text_mining/combined/healthanxiety_cleaned.csv")
df2 = pd.read_csv("/content/drive/MyDrive/text_mining/combined/anxiety.csv")

# Concatenate the datasets
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save to new CSV
output_path = "/content/drive/MyDrive/text_mining/combined/anxiety_final.csv"
merged_df.to_csv(output_path, index=False)

print("Merged dataset saved to:", output_path)


Merged dataset saved to: /content/drive/MyDrive/text_mining/combined/anxiety_final.csv


In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/anxiety_final.csv")
data.head()

Unnamed: 0,text,label
0,"Final doctor appointment tomorrow, tired of co...",anxiety
1,Anyone have bone or muscle pain that was stres...,anxiety
2,Listening to your body? I'm curious how those ...,anxiety
3,Weekly /r/HealthAnxiety Challenge - Exercise A...,anxiety
4,This is killing me So i had a bacterial stomac...,anxiety


In [None]:
data.shape

(2470, 2)

In [None]:
import pandas as pd

# Step 1: Load combined_balanced_cleaned
combined_path = "/content/drive/MyDrive/text_mining/combined/combined_balanced_cleaned.csv"
combined_df = pd.read_csv(combined_path)

# Step 2: Remove 'Suicidal' status
combined_df = combined_df[combined_df['status'] != 'Suicidal']

# Step 3: Rename 'status' to 'label'
combined_df.rename(columns={'status': 'label'}, inplace=True)

# Step 4: Load anxiety_final.csv
anxiety_path = "/content/drive/MyDrive/text_mining/combined/anxiety_final.csv"
anxiety_df = pd.read_csv(anxiety_path)

# Step 5: Ensure both have same structure (drop extra columns if needed)
# If anxiety_df has no 'clean_text_final' column, drop it from combined_df
if 'clean_text_final' in combined_df.columns and 'clean_text_final' not in anxiety_df.columns:
    combined_df.drop(columns=['clean_text_final'], inplace=True)

# Step 6: Concatenate the datasets
final_df = pd.concat([combined_df, anxiety_df], ignore_index=True)

# Step 7: Save to CSV
output_path = "/content/drive/MyDrive/text_mining/combined/combined_anxiety_cleaned.csv"
final_df.to_csv(output_path, index=False)

print("Final merged dataset saved to:", output_path)


Final merged dataset saved to: /content/drive/MyDrive/text_mining/combined/combined_anxiety_cleaned.csv


In [None]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/combined/combined_anxiety_cleaned.csv")
data.head()

Unnamed: 0,statement,label,text
0,"I try so hard to be happy. For me, for my fami...",Depression,try hard happy family friends cannot jealous f...
1,If you do not love someone or cannot promise t...,Depression,love someone cannot promise love die get marri...
2,I have suffered from depression and anxiety fo...,Depression,suffered depression anxiety many years resulte...
3,i have been on prozac since maybe september la...,Depression,prozac since maybe september last year prozac ...
4,I ask myself daily will my extra efforts towar...,Depression,ask daily extra efforts towards better life im...


In [None]:
print(data.shape)
print(data['label'].value_counts())

(23774, 3)
label
Depression    10652
Normal        10652
anxiety        2470
Name: count, dtype: int64


In [None]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning
data['clean_text_final'] = data['text'].apply(clean_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
data['clean_text_final'] = data['text'].apply(clean_text)

# Run all configurations
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.93      0.92      0.93      2111
      Normal       0.90      0.96      0.93      2126
     anxiety       0.91      0.75      0.82       518

    accuracy                           0.92      4755
   macro avg       0.92      0.87      0.89      4755
weighted avg       0.92      0.92      0.92      4755

Top 20 impactful words:
depression: 21.0380
suicide: 8.6819
depressed: 8.3682
wa: 6.9453
anymore: 6.4916
health: 6.3752
anxiety: 6.3550
suicidal: 6.0670
wasnt: 5.9587
idk: 5.6224
didnt: 5.4352
ive: 5.3175
cancer: 5.2777
ampxb: 5.1999
couldnt: 5.1852
pression: 5.0497
ptsd: 5.0027
die: 4.9412
symptom: 4.8715
life: 4.7959

--- BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.89      0.79      0.84      2111
      Normal       0.79      0.95      0.86      2126
     anxiety       0.87      0.52      0.65       518

    accuracy     

mutual info + tf-idf

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Mutual Information selection (optional)
    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
data['clean_text_final'] = data['text'].apply(clean_text)

# Run all configurations
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_mutual_info=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=False)
run_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=True)
###################################################


--- BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.93      0.92      0.93      2111
      Normal       0.90      0.96      0.93      2126
     anxiety       0.91      0.75      0.82       518

    accuracy                           0.92      4755
   macro avg       0.92      0.87      0.89      4755
weighted avg       0.92      0.92      0.92      4755

Top 20 impactful words:
depression: 21.0380
suicide: 8.6819
depressed: 8.3682
wa: 6.9453
anymore: 6.4916
health: 6.3752
anxiety: 6.3550
suicidal: 6.0670
wasnt: 5.9587
idk: 5.6224
didnt: 5.4352
ive: 5.3175
cancer: 5.2777
ampxb: 5.1999
couldnt: 5.1852
pression: 5.0497
ptsd: 5.0027
die: 4.9412
symptom: 4.8715
life: 4.7959

--- BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.89      0.79      0.84      2111
      Normal       0.79      0.95      0.86      2126
     anxiety       0.87      0.52      0.65       518

    accuracy     

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m



--- NGRAM | Mutual Info | SVM ---

              precision    recall  f1-score   support

  Depression       0.93      0.91      0.92      2111
      Normal       0.90      0.95      0.92      2126
     anxiety       0.91      0.74      0.82       518

    accuracy                           0.91      4755
   macro avg       0.91      0.87      0.89      4755
weighted avg       0.91      0.91      0.91      4755

Top 20 impactful words:
depression: 21.8743
depressed: 9.7453
suicide: 9.3408
health anxiety: 7.8961
anymore: 7.6439
le: 7.6224
wa: 7.2470
suicidal: 7.1583
ampxb: 6.6500
anxiety: 6.4153
health: 6.1424
http co: 6.0238
idk: 5.9072
wasnt: 5.7933
didnt: 5.7832
self: 5.6276
ptsd: 5.5103
cancer: 5.3472
couldnt: 5.3399
life: 5.2747

--- NGRAM | Mutual Info | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.90      0.77      0.83      2111
      Normal       0.77      0.96      0.86      2126
     anxiety       0.87      0.51      0.64       51

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)
#########################################################################


--- BoW | BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.93      0.92      0.93      2111
      Normal       0.91      0.95      0.93      2126
     anxiety       0.85      0.73      0.78       518

    accuracy                           0.91      4755
   macro avg       0.90      0.87      0.88      4755
weighted avg       0.91      0.91      0.91      4755

Top 20 impactful words:
depression: 6.1388
suicide: 4.3225
pression: 4.0085
weakness: 3.9448
recovery: 3.6624
snapchat: 3.2825
wasnt: 3.1158
couldnt: 3.0724
wrist: 3.0666
direction: 3.0439
sink: 2.9021
ptsd: 2.8966
depressed: 2.8740
breakdown: 2.8668
meme: 2.8384
depressive: 2.7626
wa: 2.7212
selfish: 2.7061
rambling: 2.7020
idk: 2.6883

--- BoW | BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.90      0.77      0.83      2111
      Normal       0.77      0.96      0.86      2126
     anxiety       0.87      0.51      0.6

mutual info + BoW

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Mutual Information selection (optional)
    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_mutual_info=False)

# Run BoW + n-gram
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_mutual_info=True)
##########################################################################


--- BoW | BASIC | None | SVM ---

              precision    recall  f1-score   support

  Depression       0.93      0.92      0.93      2111
      Normal       0.91      0.95      0.93      2126
     anxiety       0.85      0.73      0.78       518

    accuracy                           0.91      4755
   macro avg       0.90      0.87      0.88      4755
weighted avg       0.91      0.91      0.91      4755

Top 20 impactful words:
depression: 6.1388
suicide: 4.3225
pression: 4.0085
weakness: 3.9448
recovery: 3.6624
snapchat: 3.2825
wasnt: 3.1158
couldnt: 3.0724
wrist: 3.0666
direction: 3.0439
sink: 2.9021
ptsd: 2.8966
depressed: 2.8740
breakdown: 2.8668
meme: 2.8384
depressive: 2.7626
wa: 2.7212
selfish: 2.7061
rambling: 2.7020
idk: 2.6883

--- BoW | BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

  Depression       0.90      0.77      0.83      2111
      Normal       0.77      0.96      0.86      2126
     anxiety       0.87      0.51      0.6

In [None]:
from sklearn.naive_bayes import MultinomialNB

# ----------------------------
# Use only Naïve Bayes
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate model
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Naïve Bayes does not offer meaningful feature importances like coef_ or feature_importances_
        if hasattr(model, 'feature_log_prob_'):
            importances = np.max(model.feature_log_prob_, axis=0)
            words_weights = list(zip(feature_names, importances))
            sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

            print("Top 20 impactful words:")
            for word, score in sorted_words[:20]:
                print(f"{word}: {score:.4f}")
        else:
            print("No importances available for this model.")

# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)
###########################################################################


--- BoW | BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.78      0.92      0.84      2111
      Normal       0.92      0.75      0.83      2126
     anxiety       0.78      0.82      0.80       518

    accuracy                           0.83      4755
   macro avg       0.83      0.83      0.82      4755
weighted avg       0.84      0.83      0.83      4755

Top 20 impactful words:
im: -3.8274
like: -4.0588
feel: -4.0728
anxiety: -4.3937
want: -4.4413
ive: -4.5230
life: -4.5283
know: -4.5653
get: -4.6029
time: -4.7197
even: -4.7644
dont: -4.8447
people: -4.9090
thing: -4.9215
year: -4.9305
really: -4.9436
would: -4.9689
day: -4.9723
friend: -5.0581
one: -5.0620

--- BoW | NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.80      0.91      0.85      2111
      Normal       0.91      0.76      0.83      2126
     anxiety       0.78      0.84      0.81       518

    accu

In [None]:
# ----------------------------
# Use only Naïve Bayes
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main TF-IDF Pipeline
# ----------------------------
def run_tfidf_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate model
    for name, model in models.items():
        print(f"\n--- TF-IDF | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance from feature_log_prob_
        if hasattr(model, 'feature_log_prob_'):
            importances = np.max(model.feature_log_prob_, axis=0)
            words_weights = list(zip(feature_names, importances))
            sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

            print("Top 20 impactful words:")
            for word, score in sorted_words[:20]:
                print(f"{word}: {score:.4f}")
        else:
            print("No importances available for this model.")

# Assume `data` already contains 10K rows per class and is cleaned
data['clean_text_final'] = data['text'].apply(clean_text)

# Run TF-IDF-only
run_tfidf_pipeline(data['clean_text_final'], data['label'], vectorizer_type='basic', apply_chi2=False)

# Run TF-IDF + n-gram
run_tfidf_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run TF-IDF + n-gram + Chi-2
run_tfidf_pipeline(data['clean_text_final'], data['label'], vectorizer_type='ngram', apply_chi2=True)



--- TF-IDF | BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.75      0.95      0.84      2111
      Normal       0.92      0.74      0.82      2126
     anxiety       0.93      0.66      0.78       518

    accuracy                           0.83      4755
   macro avg       0.87      0.79      0.81      4755
weighted avg       0.85      0.83      0.82      4755

Top 20 impactful words:
im: -4.5099
feel: -4.7237
like: -4.8114
want: -4.9875
anxiety: -5.0278
ive: -5.0551
life: -5.0864
depression: -5.1681
know: -5.1856
dont: -5.2267
get: -5.2564
even: -5.3885
time: -5.4119
people: -5.4125
really: -5.4815
thing: -5.4950
day: -5.5038
good: -5.5113
friend: -5.5240
year: -5.5481

--- TF-IDF | NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

  Depression       0.78      0.94      0.85      2111
      Normal       0.90      0.77      0.83      2126
     anxiety       0.92      0.70      0.80       5