In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import re
import nltk


In [2]:
from sklearn.metrics import classification_report,recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data = pd.read_csv("/content/drive/MyDrive/text_mining/datasets/depression_cleaned_balanced.csv")

data.head()

Unnamed: 0,text,label
0,like anyone else love whatches mean dont know ...,0.0
1,straight boy want boy cuddle give hugs super c...,0.0
2,anyone experience bupropionwellbutrinzybanvoxr...,1.0
3,ways make good money guys say mow lawns mom sa...,0.0
4,dont know future life confusing glory days gon...,0.0


In [None]:
print(data.shape)
print(data['label'].value_counts())

(851170, 2)
label
0.0    425585
1.0    425585
Name: count, dtype: int64


In [6]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

subset_data = (
    data.groupby('label', group_keys=False)
    .apply(lambda x: x.sample(n=10000, random_state=42))
    .reset_index(drop=True)
)

# Load stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
  .apply(lambda x: x.sample(n=10000, random_state=42))


In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Clean the data
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run all configurations
run_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='basic', apply_chi2=False)
run_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=False)
run_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BASIC | None | SVM ---

              precision    recall  f1-score   support

         0.0       0.90      0.91      0.91      2019
         1.0       0.91      0.90      0.90      1981

    accuracy                           0.91      4000
   macro avg       0.91      0.91      0.91      4000
weighted avg       0.91      0.91      0.91      4000

Top 20 impactful words:
depression: 4.1796
depressed: 3.0277
suicide: 2.8466
rteenagers: 2.7360
crush: 2.6912
kill: 2.4982
dm: 2.3926
rant: 2.3545
killing: 2.3059
mg: 2.2958
teen: 2.1282
everyday: 2.1088
numb: 2.1086
suicidal: 2.0541
teenager: 2.0513
as: 1.9888
dealing: 1.9661
die: 1.9521
feel: 1.9439
rdepression: 1.9139

--- BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.88      0.85      2019
         1.0       0.87      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.84      0.84      0.84      4000
weighted avg   

Mutual Information

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# ----------------------------
# Model dictionary
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Training and feature importance display
# ----------------------------
def run_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit models
    for name, model in models.items():
        print(f"\n--- {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Get feature importances
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


In [None]:
# Clean the data
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run all configurations
run_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='basic', apply_mutual_info=False)
run_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_mutual_info=False)
run_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_mutual_info=True)



--- BASIC | None | SVM ---

              precision    recall  f1-score   support

         0.0       0.90      0.91      0.91      2019
         1.0       0.91      0.90      0.90      1981

    accuracy                           0.91      4000
   macro avg       0.91      0.91      0.91      4000
weighted avg       0.91      0.91      0.91      4000

Top 20 impactful words:
depression: 4.1796
depressed: 3.0277
suicide: 2.8466
rteenagers: 2.7360
crush: 2.6912
kill: 2.4982
dm: 2.3926
rant: 2.3545
killing: 2.3059
mg: 2.2958
teen: 2.1282
everyday: 2.1088
numb: 2.1086
suicidal: 2.0541
teenager: 2.0513
as: 1.9888
dealing: 1.9661
die: 1.9521
feel: 1.9439
rdepression: 1.9139

--- BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.88      0.85      2019
         1.0       0.87      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.84      0.84      0.84      4000
weighted avg   

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m



--- NGRAM | Mutual Info | SVM ---

              precision    recall  f1-score   support

         0.0       0.89      0.91      0.90      2019
         1.0       0.90      0.88      0.89      1981

    accuracy                           0.89      4000
   macro avg       0.90      0.89      0.89      4000
weighted avg       0.89      0.89      0.89      4000

Top 20 impactful words:
depression: 4.7762
depressed: 3.6598
suicide: 3.5283
crush: 3.1445
rteenagers: 2.9889
kill: 2.6741
mg: 2.5765
rant: 2.5344
killing: 2.5166
suicidal: 2.4961
teen: 2.4878
therapy: 2.4332
get bed: 2.3636
dm: 2.3052
numb: 2.2369
everyday: 2.1925
therapist: 2.1861
living: 2.1824
as: 2.1219
discussion: 2.1126

--- NGRAM | Mutual Info | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.89      0.85      2019
         1.0       0.88      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.84      0.84      0.84      

chi-2


In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BoW | BASIC | None | SVM ---





              precision    recall  f1-score   support

         0.0       0.86      0.89      0.88      2019
         1.0       0.89      0.86      0.87      1981

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weighted avg       0.88      0.88      0.88      4000

Top 20 impactful words:
rteenagers: 1.9053
dm: 1.8105
discord: 1.4908
asf: 1.4604
depressionanxiety: 1.4337
gold: 1.4233
muscle: 1.4226
downer: 1.3703
rdepression: 1.3568
dumbass: 1.3484
complaining: 1.3246
raid: 1.3057
betrayed: 1.2932
neglect: 1.2751
unbearable: 1.2656
numb: 1.2648
fragile: 1.2547
bound: 1.2488
prescribed: 1.2452
graphic: 1.2358

--- BoW | BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85      2019
         1.0       0.89      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.85      0.84      0.84      4000
weighted av



              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87      2019
         1.0       0.88      0.85      0.87      1981

    accuracy                           0.87      4000
   macro avg       0.87      0.87      0.87      4000
weighted avg       0.87      0.87      0.87      4000

Top 20 impactful words:
rteenagers: 2.3120
know many: 1.8564
year really: 1.7082
discord: 1.6805
time one: 1.6541
accomplish: 1.5773
mg: 1.5748
home alone: 1.5108
generation: 1.5025
certainly: 1.4628
even like: 1.4498
rdepression: 1.4377
everything ive: 1.4206
knowledge: 1.3913
numb: 1.3808
universe: 1.3736
discussion: 1.3484
crush: 1.3414
whats point: 1.3228
life anymore: 1.3051

--- BoW | NGRAM | None | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85      2019
         1.0       0.88      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.85      0.8



              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88      2019
         1.0       0.90      0.84      0.87      1981

    accuracy                           0.87      4000
   macro avg       0.87      0.87      0.87      4000
weighted avg       0.87      0.87      0.87      4000

Top 20 impactful words:
rteenagers: 3.1053
dm: 2.4664
discord: 1.9397
rdepression: 1.6029
life isnt: 1.5790
hurt much: 1.5180
ward: 1.4877
another day: 1.4777
actively: 1.4508
mg: 1.3585
stay alive: 1.3486
die dont: 1.3433
unbearable: 1.2839
thing get better: 1.2746
get bed: 1.2216
point living: 1.2018
whats point: 1.1922
numb: 1.1891
dont energy: 1.1856
understands: 1.1819

--- BoW | NGRAM | Chi-2 | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85      2019
         1.0       0.88      0.78      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.85      0.84    

Mutual Information

In [7]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# ----------------------------
# Models to evaluate
# ----------------------------
models = {
    "SVM": LinearSVC(),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_mutual_info=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    if apply_mutual_info:
        selector = SelectKBest(mutual_info_classif, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate models
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Mutual Info' if apply_mutual_info else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance
        if hasattr(model, 'coef_'):
            importances = np.abs(model.coef_).sum(axis=0)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        else:
            print("No importances available for this model.")
            continue

        words_weights = list(zip(feature_names, importances))
        sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

        print("Top 20 impactful words:")
        for word, score in sorted_words[:20]:
            print(f"{word}: {score:.4f}")
#################################################################################################################

In [8]:
# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='basic', apply_mutual_info=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_mutual_info=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_mutual_info=True)



--- BoW | BASIC | None | SVM ---





              precision    recall  f1-score   support

         0.0       0.86      0.89      0.88      2019
         1.0       0.89      0.86      0.87      1981

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weighted avg       0.88      0.88      0.88      4000

Top 20 impactful words:
rteenagers: 1.9053
dm: 1.8105
discord: 1.4908
asf: 1.4604
depressionanxiety: 1.4337
gold: 1.4233
muscle: 1.4226
downer: 1.3703
rdepression: 1.3568
dumbass: 1.3484
complaining: 1.3246
raid: 1.3057
betrayed: 1.2932
neglect: 1.2751
unbearable: 1.2656
numb: 1.2648
fragile: 1.2547
bound: 1.2488
prescribed: 1.2452
graphic: 1.2358

--- BoW | BASIC | None | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85      2019
         1.0       0.89      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.85      0.84      0.84      4000
weighted av



              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87      2019
         1.0       0.88      0.85      0.87      1981

    accuracy                           0.87      4000
   macro avg       0.87      0.87      0.87      4000
weighted avg       0.87      0.87      0.87      4000

Top 20 impactful words:
rteenagers: 2.3120
know many: 1.8564
year really: 1.7082
discord: 1.6805
time one: 1.6541
accomplish: 1.5773
mg: 1.5748
home alone: 1.5108
generation: 1.5025
certainly: 1.4628
even like: 1.4498
rdepression: 1.4377
everything ive: 1.4206
knowledge: 1.3913
numb: 1.3808
universe: 1.3736
discussion: 1.3484
crush: 1.3414
whats point: 1.3228
life anymore: 1.3051

--- BoW | NGRAM | None | AdaBoost ---

              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85      2019
         1.0       0.88      0.79      0.83      1981

    accuracy                           0.84      4000
   macro avg       0.85      0.8

In [None]:
from sklearn.naive_bayes import MultinomialNB

# ----------------------------
# Use only Naïve Bayes
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main BoW Pipeline
# ----------------------------
def run_bow_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. BoW vectorization
    if vectorizer_type == 'basic':
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate model
    for name, model in models.items():
        print(f"\n--- BoW | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Naïve Bayes does not offer meaningful feature importances like coef_ or feature_importances_
        if hasattr(model, 'feature_log_prob_'):
            importances = np.max(model.feature_log_prob_, axis=0)
            words_weights = list(zip(feature_names, importances))
            sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

            print("Top 20 impactful words:")
            for word, score in sorted_words[:20]:
                print(f"{word}: {score:.4f}")
        else:
            print("No importances available for this model.")

# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run BoW-only
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='basic', apply_chi2=False)

# Run BoW + n-gram
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run BoW + n-gram + Chi-2
run_bow_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=True)



--- BoW | BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

         0.0       0.94      0.82      0.88      2019
         1.0       0.84      0.94      0.89      1981

    accuracy                           0.88      4000
   macro avg       0.89      0.88      0.88      4000
weighted avg       0.89      0.88      0.88      4000

Top 20 impactful words:
im: -3.6686
dont: -4.1562
like: -4.2323
feel: -4.3411
want: -4.3918
life: -4.5718
bruh: -4.5774
know: -4.6051
ive: -4.7048
get: -4.7356
time: -4.8290
friend: -4.8697
cant: -4.8805
even: -4.9133
people: -4.9306
year: -4.9793
fuck: -4.9827
day: -4.9992
one: -4.9992
tock: -5.0186

--- BoW | NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

         0.0       0.95      0.77      0.85      2019
         1.0       0.80      0.96      0.87      1981

    accuracy                           0.86      4000
   macro avg       0.88      0.86      0.86      4000
weighted avg   

In [None]:
# ----------------------------
# Use only Naïve Bayes
# ----------------------------
models = {
    "NaiveBayes": MultinomialNB()
}

# ----------------------------
# Main TF-IDF Pipeline
# ----------------------------
def run_tfidf_pipeline(X_text, y, vectorizer_type, apply_chi2=False):
    # 1. TF-IDF vectorization
    if vectorizer_type == 'basic':
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
    else:  # 'ngram'
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

    X_vect = vectorizer.fit_transform(X_text)
    feature_names = vectorizer.get_feature_names_out()

    # 2. Chi-2 selection (optional)
    if apply_chi2:
        selector = SelectKBest(chi2, k=2000)
        X_vect = selector.fit_transform(X_vect, y)
        selected_idx = selector.get_support(indices=True)
        feature_names = feature_names[selected_idx]

    # 3. Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

    # 4. Fit and evaluate model
    for name, model in models.items():
        print(f"\n--- TF-IDF | {vectorizer_type.upper()} | {'Chi-2' if apply_chi2 else 'None'} | {name} ---\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        # 5. Feature importance from feature_log_prob_
        if hasattr(model, 'feature_log_prob_'):
            importances = np.max(model.feature_log_prob_, axis=0)
            words_weights = list(zip(feature_names, importances))
            sorted_words = sorted(words_weights, key=lambda x: x[1], reverse=True)

            print("Top 20 impactful words:")
            for word, score in sorted_words[:20]:
                print(f"{word}: {score:.4f}")
        else:
            print("No importances available for this model.")

# Assume `subset_data` already contains 10K rows per class and is cleaned
subset_data['clean_text_final'] = subset_data['text'].apply(clean_text)

# Run TF-IDF-only
run_tfidf_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='basic', apply_chi2=False)

# Run TF-IDF + n-gram
run_tfidf_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=False)

# Run TF-IDF + n-gram + Chi-2
run_tfidf_pipeline(subset_data['clean_text_final'], subset_data['label'], vectorizer_type='ngram', apply_chi2=True)



--- TF-IDF | BASIC | None | NaiveBayes ---

              precision    recall  f1-score   support

         0.0       0.93      0.83      0.88      2019
         1.0       0.85      0.94      0.89      1981

    accuracy                           0.88      4000
   macro avg       0.89      0.89      0.88      4000
weighted avg       0.89      0.88      0.88      4000

Top 20 impactful words:
im: -4.5004
dont: -4.8629
feel: -4.9020
want: -5.0268
like: -5.0503
life: -5.1201
know: -5.2471
ive: -5.2783
cant: -5.3307
guy: -5.3629
get: -5.3814
people: -5.4712
time: -5.4807
even: -5.5041
girl: -5.5080
friend: -5.5370
year: -5.5666
one: -5.5759
really: -5.5923
school: -5.6038

--- TF-IDF | NGRAM | None | NaiveBayes ---

              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88      2019
         1.0       0.86      0.91      0.88      1981

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weigh