In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk 
from nltk.corpus import stopwords

In [48]:
data=pd.read_csv("data_to_be_cleansed.csv")

## Data Exploaration

In [49]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1


In [50]:
data.shape

(5957, 4)

In [51]:
data.columns

Index(['Unnamed: 0', 'text', 'title', 'target'], dtype='object')

In [52]:
data.drop('Unnamed: 0',axis=1,inplace=True)

In [53]:
data.head()

Unnamed: 0,text,title,target
0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1


In [54]:
data.isnull().any()

text       True
title     False
target    False
dtype: bool

In [55]:
data.isnull().value_counts()

text   title  target
False  False  False     5607
True   False  False      350
Name: count, dtype: int64

### Fixing Missing Values

In [56]:
## combining text and title to have one single corpus
data['text'] = data['text'].fillna('') + ' ' + data['title'].fillna('')

In [57]:
data.head()

Unnamed: 0,text,title,target
0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1


In [58]:
data['text'][0]

'Welcome to /r/depression\'s check-in post - a place to take a moment and share what is going on and how you are doing. If you have an accomplishment you want to talk about (these shouldn\'t be standalone posts in the sub as they violate the "role model" rule, but are welcome here), or are having a tough time but prefer not to make your own post, this is a place you can share.\n\n-----\n\nOur subreddit rules are located in the sidebar (you can also always access them at https://www.reddit.com/r/depression/about/rules) - since all of them exist for important safety reasons, we ask everyone here to read and follow them. Please click \'report\' on any harmful content you see here - we always want to know and deal as soon as we can.\n\nWe also have several wikis there for help with finding and giving support:\n\nhttps://www.reddit.com/r/depression/wiki/what_is_depression provides guidance about what is and isn\'t a depressive disorder, guidance on the complex nature of the illnesses that a

In [59]:
data.drop('title',axis=1,inplace=True)

In [60]:
data.head()

Unnamed: 0,text,target
0,Welcome to /r/depression's check-in post - a p...,1
1,We understand that most people who reply immed...,1
2,Anyone else just miss physical touch? I crave ...,1
3,I’m just so ashamed. Everyone and everything f...,1
4,I really need a friend. I don't even have a si...,1


In [61]:
data.shape

(5957, 2)

In [62]:
df=data

## Text Preprocessing

In [63]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import re 
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

In [65]:
lematizer=WordNetLemmatizer()
corpus=[]
for i in range(0,len(df)):
    text=df['text'][i]
    text = text.lower()  # Lowercase
    text = re.sub(r'\[.*?\]', '', text) # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Remove URLs
    text = re.sub(r'<.*?>+', '', text) # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [lematizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    tokens=" ".join(tokens)
    corpus.append(tokens)

In [66]:
corpus

['welcome rdepressions checkin post place take moment share going accomplishment want talk shouldnt standalone post sub violate role model rule welcome tough time prefer make post place share subreddit rule located sidebar also always access since exist important safety reason ask everyone read follow please click report harmful content see always want know deal soon also several wikis help finding giving support provides guidance isnt depressive disorder guidance complex nature illness usually grouped depression label redirect information common offtopic issue offer information nature value peer support mentalhealth issue general lot guidance learning isnt usually helpful giving peer support ysk type rule violation frequently see interfering people getting safe relevant support people breaking private contact rule never trust anyone try get private conversation response post see im help post show dont understand basic principle peer support especially selectivity giving help wiki expl

In [68]:
!pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
Using cached scipy-1.16.2-cp313-cp313-win_amd64.whl (38.5 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.7.2 scipy-1.16.2 threadpoolctl-3.6.0



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import SVC 


In [70]:
models = {
    "naive_bayes": MultinomialNB(),
    "random_forest": RandomForestClassifier(),
    "log_reg": LogisticRegression(max_iter=1000),
    "gaussian":GaussianNB(),
    "svm":SVC()
}

In [71]:
param_grids = {
    "bow": {
        "vectorizer__ngram_range": [(1,1), (1,2)],  
        "vectorizer__max_df": [0.7, 0.85, 1.0],            
        "vectorizer__min_df": [1, 2, 5],                   
        "vectorizer__max_features": [None, 5000, 10000],    
        "vectorizer__binary": [False, True],               
        "vectorizer__stop_words": [None, 'english'],        
    },

    # --- TF-IDF Vectorizer ---
    "tfidf": {
        "vectorizer__ngram_range": [(1,1), (1,2), (1,3)],
        "vectorizer__max_df": [0.7, 0.85, 1.0],
        "vectorizer__min_df": [1, 2, 5],
        "vectorizer__max_features": [None, 5000, 10000],
        "vectorizer__use_idf": [True, False],
        "vectorizer__smooth_idf": [True, False],
        "vectorizer__sublinear_tf": [True, False],
        "vectorizer__stop_words": [None, 'english'],
    },
"naive_bayes": {
    "model__alpha": [0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
    "model__fit_prior": [True, False]
},
    "random_forest": {
    "model__n_estimators": [100, 200, 500],
    "model__max_depth": [None, 10, 20, 40],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2", None],
    "model__bootstrap": [True, False]
},
    "log_reg": {
    "model__C": [0.01, 0.1, 1, 10, 100],
    "model__penalty": ["l1", "l2", "elasticnet", None],
    "model__solver": ["lbfgs", "liblinear", "saga"],
    "model__max_iter": [500, 1000, 2000],
    "model__l1_ratio": [0.0, 0.5, 1.0]  # only used with elasticnet
},
"svm": {
    "model__C": [0.1, 1, 10, 100],
    "model__kernel": ["linear", "rbf", "poly", "sigmoid"],
    "model__gamma": ["scale", "auto"],
    "model__degree": [2, 3, 4]
}
}


In [72]:
text=corpus
labels=df['target']

In [73]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels  # if classification is imbalanced
)

In [74]:
vectorizers = {
    "bow": CountVectorizer(),
    "tfidf": TfidfVectorizer(),
}

In [77]:
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if hasattr(X, "toarray"):  # check if sparse
            return X.toarray()
        else:
            return X

# Main loop
from sklearn.model_selection import RandomizedSearchCV
results = []

for vec_name, vectorizer in vectorizers.items():
    for model_name, model in models.items():
        print(f"\n🔍 Testing {vec_name.upper()} + {model_name.upper()}")

        # Define pipeline with conditional dense conversion for GaussianNB
        if model_name == 'gaussian':
            pipe = Pipeline([
                ("vectorizer", vectorizer),
                ("todense", DenseTransformer()),
                ("model", model)
            ])
            # Use fewer parallel jobs for GaussianNB to reduce memory usage
            n_jobs_to_use = 1
            n_iter_to_use = 20
        else:
            pipe = Pipeline([
                ("vectorizer", vectorizer),
                ("model", model)
            ])
            n_jobs_to_use = -1
            n_iter_to_use = 50

        # Combine param grids safely
        params = {}
        params.update(param_grids.get(vec_name, {}))
        params.update(param_grids.get(model_name, {}))

        # Randomized Search setup
        random_search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=params,
            n_iter=n_iter_to_use,
            cv=3,
            scoring='accuracy',
            n_jobs=n_jobs_to_use,
            verbose=2,
            random_state=42
        )

        # Fit model
        random_search.fit(X_train, y_train)

        # Predict on test set
        y_pred = random_search.predict(X_test)

        # Compute metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        print("\n📊 Performance Metrics:")
        print(f"  Accuracy : {acc:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall   : {rec:.4f}")
        print(f"  F1 Score : {f1:.4f}")
        print("\nBest Parameters:", random_search.best_params_)
        print("-" * 70)

        # Save detailed results
        results.append({
            "vectorizer": vec_name,
            "model": model_name,
            "best_cv_score": random_search.best_score_,
            "test_accuracy": acc,
            "test_precision": prec,
            "test_recall": rec,
            "test_f1": f1,
            "best_params": random_search.best_params_
        })

# Create summary DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="test_f1", ascending=False)

print("\n🏆 Final Summary (Sorted by F1 Score):")
print(results_df[["vectorizer", "model", "test_accuracy", "test_precision", "test_recall", "test_f1"]])


🔍 Testing BOW + NAIVE_BAYES
Fitting 3 folds for each of 50 candidates, totalling 150 fits

📊 Performance Metrics:
  Accuracy : 0.7623
  Precision: 0.7800
  Recall   : 0.7623
  F1 Score : 0.7662

Best Parameters: {'vectorizer__stop_words': 'english', 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 1, 'vectorizer__max_features': None, 'vectorizer__max_df': 0.85, 'vectorizer__binary': True, 'model__fit_prior': True, 'model__alpha': 0.5}
----------------------------------------------------------------------

🔍 Testing BOW + RANDOM_FOREST
Fitting 3 folds for each of 50 candidates, totalling 150 fits

📊 Performance Metrics:
  Accuracy : 0.8154
  Precision: 0.8226
  Recall   : 0.8154
  F1 Score : 0.8173

Best Parameters: {'vectorizer__stop_words': None, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 1, 'vectorizer__max_features': 5000, 'vectorizer__max_df': 0.85, 'vectorizer__binary': True, 'model__n_estimators': 500, 'model__min_samples_split': 10, 'model__min_samples_lea

69 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
33 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\adity\Downloads\minor sem5\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\adity\Downloads\minor sem5\.venv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\adity\Downloads\minor sem5\.venv\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~


📊 Performance Metrics:
  Accuracy : 0.8059
  Precision: 0.8080
  Recall   : 0.8059
  F1 Score : 0.8065

Best Parameters: {'vectorizer__stop_words': None, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 2, 'vectorizer__max_features': None, 'vectorizer__max_df': 0.85, 'vectorizer__binary': False, 'model__solver': 'liblinear', 'model__penalty': 'l2', 'model__max_iter': 2000, 'model__l1_ratio': 0.5, 'model__C': 0.1}
----------------------------------------------------------------------

🔍 Testing BOW + GAUSSIAN
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END vectorizer__binary=True, vectorizer__max_df=1.0, vectorizer__max_features=5000, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english; total time=   0.6s
[CV] END vectorizer__binary=True, vectorizer__max_df=1.0, vectorizer__max_features=5000, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english; total time=   0.6s
[CV] END vectorizer__binary=

57 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\adity\Downloads\minor sem5\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\adity\Downloads\minor sem5\.venv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\adity\Downloads\minor sem5\.venv\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~


📊 Performance Metrics:
  Accuracy : 0.7975
  Precision: 0.8035
  Recall   : 0.7975
  F1 Score : 0.7989

Best Parameters: {'vectorizer__use_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__stop_words': 'english', 'vectorizer__smooth_idf': True, 'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 1, 'vectorizer__max_features': 10000, 'vectorizer__max_df': 0.85, 'model__solver': 'lbfgs', 'model__penalty': 'l2', 'model__max_iter': 500, 'model__l1_ratio': 0.5, 'model__C': 1}
----------------------------------------------------------------------

🔍 Testing TFIDF + GAUSSIAN
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END vectorizer__max_df=0.85, vectorizer__max_features=5000, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__smooth_idf=True, vectorizer__stop_words=english, vectorizer__sublinear_tf=False, vectorizer__use_idf=True; total time=   1.5s
[CV] END vectorizer__max_df=0.85, vectorizer__max_features=5000, vectorizer__min_df=1, vecto