In [31]:
import pandas as pd
import os

# Show where relative paths below will be resolved from.
print("Current working directory:")
print(os.getcwd())

data1 = pd.read_csv("cleanData.csv")
data2 = pd.read_csv("Mental_Health_and_Social_Media_Balance_Dataset.csv")

# Stack the two frames row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it both inputs keep their own 0-based labels, so
# `combined` would carry duplicate row labels and any later label-based lookup
# (.loc) would return several rows.
# NOTE(review): concat aligns on column names and fills the gaps with NaN. If the
# two files share no columns, every row is half-empty and a later dropna() on the
# text/label columns discards one file entirely -- confirm that a row-wise concat
# (rather than a merge on a key) is what is intended.
combined = pd.concat([data1, data2], ignore_index=True)

# Create the 'processed' directory if it doesn't exist
os.makedirs("processed", exist_ok=True)  # 'exist_ok=True' prevents error if directory already exists

# index=False: the row index is not written to the CSV.
combined.to_csv("processed/combined.csv", index=False)

print("Done merging!")

Current working directory:
/home/c2631990-ebe7-45de-91d9-dcf1fee5926b
Done merging!


In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load merged dataset
df = pd.read_csv("processed/combined.csv")

print("Available columns:")
print(list(df.columns))

# --- Auto-detect TEXT column ---
# Well-known column names are tried first, in this priority order.
text_candidates = [
    "text", "Text", "statement", "Statement", "post", "Post",
    "content", "Content", "message", "Message", "tweet", "Tweet"
]
TEXT_COLUMN = next((c for c in text_candidates if c in df.columns), None)

# If not found, guess: choose the object (string) column with the longest average text
if TEXT_COLUMN is None:
    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
    if not obj_cols:
        raise ValueError("No text-like (object/string) columns found. Check your CSV.")

    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal string "nan", after which fillna has nothing left to fill and every
    # missing cell would be counted as three characters of text.
    avg_len = {c: df[c].fillna("").astype(str).str.len().mean() for c in obj_cols}
    TEXT_COLUMN = max(avg_len, key=avg_len.get)

print("Using TEXT_COLUMN =", TEXT_COLUMN)

# --- Auto-detect LABEL column ---
label_candidates = [
    "status", "Status", "label", "Label", "class", "Class",
    "category", "Category", "target", "Target"
]
LABEL_COLUMN = next((c for c in label_candidates if c in df.columns), None)

# If not found, guess: pick the first low-cardinality non-numeric column
# (but not the text column)
if LABEL_COLUMN is None:
    LABEL_COLUMN = next(
        (
            c for c in df.columns
            if c != TEXT_COLUMN
            # numeric columns (ages, 1-10 ratings, ...) are features, not class labels
            and not pd.api.types.is_numeric_dtype(df[c])
            # drop NaN before casting so missing values are not counted as a class
            and 2 <= df[c].dropna().astype(str).nunique() <= 20
        ),
        None,
    )
    if LABEL_COLUMN is None:
        raise ValueError(
            "Could not auto-detect label column. Please set LABEL_COLUMN manually "
            "after looking at df.columns."
        )

print("Using LABEL_COLUMN =", LABEL_COLUMN)

# Keep only needed columns and drop missing. astype with a mapping returns a new
# frame, so nothing is assigned into a slice of `df` (no chained-assignment warning).
df2 = df[[TEXT_COLUMN, LABEL_COLUMN]].dropna().astype({TEXT_COLUMN: str})
if df2.empty:
    raise ValueError(
        f"No rows have both '{TEXT_COLUMN}' and '{LABEL_COLUMN}' populated."
    )

# Encode labels as integers; le.classes_ keeps the original class names
le = LabelEncoder()
y = le.fit_transform(df2[LABEL_COLUMN].astype(str))

X = df2[TEXT_COLUMN]

print("Classes found:", list(le.classes_))
print("Total rows used:", len(df2))

# Train-test split: 80/20, stratified on the label, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Available columns:
['index', 'statement', 'status', 'User_ID', 'Age', 'Gender', 'Daily_Screen_Time(hrs)', 'Sleep_Quality(1-10)', 'Stress_Level(1-10)', 'Days_Without_Social_Media', 'Exercise_Frequency(week)', 'Social_Media_Platform', 'Happiness_Index(1-10)']
Using TEXT_COLUMN = statement
Using LABEL_COLUMN = status
Classes found: ['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal']
Total rows used: 52680


In [None]:
# Multinomial Naive Bayes Classifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 20,000 terms
vectorizer = TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1, 2))

# The vocabulary is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit the Naive Bayes model (fit() returns the estimator itself)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted class ids for the held-out split
nb_preds = nb_model.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, nb_preds)
print("Multinomial NB Accuracy:", nb_accuracy)
print(classification_report(y_test, nb_preds, target_names=le.classes_))

In [22]:
# Data preparation for the neural network: reload, pick text/label columns, encode labels, split

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("processed/combined.csv")
print("Columns:", list(df.columns))

# Try common column names (adjust if needed); first match in list order wins
text_candidates  = ["text","Text","statement","Statement","post","Post","content","Content","message","Message","tweet","Tweet"]
label_candidates = ["status","Status","label","Label","class","Class","category","Category","target","Target"]

TEXT_COLUMN = next((c for c in text_candidates if c in df.columns), None)
LABEL_COLUMN = next((c for c in label_candidates if c in df.columns), None)

if TEXT_COLUMN is None or LABEL_COLUMN is None:
    # fallback: only object (string) columns are considered for either role
    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
    if TEXT_COLUMN is None:
        if not obj_cols:
            # max() on an empty list would otherwise fail with an unhelpful message
            raise ValueError("No text-like (object/string) columns found. Check your CSV.")
        # pick the column with the longest average string; fill NaN before the
        # cast so missing cells count as empty rather than as the text "nan"
        TEXT_COLUMN = max(obj_cols, key=lambda c: df[c].fillna("").astype(str).str.len().mean())
    if LABEL_COLUMN is None:
        # pick a low-cardinality column different from text
        for c in obj_cols:
            if c == TEXT_COLUMN:
                continue
            # drop NaN before the cast so missing values are not counted as a class
            n = df[c].dropna().astype(str).nunique()
            if 2 <= n <= 20:
                LABEL_COLUMN = c
                break
        if LABEL_COLUMN is None:
            # fail here with a clear message instead of a KeyError on df[[..., None]]
            raise ValueError(
                "Could not auto-detect label column. Please set LABEL_COLUMN manually "
                "after looking at df.columns."
            )

print("Using TEXT_COLUMN =", TEXT_COLUMN)
print("Using LABEL_COLUMN =", LABEL_COLUMN)

# Keep the two working columns, drop rows missing either one, and force the text
# to str. astype with a mapping returns a new frame (no assignment into a slice).
df = df[[TEXT_COLUMN, LABEL_COLUMN]].dropna().astype({TEXT_COLUMN: str})

# Integer-encode the labels; le.classes_ keeps the original class names
le = LabelEncoder()
y = le.fit_transform(df[LABEL_COLUMN].astype(str))
X = df[TEXT_COLUMN]

# 80/20 split, stratified on the label, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Classes:", list(le.classes_))
print("Train size:", len(X_train), "Test size:", len(X_test))

Columns: ['index', 'statement', 'status', 'User_ID', 'Age', 'Gender', 'Daily_Screen_Time(hrs)', 'Sleep_Quality(1-10)', 'Stress_Level(1-10)', 'Days_Without_Social_Media', 'Exercise_Frequency(week)', 'Social_Media_Platform', 'Happiness_Index(1-10)']
Using TEXT_COLUMN = statement
Using LABEL_COLUMN = status
Classes: ['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal']
Train size: 42144 Test size: 10536


In [29]:
# Deep Learning Neural Network

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

# Guard against stale kernel state: the TF-IDF matrices are built in the Naive
# Bayes cell, while y_train / y_test come from whichever split cell ran last.
# If the row counts disagree, features and labels are misaligned.
if X_train_tfidf.shape[0] != len(y_train) or X_test_tfidf.shape[0] != len(y_test):
    raise ValueError(
        "TF-IDF features and labels are out of sync; re-run the split cell and "
        "the TF-IDF cell before training."
    )

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# One-hot encode the integer labels for the categorical cross-entropy loss
num_classes = len(le.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat  = to_categorical(y_test,  num_classes=num_classes)

# Feed-forward classifier on the TF-IDF features:
# 256 -> dropout -> 128 -> dropout -> softmax over the classes
inputs = tf.keras.Input(shape=(X_train_tfidf.shape[1],), sparse=True, name="tfidf")
x = layers.Dense(256, activation="relu")(inputs)
x = layers.Dropout(0.4)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.4)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = Model(inputs, outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Stop once validation loss has not improved for 2 epochs and roll back to the
# weights of the best epoch
early_stop = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

history = model.fit(
    X_train_tfidf, y_train_cat,
    validation_split=0.1,
    epochs=10,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Held-out evaluation; `acc` is read again by the model-comparison cell
loss, acc = model.evaluate(X_test_tfidf, y_test_cat, verbose=0)
print("Neural Network Accuracy:", acc)

# Turn class probabilities into predicted class ids for the per-class report
probs = model.predict(X_test_tfidf, verbose=0)
preds = np.argmax(probs, axis=1)

print("Classification Report:")
print(classification_report(y_test, preds, target_names=le.classes_))

Epoch 1/10
593/593 ━━━━━━━━━━━━━━━━━━━━ 21s 32ms/step - accuracy: 0.6756 - loss: 0.8799 - val_accuracy: 0.7725 - val_loss: 0.6280
Epoch 2/10
593/593 ━━━━━━━━━━━━━━━━━━━━ 14s 24ms/step - accuracy: 0.8496 - loss: 0.4284 - val_accuracy: 0.7609 - val_loss: 0.6518
Epoch 3/10
593/593 ━━━━━━━━━━━━━━━━━━━━ 14s 24ms/step - accuracy: 0.9203 - loss: 0.2344 - val_accuracy: 0.7540 - val_loss: 0.7688
Neural Network Accuracy: 0.7656605839729309
Classification Report:
                      precision    recall  f1-score   support

             Anxiety       0.82      0.78      0.80       768
             Bipolar       0.82      0.76      0.79       555
          Depression       0.71      0.71      0.71      3081
              Normal       0.87      0.94      0.90      3269
Personality disorder       0.98      0.25      0.40       215
              Stress       0.61      0.46      0.52       518
      

In [30]:
# Test-set accuracy of both classifiers, side by side

nb_acc = accuracy_score(y_test, nb_preds)
print("Naive Bayes Accuracy:", nb_acc)
print("Neural Network Accuracy:", acc)

Naive Bayes Accuracy: 0.6642938496583144
Neural Network Accuracy: 0.7656605839729309
