In [None]:
import pandas as pd
# Load the dataset from the notebook's working directory.
DATA_PATH = "mental_health_social_media_dataset.csv"
df = pd.read_csv(DATA_PATH)

# First-look sanity checks, printed in order: leading rows, per-column
# missing-value counts, and summary statistics from describe().
for report in (df.head(), df.isna().sum(), df.describe()):
    print(report)

# Count of rows flagged by DataFrame.duplicated().
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict the exploratory plots to the int64/float64 columns.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_df = df[numeric_cols]

# One histogram panel per numeric column, drawn on a single figure.
numeric_df.hist(bins=20, figsize=(14, 12), edgecolor='black')
plt.suptitle("Distribution of Numeric Columns")
plt.show()

# Pairwise correlations between the numeric columns as an annotated heatmap.
corr_matrix = numeric_df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Columns used to segment users with K-Means.
features_for_clustering = [
    'daily_screen_time_min',
    'social_media_time_min',
    'negative_interactions_count',
    'positive_interactions_count',
    'sleep_hours',
    'physical_activity_min',
    'anxiety_level',
    'stress_level',
    'mood_level'
]

# Standardise every feature so that columns on larger numeric scales do not
# dominate the Euclidean distances K-Means is built on.
X = df[features_for_clustering]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the resulting partition
# (and the FutureWarning noise) depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)
print("Cluster value counts:\n", df['cluster'].value_counts())

# Two principal components of the scaled features, kept for 2-D visualisation.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

# K-Means cluster ids are arbitrary: id 0 is not guaranteed to be the
# "moderate" group, id 2 the "high stress" group, and so on. Rather than
# hard-coding an id -> description map, rank the clusters by their mean
# stress_level (in original units) and hand out the descriptions from the
# lowest-stress cluster to the highest-stress one.
cluster_profile = df.groupby('cluster')[features_for_clustering].mean()
# Printed so the reader can check that the non-stress parts of each
# description (screen time, lifestyle) actually match the cluster means.
print("\nCluster feature means:\n", cluster_profile)

stress_ranked_ids = cluster_profile['stress_level'].sort_values().index
stress_ranked_names = [
    "Low Stress & Healthy Lifestyle",
    "Moderate Usage & Moderate Stress",
    "High Screen Time & High Stress",
]
cluster_labels = dict(zip(stress_ranked_ids, stress_ranked_names))
df['cluster_label'] = df['cluster'].map(cluster_labels)
# Scatter the observations in the plane of the two principal components,
# coloured by their descriptive cluster label.
# NOTE(review): the axis captions are interpretive names for PC1/PC2; the
# code does not inspect the PCA loadings to confirm them.
pc1, pc2 = components[:, 0], components[:, 1]

plt.figure(figsize=(8, 6))
sns.scatterplot(x=pc1, y=pc2, hue=df['cluster_label'], palette='Set1', s=60)
plt.title("User Behavioral Clustering Visualization")
plt.xlabel("Lifestyle–Usage Pattern")
plt.ylabel("Stress–Activity Variation")
plt.legend(title="Cluster Label")
plt.show()



In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score

# Internal validation of the K-Means partition, computed on the same scaled
# feature matrix the model was fitted on.
cluster_assignments = df['cluster']
sil_score = silhouette_score(X_scaled, cluster_assignments)
db_score = davies_bouldin_score(X_scaled, cluster_assignments)

print("Silhouette Score for KMeans:", sil_score)
print("KMeans Inertia (WCSS):", kmeans.inertia_)
print("Davies-Bouldin Score:", db_score)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Binary classification target: 1 when stress_level is strictly above the
# sample median, 0 otherwise (missing values compare False and so map to 0).
# The median is computed once here; evaluating it inside a per-row lambda
# recomputed the same value for every row of the frame.
stress_median = df['stress_level'].median()
df['stress_label'] = (df['stress_level'] > stress_median).astype('int64')
print(df['stress_label'].value_counts())
# Predictor columns for the stress classifier. stress_level is absent from
# this list; the target stress_label is derived from it.
supervised_features = [
    'daily_screen_time_min',
    'social_media_time_min',
    'negative_interactions_count',
    'positive_interactions_count',
    'sleep_hours',
    'physical_activity_min',
    'anxiety_level',
    'mood_level',
]

# Rebinds X (previously the clustering matrix) to the supervised predictors.
X = df[supervised_features]
y = df['stress_label']

# 80/20 split, stratified on the target so both partitions keep the same
# class balance; the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit a random forest with default hyper-parameters and score it on the
# held-out partition.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
# Pair each predictor with the fitted forest's feature_importances_ value and
# order them from most to least important for plotting.
importances = rf.feature_importances_
feature_names = supervised_features
fi_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(10, 6))
sns.barplot(data=fi_df, x='Importance', y='Feature')
plt.title("Feature Importance in Random Forest Model")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()

# Counts of held-out predictions by (true label, predicted label):
# rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap with integer cell formatting.
plt.figure(figsize=(6, 4))
sns.heatmap(cm, cmap='Blues', fmt='d', annot=True)
plt.title("Confusion Matrix - Random Forest Classifier")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()