# CRISP-DM — Social Media & Mental Health
_Last updated: 2025-11-01 17:42_


# Kaggle API setup (Colab-safe)

1. Get your API key from Kaggle: go to Account → **Create New API Token**; this downloads a `kaggle.json` file.
2. In **Colab**: upload `kaggle.json` when prompted in the first cell below.



In [None]:

#@title 🔑 Kaggle setup & dataset download
# This cell works in Colab. If running locally, ensure kaggle is installed and KAGGLE_CONFIG_DIR is set.
import os, json, pathlib, zipfile, subprocess, sys
from google.colab import files

print("Upload your kaggle.json (Kaggle → Account → Create New API Token).")
uploaded = files.upload()
assert 'kaggle.json' in uploaded, "Please upload kaggle.json"
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'wb') as f:
    f.write(uploaded['kaggle.json'])
os.chmod('/root/.kaggle/kaggle.json', 0o600)

!pip -q install kaggle
!kaggle datasets download -d ayeshaimran123/social-media-and-mental-health-balance -p data --force
os.makedirs("data", exist_ok=True)
# Unzip all archives
for z in os.listdir('data'):
    if z.endswith('.zip'):
        import zipfile
        with zipfile.ZipFile(os.path.join('data', z)) as zz:
            zz.extractall('data')
print("✅ Dataset downloaded to ./data")


## 1. Business Understanding
Describe objectives, KPIs, constraints, stakeholders, risks, hypotheses.

In [None]:
# Name of the headline success metric for the project (placeholder value).
# TODO: Define success metrics (e.g., F1, AUC) and business utility mapping.
SUCCESS_METRIC = "F1"

## 2. Data Understanding
Profile dataset: schema, missingness, distributions, correlations, target feasibility.

In [None]:

# EDA starter: pick the first tabular file in ./data, load it into `df`,
# then show its shape, first rows, summary statistics and per-column
# missing-value rate.
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import glob, os

# NOTE: this rebinds `files`, which the Kaggle setup cell bound to
# google.colab.files. Sorted so the file picked below is deterministic
# (glob returns entries in arbitrary order).
files = sorted(glob.glob('data/*.*'))
print('Found data files:', files)

# Try loading common names
candidates = [f for f in files if f.lower().endswith(('.csv','.tsv','.parquet','.xlsx'))]
if not candidates:
    raise SystemExit("Place the main CSV/TSV/XLSX/Parquet file in ./data")
path = candidates[0]
print("Loading:", path)

# Dispatch on the lower-cased extension so it agrees with the
# case-insensitive filter above (e.g. DATA.CSV must not reach read_parquet).
suffix = os.path.splitext(path)[1].lower()
if suffix == '.csv':
    df = pd.read_csv(path)
elif suffix == '.tsv':
    # TSV is tab-separated; the default comma separator would yield one column.
    df = pd.read_csv(path, sep='\t')
elif suffix == '.xlsx':
    df = pd.read_excel(path)
else:
    df = pd.read_parquet(path)

print(df.shape)
display(df.head())
display(df.describe(include='all'))
# Fraction of missing values per column, worst columns first.
missing = df.isna().mean().sort_values(ascending=False).to_frame('missing_rate')
display(missing.head(20))


## 3. Data Preparation
Target definition, leakage audit, splits, encoding, feature engineering.

In [None]:

# Data-preparation scaffold: choose the label column, build the preprocessing
# transformer (`pre`) and carve out a 20% hold-out split.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# TODO: set TARGET column name correctly
TARGET = 'target'  # <-- change me after inspecting the dataset

# Work on a copy so the frame loaded during EDA stays untouched.
df_clean = df.copy()
assert TARGET in df_clean.columns, "Set TARGET to the correct label column"

X = df_clean.drop(columns=[TARGET])
y = df_clean[TARGET]

# Numeric columns get one preprocessing branch, everything else the other.
num_cols = X.select_dtypes(include=['number']).columns.to_list()
cat_cols = X.select_dtypes(exclude=['number']).columns.to_list()

# Numeric branch: median imputation followed by standardisation.
numeric_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Non-numeric branch: most-frequent imputation followed by one-hot encoding;
# categories unseen at fit time are ignored at transform time.
categorical_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])
pre = ColumnTransformer(transformers=[
    ("num", numeric_proc, num_cols),
    ("cat", categorical_proc, cat_cols),
])

# Stratify only when the label has fewer than 20 distinct values.
if y.nunique() < 20:
    stratify_on = y
else:
    stratify_on = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=stratify_on,
    random_state=42,
)
print(X_train.shape, X_test.shape)


## 4. Modeling
Baselines, model selection, CV, hyperparameters.

In [None]:

# Baseline model: logistic regression chained after the `pre` transformer.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=200)),
])

# Cross-validate (cv=5) on the training split only, scored with macro F1.
scores = cross_val_score(model, X_train, y_train, scoring="f1_macro", cv=5)
cv_mean, cv_std = scores.mean(), scores.std()
print("CV F1_macro:", cv_mean, "+/-", cv_std)

# Fit on the full training split, then report on the hold-out set.
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))


## 5. Evaluation
Holdout integrity, error analysis, fairness, sensitivity.

In [None]:

import numpy as np
# TODO: add confusion matrix, calibration plots, subgroup analysis


## 6. Deployment
Packaging plan, API or batch, monitoring & drift.

In [None]:
# TODO: Export pipeline with joblib, add model card, define monitoring metrics.