# PsyPredict: ML Framework for Mental Health Disorders

In [1]:
import os, logging, re, hashlib
from typing import Dict
import numpy as np, pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import warnings

In [2]:
logger = logging.getLogger("ml_preprocessing_v4")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.handlers = [handler]

In [3]:
DATA_PATH = "D:\Programming Languages\Machine Learning\Projects\PsyPredict\Data\mental_disorders_dataset.csv"
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
EMAIL_REGEX = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_REGEX = re.compile(r"(\+?\d{1,3}[\s-]?)?(\(?\d{2,4}\)?[\s-]?)?\d{6,12}")
PII_COLUMN_KEYWORDS = ["name", "email", "phone", "address", "ssn", "id", "userid", "patient number", "patient_number", "patientid"]

  DATA_PATH = "D:\Programming Languages\Machine Learning\Projects\PsyPredict\Data\mental_disorders_dataset.csv"


In [4]:
def safe_hash_series(s: pd.Series, salt: str):
    return s.fillna("").astype(str).apply(lambda x: hashlib.sha256((salt + x).encode("utf-8")).hexdigest())

def detect_pii_columns(df: pd.DataFrame, sample_size: int = 200) -> Dict[str, str]:
    reasons = {}
    for col in df.columns:
        lname = col.lower()
        if any(k in lname for k in PII_COLUMN_KEYWORDS):
            reasons[col] = f"column name '{col}' contains PII keyword"
            continue
        sample = df[col].dropna().astype(str).head(sample_size).tolist()
        if any(EMAIL_REGEX.search(x) for x in sample):
            reasons[col] = "contains email-like values"
            continue
        if any(PHONE_REGEX.search(x) for x in sample):
            reasons[col] = "contains phone-like values"
            continue
    return reasons

In [5]:
class WinsorizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, lower_quantile=0.01, upper_quantile=0.99):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
    def fit(self, X, y=None):
        import pandas as _pd
        if isinstance(X, np.ndarray):
            X = _pd.DataFrame(X)
        self.lower_bounds_ = X.quantile(self.lower_quantile)
        self.upper_bounds_ = X.quantile(self.upper_quantile)
        self.feature_names_in_ = list(X.columns)
        return self
    def transform(self, X):
        import pandas as _pd
        if isinstance(X, np.ndarray):
            X = _pd.DataFrame(X, columns=self.feature_names_in_)
        X_w = X.copy()
        for col in X_w.columns:
            low = self.lower_bounds_.loc[col]
            high = self.upper_bounds_.loc[col]
            X_w[col] = X_w[col].clip(lower=low, upper=high)
        return X_w.values

In [6]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        import pandas as _pd
        if isinstance(X, np.ndarray):
            X = _pd.DataFrame(X)
        self.maps_ = {}
        for col in X.columns:
            vc = X[col].fillna("__MISSING__").astype(str).value_counts(normalize=True)
            self.maps_[col] = vc.to_dict()
        self.feature_names_in_ = list(X.columns)
        return self
    def transform(self, X):
        import pandas as _pd
        if isinstance(X, np.ndarray):
            X = _pd.DataFrame(X, columns=self.feature_names_in_)
        out = _pd.DataFrame(index=X.index)
        for col in X.columns:
            map_ = self.maps_.get(col, {})
            out[col] = X[col].fillna("__MISSING__").astype(str).map(map_).fillna(0.0)
        return out.values

### Step 1: Load

In [7]:
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(DATA_PATH)
df = pd.read_csv(DATA_PATH)
logger.info(f"Loaded dataset shape: {df.shape}")

2025-09-28 19:39:11,911 - INFO - Loaded dataset shape: (120, 19)


### Step 2: Detect Personal Identification IDs

In [8]:

initial_pii = detect_pii_columns(df)
logger.info(f"PII-like columns detected (pre-clean): {list(initial_pii.keys())}")

2025-09-28 19:39:11,921 - INFO - PII-like columns detected (pre-clean): ['Patient Number', 'Suicidal thoughts']


### Step 3A: Drop IDs for Anonymization

In [9]:
nrows = df.shape[0]
id_like = [c for c in df.columns if (df[c].nunique()/max(1,nrows) > 0.99) and ("patient" in c.lower() or c.lower().startswith("id") or "number" in c.lower())]
if id_like:
    logger.info(f"Dropping id-like columns: {id_like}")
    df.drop(columns = id_like, inplace = True)

2025-09-28 19:39:11,931 - INFO - Dropping id-like columns: ['Patient Number']


### Step 3B: Pseudonymize PII and Drop original unique IDs

In [10]:
pii_reasons = detect_pii_columns(df)
if pii_reasons:
    logger.info(f"Pseudonymizing columns: {list(pii_reasons.keys())}")
    salt = os.environ.get("PII_HASH_SALT", "static_demo_salt_change_in_prod")  
    for c in pii_reasons:
        df[f"__hashed__{c}"] = safe_hash_series(df[c], salt=salt)
    for c in pii_reasons:
        if c in df.columns:
            df.drop(columns=[c], inplace=True)
            logger.info(f"Dropped original PII column: {c}")

2025-09-28 19:39:11,943 - INFO - Pseudonymizing columns: ['Suicidal thoughts']
2025-09-28 19:39:11,945 - INFO - Dropped original PII column: Suicidal thoughts


### Step 4: Target detection & Excluding hashed columns

In [11]:
candidates = [c for c in df.columns if (not c.startswith("__hashed__")) and c.lower() in ("target", "label", "diagnosis", "outcome", "mental_disorder", "has_disorder", "class")]
candidates += [c for c in df.columns if (not c.startswith("__hashed__")) and df[c].nunique() <= 10 and c.lower().startswith("y_")]
candidates = list(dict.fromkeys(candidates))
target_col = candidates[0] if candidates else None
if not target_col:
    for c in df.columns[::-1]:
        if c.startswith("__hashed__"): continue
        if df[c].nunique() <= 5 and df[c].nunique() > 1 and c.lower() not in ['id', 'patient_id']:
            target_col = c
            break
logger.info(f"Inferred target: {target_col}")

2025-09-28 19:39:11,952 - INFO - Inferred target: Expert Diagnose


### Step 5: Create feature matrix X & Drop hashed PII from X by default

In [12]:
if target_col:
    X = df.drop(columns=[target_col]).copy()
    y = df[target_col].copy()
else:
    X = df.copy()
    y = None
hashed_cols = [c for c in X.columns if c.startswith("__hashed__")]
if hashed_cols:
    logger.info(f"Dropping hashed PII columns from features: {hashed_cols}")
    X.drop(columns = hashed_cols, inplace = True)
# Drop any remaining id-like in X
id_like_2 = [c for c in X.columns if (X[c].nunique()/max(1,X.shape[0]) > 0.99) and ("patient" in c.lower() or c.lower().startswith("id") or "number" in c.lower())]
if id_like_2:
    logger.info(f"Dropping id-like in features: {id_like_2}")
    X.drop(columns=id_like_2, inplace=True)

2025-09-28 19:39:11,961 - INFO - Dropping hashed PII columns from features: ['__hashed__Suicidal thoughts']


### Step 6: Determine dtypes & missingness and add missing flags

In [13]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
null_percent = (X.isnull().mean()*100).sort_values(ascending=False)
numeric_small_missing = [c for c in num_cols if X[c].isnull().mean() < 0.05]
numeric_medium_missing = [c for c in num_cols if 0.05 <= X[c].isnull().mean() <= 0.30]
numeric_large_missing = [c for c in num_cols if X[c].isnull().mean() > 0.30]
cat_small_missing = [c for c in cat_cols if X[c].isnull().mean() < 0.05]
cat_large_missing = [c for c in cat_cols if X[c].isnull().mean() >= 0.05]
for c in numeric_medium_missing + numeric_large_missing + cat_large_missing:
    X[f"__missing_flag__{c}"] = X[c].isnull().astype(int)

### Step 7: Build ColumnTransformer from final X's columns

In [14]:
ohe_cols = [c for c in cat_cols if X[c].nunique() <= 20]
freq_cols = [c for c in cat_cols if X[c].nunique() > 20]
numeric_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), ("winsor", WinsorizerTransformer()), ("scaler", StandardScaler())])
ohe_pipeline = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])
freq_pipeline = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("freq", FrequencyEncoder())])
transformers = []
if num_cols:
    transformers.append(("num", numeric_pipeline, num_cols))
if ohe_cols:
    transformers.append(("ohe", ohe_pipeline, ohe_cols))
if freq_cols:
    transformers.append(("freq", freq_pipeline, freq_cols))
if not transformers:
    raise RuntimeError("No transformers configured for these features")
preprocessor = ColumnTransformer(transformers=transformers, sparse_threshold=0)

### Step 8: Train/Test Split

In [15]:
if y is not None:
    stratify = None
    try:
        vc = y.value_counts()
        if vc.min() >= 2:
            stratify = y
        else:
            logger.warning("Not enough class members to stratify; splitting without stratify")
    except Exception:
        logger.warning("Stratify check failed; splitting without stratify")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=stratify)
else:
    X_train = X_test = X.copy()
    y_train = y_test = None

### Step 9: Fit and transform

In [16]:
preprocessor.fit(X_train)
X_train_t = preprocessor.transform(X_train)
logger.info(f"Transformed training shape: {X_train_t.shape}")

2025-09-28 19:39:12,010 - INFO - Transformed training shape: (96, 60)


### Step 10: Model Preparation and Evaluation

In [17]:
warnings.filterwarnings("ignore", category=FutureWarning)
logger = logging.getLogger("step10_output")
logger.setLevel(logging.INFO)
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(ch)

if 'y' not in globals() or y is None:
    print("No target (y) provided — skipping Step (10).")
else:
    try:
        clf = LogisticRegression(
            max_iter=2000,
            random_state=RANDOM_SEED,
            class_weight="balanced",
            solver="saga",           
            multi_class = "multinomial",
            n_jobs=None
        )
        pipeline = Pipeline([("preproc", preprocessor), ("clf", clf)])
        scoring = ["accuracy", "f1_weighted", "precision_weighted", "recall_weighted"]
        print("\nRunning 5-fold cross-validation (this may take a moment)...")
        cv_res = cross_validate(
            pipeline,
            X,
            y,
            cv=5,
            scoring=scoring,
            n_jobs=1,
            return_train_score=False
        )
        cv_df = pd.DataFrame({
            "fold": np.arange(1, len(cv_res["test_accuracy"]) + 1),
            "accuracy": cv_res["test_accuracy"],
            "f1_weighted": cv_res["test_f1_weighted"],
            "precision_weighted": cv_res["test_precision_weighted"],
            "recall_weighted": cv_res["test_recall_weighted"],
        })

        cv_summary = cv_df[["accuracy", "f1_weighted", "precision_weighted", "recall_weighted"]].agg(["mean", "std"]).T.round(4)
        print("\nCross-validation results (per fold):")
        print(cv_df.to_string(index=False))
        print("\nCross-validation summary (mean ± std):")
        for metric in cv_summary.index:
            mean = cv_summary.loc[metric, "mean"]
            std = cv_summary.loc[metric, "std"]
            print(f"- {metric}: {mean:.4f} ± {std:.4f}")
        logger.info("CV finished. Mean accuracy: %.4f", cv_res["test_accuracy"].mean())
        if ('X_train' in globals() and 'X_test' in globals() and 'y_train' in globals() and 'y_test' in globals()
            and X_train is not None and X_test is not None and y_train is not None and y_test is not None):
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            holdout_acc = accuracy_score(y_test, y_pred)
            print(f"\nHoldout accuracy: {holdout_acc:.4f}")
            report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            report_df = pd.DataFrame(report_dict).T.round(4).rename_axis("class_or_metric")
            print("\nClassification report (holdout):")
            print(report_df.to_string())
        else:
            print("\nHoldout split not available in this environment; only CV results were computed.")

        print("\nStep (10) complete — printed CV and holdout results.")
    except Exception as exc:
        print("\nERROR during Step (10):", str(exc))
        logger.exception("Exception in Step (10)")

2025-09-28 19:39:12,106 - INFO - CV finished. Mean accuracy: 0.8500



Running 5-fold cross-validation (this may take a moment)...

Cross-validation results (per fold):
 fold  accuracy  f1_weighted  precision_weighted  recall_weighted
    1  0.833333     0.832993            0.848810         0.833333
    2  0.916667     0.915449            0.927827         0.916667
    3  0.791667     0.786713            0.792857         0.791667
    4  0.875000     0.869872            0.886905         0.875000
    5  0.833333     0.830769            0.880952         0.833333

Cross-validation summary (mean ± std):
- accuracy: 0.8500 ± 0.0475
- f1_weighted: 0.8472 ± 0.0482
- precision_weighted: 0.8675 ± 0.0503
- recall_weighted: 0.8500 ± 0.0475

Holdout accuracy: 0.8750

Classification report (holdout):
                 precision  recall  f1-score  support
class_or_metric                                      
Bipolar Type-1      1.0000  0.8333    0.9091    6.000
Bipolar Type-2      0.7500  1.0000    0.8571    6.000
Depression          0.8571  1.0000    0.9231    6.000
Nor