In [None]:
from pathlib import Path
from scipy import sparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, json
from joblib import dump


# Project APIs
from addiction import (
    load_raw, load_interim, save_interim, basic_cleanup, train_test_split_safe,
    build_features, infer_column_types, make_preprocessor,
    get_feature_names_after_preprocessor, save_preprocessor, load_preprocessor
)

# Direct config (paths & knobs read from .env by addiction.config)
from addiction import __version__ as ADDICTION_VERSION
from addiction.config import (
    RAW_DATA_DEFAULT_PATH, INTERIM_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR,
    TARGET, TEST_SIZE, RANDOM_STATE, ENCODE_CATEGORICALS
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)



In [188]:
# Echo the package version and the configuration values this run depends on,
# so the rendered notebook records what produced the outputs below.
for fields in (
    ("addiction package:", ADDICTION_VERSION),
    ("RAW:", RAW_DATA_DEFAULT_PATH),
    ("TARGET:", TARGET, "| TEST_SIZE:", TEST_SIZE, "| RANDOM_STATE:", RANDOM_STATE),
):
    print(*fields)

addiction package: 0.0.1
RAW: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/data/raw/addiction_population_data.csv
TARGET: has_health_issues | TEST_SIZE: 0.2 | RANDOM_STATE: 42


In [189]:
# Location of the cached interim dataset.
INTERIM = INTERIM_DATA_DIR / "dataset.csv"

if not INTERIM.exists():
    # No cache yet: clean the raw file once and store the result for later runs.
    df = basic_cleanup(load_raw(RAW_DATA_DEFAULT_PATH))
    save_interim(df, INTERIM)
else:
    df = load_interim(INTERIM)

display(df.head())
print("Shape:", df.shape)
assert TARGET in df.columns, f"Missing target column: {TARGET!r}"
# Class balance of the target, counting missing labels as their own bucket.
df[TARGET].value_counts(dropna=False)

[32m2025-11-07 16:44:56.535[0m | [1mINFO    [0m | [36maddiction.dataset[0m:[36mload_interim[0m:[36m206[0m - [1mLoading interim data: /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/data/interim/dataset.csv[0m


Unnamed: 0,id,name,age,gender,country,city,education_level,employment_status,annual_income_usd,marital_status,...,attempts_to_quit_smoking,attempts_to_quit_drinking,has_health_issues,mental_health_status,exercise_frequency,diet_quality,sleep_hours,bmi,social_support,therapy_history
0,1,michael bates,66,other,Yemen,Martinmouth,secondary,student,45595,married,...,6,2,True,good,daily,average,5.6,22.4,,Current
1,2,brian thompson,29,male,Saudi Arabia,Harperhaven,primary,self-employed,145842,single,...,1,6,False,poor,weekly,good,6.7,24.1,,
2,3,steven little,75,male,Togo,Chanport,postgraduate,unemployed,162480,single,...,9,9,True,good,never,good,6.2,22.2,,
3,4,michael mathews,35,other,Togo,North Cory,university,unemployed,16023,in a relationship,...,5,7,False,average,daily,good,7.2,25.5,,Current
4,5,nicholas sanchez,38,female,Morocco,Danielberg,college,self-employed,62933,in a relationship,...,4,7,True,poor,weekly,good,8.5,31.2,,Past


Shape: (3000, 25)


has_health_issues
False    1510
True     1490
Name: count, dtype: int64

In [190]:
# Hold out a test split; target, split size and seed come from the project config.
split_options = dict(
    target=TARGET,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=True,
)
Xtr_raw, Xte_raw, ytr, yte = train_test_split_safe(df, **split_options)

# Shapes of the raw train / test feature frames.
(Xtr_raw.shape, Xte_raw.shape)

[32m2025-11-07 16:44:56.661[0m | [1mINFO    [0m | [36maddiction.dataset[0m:[36mtrain_test_split_safe[0m:[36m248[0m - [1mSplit complete: X_train=(2400, 24), X_test=(600, 24), y_train=(2400,), y_test=(600,)[0m


((2400, 24), (600, 24))

In [191]:
# Convert the targets to plain NumPy arrays; boolean labels become 0/1 integers.
ytr01 = np.asarray(ytr)
yte01 = np.asarray(yte)
if ytr01.dtype == bool:
    ytr01 = ytr01.astype(int)
if yte01.dtype == bool:
    yte01 = yte01.astype(int)

# Inspect the converted arrays (not the original ytr / yte) so this cell
# reflects what is actually passed to the model and the metrics.
ytr01.shape, yte01.shape

((2400,), (600,))

In [192]:
# Derive the engineered feature frames for the train and test splits (train first).
# NOTE(review): each split is passed to build_features on its own; if columns such
# as income_z / income_decile are computed from the statistics of the frame they
# are given, the test split is scaled independently of train — confirm against
# build_features.
Xtr, Xte = (build_features(split, target=TARGET) for split in (Xtr_raw, Xte_raw))

display(Xtr.head())
print("Missing after features (top 10):")
# Per-column NaN counts, largest first.
na_counts = Xtr.isna().sum()
na_counts.sort_values(ascending=False).head(10)

  grp = out.groupby(["employment_status", "income_band"])["education_level"].transform(_mode_safe)
  grp = out.groupby(["employment_status", "income_band"])["education_level"].transform(_mode_safe)


Unnamed: 0,age,gender,country,city,education_level,employment_status,annual_income_usd,marital_status,children_count,smokes_per_day,...,therapy_history,income_band,log_income,income_z,income_decile,smoke_intensity,drink_intensity,dependents_ratio,quit_effort_smoke_norm,quit_effort_drink_norm
2267,77,male,Turks and Caicos Islands,Porterborough,university,employed,6970,divorced,5,8,...,Current,<25k,8.849514,-1.578753,0,light,very_low,0.000861,0.777778,1.333333
2370,26,other,Israel,Ryanborough,secondary,self-employed,6230,married,1,11,...,Past,<25k,8.737292,-1.591625,0,med,very_low,0.000321,0.166667,1.142857
658,71,male,Latvia,Arellanohaven,college,student,166232,in a relationship,0,15,...,Past,150k+,12.021146,1.191531,8,med,very_low,6e-06,0.375,1.333333
161,46,male,Togo,Olsonhaven,high school,self-employed,184387,widowed,1,9,...,Current,150k+,12.124798,1.507328,9,light,very_low,1.1e-05,0.5,2.0
2835,47,male,Pakistan,Williamhaven,primary,retired,68446,widowed,0,5,...,Current,50–75k,11.133815,-0.509408,3,ultra,very_low,1.5e-05,0.833333,1.0


Missing after features (top 10):


social_support       2400
gender                  0
age                     0
city                    0
education_level         0
employment_status       0
country                 0
marital_status          0
children_count          0
smokes_per_day          0
dtype: int64

In [193]:
# Split the engineered columns into numeric and categorical groups.
num_cols, cat_cols = infer_column_types(Xtr)

pre = make_preprocessor(
    numeric_cols=num_cols,
    categorical_cols=cat_cols,
    encode_categoricals=ENCODE_CATEGORICALS,
)

# Fit on TRAIN only
pre.fit(Xtr)

# Persist for reuse
MODELS_DIR.mkdir(parents=True, exist_ok=True)
save_preprocessor(pre, MODELS_DIR / "preprocessor.joblib")

# Transform both splits
Xtr_mat = pre.transform(Xtr)
Xte_mat = pre.transform(Xte)

# Ensure DataFrame output (sklearn>=1.2) or convert with names
if not isinstance(Xtr_mat, pd.DataFrame):
    feat_names = get_feature_names_after_preprocessor(pre, num_cols, cat_cols)
    # pd.DataFrame cannot be built directly from a SciPy sparse matrix with
    # column names, so densify a sparse transform result before wrapping it.
    if sparse.issparse(Xtr_mat):
        Xtr_mat = Xtr_mat.toarray()
    if sparse.issparse(Xte_mat):
        Xte_mat = Xte_mat.toarray()
    Xtr_mat = pd.DataFrame(Xtr_mat, columns=feat_names, index=Xtr.index)
    Xte_mat = pd.DataFrame(Xte_mat, columns=feat_names, index=Xte.index)

# Sanity checks: both splits must share the same feature columns and be NaN-free.
assert list(Xtr_mat.columns) == list(Xte_mat.columns), (
    "Train/test feature columns differ after preprocessing"
)
assert not Xtr_mat.isna().any().any(), "NaNs remain in the training matrix after preprocessing"
assert not Xte_mat.isna().any().any(), "NaNs remain in the test matrix after preprocessing"
print("✅ No NaNs after preprocessing")
Xtr_mat.shape, Xte_mat.shape




[32m2025-11-07 16:44:57.026[0m | [1mINFO    [0m | [36maddiction.preprocessor[0m:[36msave_preprocessor[0m:[36m239[0m - [1m[preprocessor] saved → /Users/christianfullerton/Developer/Python Workspace/Cigarette-and-Drinking-Data/models/preprocessor.joblib[0m
✅ No NaNs after preprocessing




((2400, 2478), (600, 2478))

In [194]:
def _dense(X):
    """Return ``X.toarray()`` for SciPy sparse input; any other object is returned unchanged."""
    if sparse.issparse(X):
        return X.toarray()
    return X


# Dense versions of the preprocessed train / test matrices.
Xd_tr, Xd_te = _dense(Xtr_mat), _dense(Xte_mat)

In [195]:
# Hyper-parameters for the random-forest model, kept in one mapping so they
# are easy to scan and tweak.
# NOTE(review): an integer max_samples is a row count, not a percentage — each
# tree is fit on 20 bootstrapped rows of the training set. Confirm this is
# intended (a float such as 0.2 would mean 20% of the rows).
rf_params = {
    "n_estimators": 500,
    "max_samples": 20,
    "max_depth": 5,
    "min_samples_leaf": 3,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "class_weight": "balanced",
    "n_jobs": -1,
    "random_state": RANDOM_STATE,
}
clf = RandomForestClassifier(**rf_params)

In [196]:
# Fit the forest on the training matrix and training labels.
clf.fit(Xd_tr, ytr01)

# Hard class predictions for the test split, cast to int.
pred = clf.predict(Xd_te).astype(int)

# Keep column 1 of the predicted class probabilities as the per-row score.
proba_te = clf.predict_proba(Xd_te)
scores = proba_te[:, 1]


In [197]:
# Metrics computed from the hard predictions.
acc = accuracy_score(yte01, pred)
prec, rec, f1 = (
    metric(yte01, pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

# ROC-AUC is computed from the scores rather than the predicted labels.
auc = roc_auc_score(yte01, scores)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(yte01, pred)

In [198]:
# Assemble the text report first and emit it in one call; the confusion
# matrix itself is printed on its own afterwards.
report_lines = [
    "=== RandomForestClassifier ===",
    f"ROC-AUC  : {auc:.6f}",
    f"Accuracy : {acc:.6f}",
    f"Precision: {prec:.6f}",
    f"Recall   : {rec:.6f}",
    f"F1-score : {f1:.6f}",
    "Confusion matrix [ [tn fp]\n                     [fn tp] ]:",
]
print("\n".join(report_lines))
print(cm)

=== RandomForestClassifier ===
ROC-AUC  : 0.476832
Accuracy : 0.496667
Precision: 0.494444
Recall   : 0.597315
F1-score : 0.541033
Confusion matrix [ [tn fp]
                     [fn tp] ]:
[[120 182]
 [120 178]]
