<a href="https://colab.research.google.com/github/bl4ckf0xk/ModelX_First_Order/blob/main/ModelX_Model_XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, recall_score, precision_score
import joblib

In [2]:
!pip install shap
import shap



In [3]:
DATA_PATH = '/content/drive/MyDrive/Dementia Prediction Dataset.csv'
MODEL_OUT = '/content/dementia_nonmedical_model.pkl'
RANDOM_STATE = 42
TARGET = 'DEMENTED'
SUBJECT_ID = 'NACCID'  # change if your identifier column has a different name

In [4]:
def load_data(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Data file not found at {path}. Upload it to Colab or set DATA_PATH correctly.")
    df = pd.read_csv(path)
    return df

In [5]:
NON_MEDICAL_WHITELIST = [
    # Subject demographics
    'BIRTHYR', 'BIRTHMO', 'SEX', 'HISPANIC', 'HISPOR', 'HISPORX',
    'RACE', 'RACEX', 'RACESEC', 'RACESECX', 'RACETER', 'RACETERX',
    'EDUC', 'MARISTAT', 'PRIMLANG', 'PRIMLANX', 'RESIDENC', 'HANDED', 'NACCLIVS', 'INDEPEND',
    # Co-participant
    'INBIRYR', 'INBIRMO', 'INSEX', 'INHISP', 'INHISPOR', 'INHISPOX', 'INRACE', 'INRACEX', 'INRASEC', 'INRASECX', 'INRATER', 'INRATERX', 'INEDUC', 'INRELTO', 'INRELTOX', 'NEWINF',
    # Lifestyle
    'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK', 'ALCOCCAS', 'ALCFREQ',
    # Visit metadata
    'NACCVNUM', 'NACCNVST', 'NACCAVST', 'NACCDAYS', 'NACCFDYS', 'PACKET', 'FORMVER', 'TELCOV', 'TELMOD',
    # Family history (non-genetic fields)
    'NACCFAM', 'NACCMOM', 'NACCDAD'
]

In [6]:
print('Loading data...')
df = load_data(DATA_PATH)
print('Rows:', len(df), 'Columns:', len(df.columns))

Loading data...


  df = pd.read_csv(path)


Rows: 195196 Columns: 1024


In [7]:
# Ensure target and subject id exist
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in dataset.")
if SUBJECT_ID not in df.columns:
    raise KeyError(f"Subject ID column '{SUBJECT_ID}' not found in dataset.")

# Reduce to columns we care about (keep subject id and target)
available_features = [c for c in NON_MEDICAL_WHITELIST if c in df.columns]
print(f'Using {len(available_features)} non-medical features (of {len(NON_MEDICAL_WHITELIST)} whitelist)')

# Warn about missing whitelist columns
missing = set(NON_MEDICAL_WHITELIST) - set(available_features)
if missing:
    print('Warning: The following whitelist columns were not found in your CSV and will be skipped:')
    print(sorted(list(missing)))

keep_cols = [SUBJECT_ID, TARGET] + available_features
df = df[keep_cols].copy()

Using 55 non-medical features (of 55 whitelist)


Handle NACC special codes -> convert common missing codes to NaN

In [8]:
# Typical NACC codes: -4 = Not applicable, -1 or 9 = Unknown, 88/99 = other/missing depending on variable
NA_CODES = [-4, -1, 8, 9, 88, 95, 96, 97, 98, 99, 999]
for v in df.columns:
    if df[v].dtype.kind in 'biufc':
        df[v] = df[v].replace(NA_CODES, np.nan)
    else:
        # for object/string fields, keep as-is and handle missing later
        df[v] = df[v].replace([str(x) for x in NA_CODES], np.nan)

# Also treat target code '9' explicitly as NaN (unknown)
df[TARGET] = df[TARGET].replace(9, np.nan)

# Drop rows where target is missing
df = df[df[TARGET].notna()].copy()
print('After dropping unknown target, rows:', len(df))

After dropping unknown target, rows: 195196


Feature engineering

In [9]:
# AGE if present; compute age-at-visit if possible using BIRTHYR and an approximate VISITYR if available.
# If NACCAGE exists, prefer it. Otherwise derive AGE from BIRTHYR with caveat.
if 'NACCAGE' in df.columns:
    df['AGE'] = df['NACCAGE']
elif 'BIRTHYR' in df.columns and 'NACCVNUM' in df.columns:
    # cheaper approximation: assume visit year unknown; fallback to age buckets via EDUC if needed
    df['AGE'] = np.nan
else:
    df['AGE'] = np.nan

# Pack-years and simple booleans
if 'PACKSPER' in df.columns and 'SMOKYRS' in df.columns:
    df['PACK_YEARS'] = df['PACKSPER'] * df['SMOKYRS']
else:
    df['PACK_YEARS'] = np.nan

if 'TOBAC30' in df.columns and 'TOBAC100' in df.columns:
    df['EVER_SMOKE'] = ((df['TOBAC30'] == 1) | (df['TOBAC100'] == 1)).astype('Int64')
else:
    df['EVER_SMOKE'] = pd.Series([pd.NA] * len(df))

# Heavy alcohol heuristic
if 'ALCFREQ' in df.columns:
    # interpret codes: (user should adjust according to their codebook)
    # treat high frequency codes (e.g., weekly/daily) as heavy
    df['HEAVY_ALCOHOL'] = df['ALCFREQ'].apply(lambda x: 1 if (pd.notna(x) and float(x) >= 5) else 0).astype('Int64')
else:
    df['HEAVY_ALCOHOL'] = pd.Series([pd.NA] * len(df))

# Years in study
if 'NACCDAYS' in df.columns:
    df['YEARS_IN_STUDY'] = df['NACCDAYS'] / 365.25
else:
    df['YEARS_IN_STUDY'] = np.nan

# Lives alone bool from NACCLIVS (if 1 = lives alone in your version)
if 'NACCLIVS' in df.columns:
    df['LIVES_ALONE'] = df['NACCLIVS'].apply(lambda x: 1 if x == 1 else 0).astype('Int64')
else:
    df['LIVES_ALONE'] = pd.Series([pd.NA] * len(df))

# Final feature list (keep engineered features)
engineered = ['AGE', 'PACK_YEARS', 'EVER_SMOKE', 'HEAVY_ALCOHOL', 'YEARS_IN_STUDY', 'LIVES_ALONE']
for e in engineered:
    if e not in df.columns:
        df[e] = np.nan

Final feature set building (numeric vs categorical)

In [10]:
# Start from available_features (from whitelist) and add engineered numeric features
features = available_features.copy()
# Remove potential duplicates if present
for c in engineered:
    if c not in features:
        features.append(c)

# Remove subject id and target if present mistakenly
features = [f for f in features if f not in [SUBJECT_ID, TARGET]]

# Separate numeric and categorical heuristically
numeric_feats = []
categorical_feats = []
for f in features:
    if pd.api.types.is_numeric_dtype(df[f]) or f in engineered:
        numeric_feats.append(f)
    else:
        categorical_feats.append(f)

print('Numeric features:', len(numeric_feats))
print('Categorical features:', len(categorical_feats))

Numeric features: 50
Categorical features: 11


Preprocessing pipeline

In [12]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_feats),
    ("cat", categorical_transformer, categorical_feats)
])

In [14]:
X = df[features].copy()
y = df[TARGET].astype(int)
groups = df[SUBJECT_ID]

# Group-aware train-test split: ensure subjects do not leak across splits
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print('Train subjects:', groups.iloc[train_idx].nunique(), 'Test subjects:', groups.iloc[test_idx].nunique())

Train subjects: 42029 Test subjects: 10508


Model pipeline (XGBoost) + hyperparameter search

In [13]:
# xgb_clf = XGBClassifier(
#     objective='binary:logistic',
#     use_label_encoder=False,
#     eval_metric='auc',
#     random_state=RANDOM_STATE,
#     n_jobs=1
# )

# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('clf', xgb_clf)])

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist'
)

clf = Pipeline([
    ("preprocess", preprocessor),
    ("model", model)
])



In [15]:
clf.fit(X_train, y_train)
print("Accuracy:", clf.score(X_test, y_test))



Accuracy: 0.902682691820072


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = clf.predict(X_test)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Confusion Matrix:
 [[26143  1880]
 [ 1958  9457]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93     28023
           1       0.83      0.83      0.83     11415

    accuracy                           0.90     39438
   macro avg       0.88      0.88      0.88     39438
weighted avg       0.90      0.90      0.90     39438



In [17]:
import joblib
joblib.dump(clf, "dementia_pipeline_xgb.joblib")
print("Model saved as dementia_pipeline_xgb.joblib")

Model saved as dementia_pipeline_xgb.joblib


In [19]:
loaded = joblib.load("dementia_pipeline_xgb.joblib")

# Single prediction example (replace with real row)
example = X_test.iloc[[0]]
pred = loaded.predict(example)
print("\nPrediction for sample row:", pred)

import gc
gc.collect()





Prediction for sample row: [0]


309

In [22]:
import pandas as pd
import numpy as np
import joblib

# Load the full pipeline
clf = joblib.load("dementia_pipeline_xgb.joblib")

# Example user input
user_input = {
    "AGE": 67,
    "EDUCATION": "High school",
    "LIVES_WITH": "Spouse",
    "SMOKING": "No",
    "ALCOHOL": "Occasional",
    "HEART_ATTACK": "No",
    "STROKE": "Yes",
    "EXERCISE": "Sometimes"
}

# Build DataFrame with all pipeline features
X_pred = pd.DataFrame(columns=clf.named_steps['preprocess'].feature_names_in_)
X_pred.loc[0] = np.nan  # initialize row with NaNs

# Fill in user input
for k, v in user_input.items():
    if k in X_pred.columns:
        X_pred.loc[0, k] = v

# Predict
label = clf.predict(X_pred)[0]
risk = clf.predict_proba(X_pred)[:,1][0] * 100

print("Predicted label:", "At risk" if label == 1 else "Not at risk")
print("Estimated dementia risk: {:.2f}%".format(risk))

Predicted label: Not at risk
Estimated dementia risk: 5.37%


