<a href="https://colab.research.google.com/github/akashsandeepa11/model-x-dementia-risk-predictor/blob/main/Model_x.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Core libraries for data manipulation
import pandas as pd
import numpy as np

# Libraries for preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Machine Learning Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  # A popular gradient boosting algorithm

# Libraries for evaluation
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


In [3]:
# --- IMPORTANT ---
# Load your dataset here.
# Change 'nacc_dataset.csv' to the path of your actual file.
try:
    df = pd.read_csv('/content/drive/MyDrive/Dementia Prediction Dataset.csv')
    print(f"Dataset loaded successfully. Shape: {df.shape}")
except FileNotFoundError:
    print("Error: Dataset file not found.")
    print("Please update the 'pd.read_csv()' line with your file path.")
    # As a placeholder, I'll create an empty DataFrame to allow subsequent cells to run
    df = pd.DataFrame()

# Display the first 5 rows to understand the data
print(df.head())

Dataset loaded successfully. Shape: (195196, 1024)
       NACCID  NACCADC PACKET  FORMVER  VISITMO  VISITDAY  VISITYR  NACCVNUM  \
0  NACC002909      186      I      3.0       12        28     2022         1   
1  NACC002909      186      F      3.0        1        23     2024         2   
2  NACC003487      186      I      3.0       11        15     2023         1   
3  NACC004352      186      I      3.0       10         5     2021         1   
4  NACC004687      186      I      3.0       11        14     2022         1   

   NACCAVST  NACCNVST  ...  NPATGAM1  NPATGAM2  NPATGAM3  NPATGAM4  NPATGAM5  \
0         2         2  ...        -4        -4        -4        -4        -4   
1         2         2  ...        -4        -4        -4        -4        -4   
2         1         1  ...        -4        -4        -4        -4        -4   
3         1         1  ...        -4        -4        -4        -4        -4   
4         1         1  ...        -4        -4        -4        -4  

In [7]:
if not df.empty:
    # 1. DEFINE YOUR TARGET VARIABLE
    TARGET_VARIABLE = 'DEMENTED'

    # 2. DEFINE THE MASTER LIST of all possible non-medical features
    ALL_POSSIBLE_FEATURES = [
        # Form A1: Subject Demographics
        'NACCAGE', 'SEX', 'EDUC', 'MARISTAT', 'NACCLIVS', 'RESIDENC', 'HANDED',
        'HISPANIC', 'RACE', 'RACESEC', 'RACETER', 'PRIMLANG', 'INDEPEND',

        # Form A2: Co-participant Demographics
        'INRELTO', 'INLIVWTH', 'INVISITS', 'INCALLS',

        # Form A3: Subject Family History
        'NACCFAM', 'NACCMOM', 'NACCDAD',

        # Form A4: Subject Medications
        'ANYMEDS',

        # Form A5: Subject Health History
        'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK', 'ALCOCCAS',
        'ALCFREQ', 'ALCOHOL', 'ABUSOTHR', 'CVHATT', 'CVAFIB', 'CVANGIO',
        'CVBYPASS', 'CVPACDEF', 'CVPACE', 'CVCHF', 'CVANGINA', 'CVHVALVE',
        'CVOTHR', 'HYPERTEN', 'HYPERCHO', 'CBSTROKE', 'NACCSTYR', 'CBTIA',
        'NACCTIYR', 'PD', 'PDYR', 'SEIZURES', 'NACCTBI', 'DIABETES', 'DIABTYPE',
        'B12DEF', 'THYROID', 'ARTHRIT', 'APNEA', 'RBD', 'INSOMN', 'OTHSLEEP',
        'PTSD', 'BIPOLAR', 'SCHIZ', 'DEP2YRS', 'DEPOTHR', 'ANXIETY', 'OCD',
        'INCONTU', 'INCONTF',

        # Form B1: Physical
        'HEIGHT', 'WEIGHT', 'NACCBMI', 'VISION', 'VISCORR', 'VISWCORR',
        'HEARING', 'HEARAID', 'HEARWAID',

        # Form B9: Self-Reported Decline
        'DECSUB', 'DECIN',

        # Form B7: Functional Activities
        'BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP',
        'EVENTS', 'PAYATTN', 'REMDATES', 'TRAVEL',

        # Milestones Form
        'NACCNURP',

        # Form CLS: Linguistic History (These are the ones causing the error)
        'APREFLAN', 'AYRSPAN', 'AYRENGL', 'APCSPAN', 'APCENGL',
        'NACCSPNL', 'NACCENGL'
    ]

    # 3. CRITICAL STEP: Filter the list to only include features in your CSV
    NON_MEDICAL_FEATURES = [col for col in ALL_POSSIBLE_FEATURES if col in df.columns]

    # Find and report any missing features (for your information)
    missing_from_csv = [col for col in ALL_POSSIBLE_FEATURES if col not in df.columns]

    print(f"Target variable set to: {TARGET_VARIABLE}")
    print(f"Found {len(NON_MEDICAL_FEATURES)} available non-medical features in your file.")

    if missing_from_csv:
        print("\nNote: The following features from the data dictionary were not found in your file and will be skipped:")
        print(missing_from_csv)

else:
    print("DataFrame is empty. Please load data in Cell 2.")

Target variable set to: DEMENTED
Found 90 available non-medical features in your file.

Note: The following features from the data dictionary were not found in your file and will be skipped:
['APREFLAN', 'AYRSPAN', 'AYRENGL', 'APCSPAN', 'APCENGL', 'NACCSPNL', 'NACCENGL']


In [8]:
if not df.empty:
    # 1. --- Select X and y ---

    # Drop rows where the target variable is missing
    df_clean = df.dropna(subset=[TARGET_VARIABLE])

    # Use our new *filtered* list of features
    X = df_clean[NON_MEDICAL_FEATURES]
    y = df_clean[TARGET_VARIABLE].astype(int)

    # 2. --- Data Cleaning (Best Practice) ---

    # Define all codes from the Data Dictionary that mean "Unknown" or "Not Assessed"
    MISSING_CODES = [
        -4, 8, 9, 88, 99, 888, 999, 8888, 9999,
        95, 96, 97, 98, 995, 996, 997, 998
    ]

    print(f"Original missing values in X: {X.isna().sum().sum()}")

    # Replace all special missing codes with np.nan
    X = X.replace(MISSING_CODES, np.nan)

    print(f"Total missing values after cleaning (now np.nan): {X.isna().sum().sum()}")

    # 3. --- Define Preprocessing Pipelines (Now fully dynamic) ---

    # Define the *master list* of known numeric features
    ALL_POSSIBLE_NUMERIC_FEATURES = [
        'NACCAGE', 'EDUC', 'SMOKYRS', 'PACKSPER', 'QUITSMOK', 'NACCSTYR',
        'NACCTIYR', 'PDYR', 'HEIGHT', 'WEIGHT', 'NACCBMI', 'AYRSPAN',
        'AYRENGL', 'APCSPAN', 'APCENGL'
    ]

    # Filter NUMERIC_FEATURES to only those in our available NON_MEDICAL_FEATURES
    NUMERIC_FEATURES = [col for col in NON_MEDICAL_FEATURES if col in ALL_POSSIBLE_NUMERIC_FEATURES]

    # CATEGORICAL_FEATURES is everything else that's left
    CATEGORICAL_FEATURES = [col for col in NON_MEDICAL_FEATURES if col not in NUMERIC_FEATURES]

    print(f"\nIdentified {len(NUMERIC_FEATURES)} numeric features.")
    print(f"Identified {len(CATEGORICAL_FEATURES)} categorical features.")

    # Pipeline for numerical data:
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Pipeline for categorical data:
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine both pipelines into a single preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, NUMERIC_FEATURES),
            ('cat', categorical_transformer, CATEGORICAL_FEATURES)
        ],
        remainder='passthrough'
    )

    print("Preprocessing pipelines defined successfully.")

    # 4. --- Split the Data ---

    # Split the cleaned data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,    # 20% for testing
        random_state=42,  # For reproducible results
        stratify=y        # Critical for classification
    )

    print(f"\nData split into training and testing sets.")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")

else:
    print("DataFrame is empty. Please load data in Cell 2.")

Original missing values in X: 970
Total missing values after cleaning (now np.nan): 6850211

Identified 11 numeric features.
Identified 79 categorical features.
Preprocessing pipelines defined successfully.

Data split into training and testing sets.
X_train shape: (156156, 90)
X_test shape: (39040, 90)


In [9]:
# Logistic Regression (Baseline)

if not df.empty:
    # Create the full model pipeline
    lr_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000))
    ])

    # --- Train the model ---
    print("Training Logistic Regression model...")
    lr_pipeline.fit(X_train, y_train)

    # --- Evaluate the model ---
    print("\n--- Logistic Regression Evaluation ---")
    y_pred_lr = lr_pipeline.predict(X_test)
    y_proba_lr = lr_pipeline.predict_proba(X_test)[:, 1] # Probability of "1" (Dementia)

    print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba_lr):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_lr, target_names=["Not at risk (0)", "At risk (1)"]))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_lr))

else:
    print("DataFrame is empty. Please load data in Cell 2.")

Training Logistic Regression model...

--- Logistic Regression Evaluation ---
Accuracy: 0.9287
AUC Score: 0.9752

Classification Report:
                 precision    recall  f1-score   support

Not at risk (0)       0.94      0.96      0.95     27522
    At risk (1)       0.90      0.86      0.88     11518

       accuracy                           0.93     39040
      macro avg       0.92      0.91      0.91     39040
   weighted avg       0.93      0.93      0.93     39040


Confusion Matrix:
[[26389  1133]
 [ 1651  9867]]


In [10]:
# Random Forest Classifier

if not df.empty:
    # Create the full model pipeline
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
    ])

    # --- Train the model ---
    print("Training Random Forest model...")
    rf_pipeline.fit(X_train, y_train)

    # --- Evaluate the model ---
    print("\n--- Random Forest Evaluation ---")
    y_pred_rf = rf_pipeline.predict(X_test)
    y_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1] # Probability of "1" (Dementia)

    print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba_rf):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_rf, target_names=["Not at risk (0)", "At risk (1)"]))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_rf))

else:
    print("DataFrame is empty. Please load data in Cell 2.")

Training Random Forest model...

--- Random Forest Evaluation ---
Accuracy: 0.9332
AUC Score: 0.9783

Classification Report:
                 precision    recall  f1-score   support

Not at risk (0)       0.95      0.96      0.95     27522
    At risk (1)       0.89      0.88      0.89     11518

       accuracy                           0.93     39040
      macro avg       0.92      0.92      0.92     39040
   weighted avg       0.93      0.93      0.93     39040


Confusion Matrix:
[[26329  1193]
 [ 1415 10103]]


In [11]:
# XGBoost Classifier (Gradient Boosting)

if not df.empty:
    # Create the full model pipeline
    xgb_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
    ])

    # --- Train the model ---
    print("Training XGBoost model...")
    xgb_pipeline.fit(X_train, y_train)

    # --- Evaluate the model ---
    print("\n--- XGBoost Evaluation ---")
    y_pred_xgb = xgb_pipeline.predict(X_test)
    y_proba_xgb = xgb_pipeline.predict_proba(X_test)[:, 1] # Probability of "1" (Dementia)

    print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba_xgb):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_xgb, target_names=["Not at risk (0)", "At risk (1)"]))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred_xgb))

else:
    print("DataFrame is empty. Please load data in Cell 2.")

Training XGBoost model...

--- XGBoost Evaluation ---
Accuracy: 0.9345
AUC Score: 0.9790

Classification Report:
                 precision    recall  f1-score   support

Not at risk (0)       0.95      0.96      0.95     27522
    At risk (1)       0.90      0.88      0.89     11518

       accuracy                           0.93     39040
      macro avg       0.92      0.92      0.92     39040
   weighted avg       0.93      0.93      0.93     39040


Confusion Matrix:
[[26355  1167]
 [ 1390 10128]]


In [12]:
if not df.empty:
    print("--- Model Performance Summary ---")

    # Create a simple DataFrame for comparison
    results = {
        'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
        'Accuracy': [
            accuracy_score(y_test, y_pred_lr),
            accuracy_score(y_test, y_pred_rf),
            accuracy_score(y_test, y_pred_xgb)
        ],
        'AUC Score': [
            roc_auc_score(y_test, y_proba_lr),
            roc_auc_score(y_test, y_proba_rf),
            roc_auc_score(y_test, y_proba_xgb)
        ],
        'F1-Score (At risk)': [
            f1_score(y_test, y_pred_lr, pos_label=1),
            f1_score(y_test, y_pred_rf, pos_label=1),
            f1_score(y_test, y_pred_xgb, pos_label=1)
        ]
    }

    results_df = pd.DataFrame(results)
    print(results_df.to_markdown(index=False, floatfmt=".4f"))

    print("\n\n--- Next Steps (Hackathon Requirements) ---")
    print("1.  **Hyperparameter Tuning:** Use GridSearchCV or RandomizedSearchCV on your best-performing model (e.g., Random Forest or XGBoost) to find better settings[cite: 1421].")
    print("2.  **Feature Engineering:** Try creating new features. For example, 'Age x Education' or 'BMI_Category' from NACCBMI[cite: 1421].")
    print("3.  **Explainability:** For your report, get the 'feature_importances_' from the trained Random Forest or XGBoost model to see which non-medical factors were most predictive[cite: 1421].")

else:
    print("DataFrame is empty. Please load data in Cell 2.")

--- Model Performance Summary ---
| Model               |   Accuracy |   AUC Score |   F1-Score (At risk) |
|:--------------------|-----------:|------------:|---------------------:|
| Logistic Regression |     0.9287 |      0.9752 |               0.8764 |
| Random Forest       |     0.9332 |      0.9783 |               0.8857 |
| XGBoost             |     0.9345 |      0.9790 |               0.8879 |


--- Next Steps (Hackathon Requirements) ---
1.  **Hyperparameter Tuning:** Use GridSearchCV or RandomizedSearchCV on your best-performing model (e.g., Random Forest or XGBoost) to find better settings[cite: 1421].
2.  **Feature Engineering:** Try creating new features. For example, 'Age x Education' or 'BMI_Category' from NACCBMI[cite: 1421].
3.  **Explainability:** For your report, get the 'feature_importances_' from the trained Random Forest or XGBoost model to see which non-medical factors were most predictive[cite: 1421].
