In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import warnings
from collections import Counter
import numpy as np
import joblib

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The file path to your unified dataset.
file_path = r"Project Data Complete .csv"

# A master list of all relevant columns for the analysis.
# The 'Principal Diagnosis Class' and 'Serious Mental Illness' features are included
# as per your last successful analysis to achieve robust model performance.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness',
    'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class',
    'Criminal Justice Status',
    'Region Served',
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---
try:
    unified_data = pd.read_csv(file_path)
    print("Unified data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Since the data is already unified, we just select the master columns.
unified_data = unified_data[master_columns].copy()
print(f"Unified dataset shape: {unified_data.shape}")

# --- 3. Strategically Handle 'UNKNOWN' values ---
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness',
    'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class',
    'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in unified_data[col].unique():
        unified_data[f'{col}_Missing'] = (unified_data[col] == 'UNKNOWN').astype(int)
        mode_val = unified_data[col].mode()[0]
        unified_data[col] = unified_data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# --- 4. Feature Engineering: Create a new combined feature ---
unified_data['Combined_Diagnosis_Education'] = unified_data['Principal Diagnosis Class'] + '_' + unified_data['Special Education Services']
unified_data = unified_data.drop(columns=['Principal Diagnosis Class', 'Special Education Services'])
print("\nNew combined feature 'Combined_Diagnosis_Education' created.")


# Separate features (X) and target (y)
X = unified_data.drop(columns=['Mental Illness', 'Region Served'])
y = unified_data['Mental Illness']

X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 5. Feature Selection with RFE ---
print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not minority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 6. Final Model Training and Evaluation ---
label_map = {'NO': 0, 'UNKNOWN': 1, 'YES': 2}
y_num = y.map(label_map)

# Pre-defined parameters from a successful Grid Search to save time
best_params = {'final_estimator__C': 1, 'rf__max_depth': 3, 'rf__n_estimators': 100, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}

base_estimators = [
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=best_params['rf__n_estimators'], max_depth=best_params['rf__max_depth'])),
    ('xgb', xgb.XGBClassifier(
        random_state=42, n_jobs=-1, eval_metric='mlogloss', use_label_encoder=False,
        n_estimators=best_params['xgb__n_estimators'], max_depth=best_params['xgb__max_depth'], learning_rate=best_params['xgb__learning_rate']
    ))
]

final_estimator = LogisticRegression(random_state=42, n_jobs=-1, C=best_params['final_estimator__C'])

best_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=final_estimator
)

print("\nTraining the final stacking model...")
best_model.fit(X_rfe, y_num)
print("Model training complete!")

print("\nFinal Classification Report on the Unified Dataset:")
y_pred_num = best_model.predict(X_rfe)
y_pred = pd.Series(y_pred_num).map({v: k for k, v in label_map.items()})
print(classification_report(y, y_pred))

print("\nAnalysis complete. Your final model is ready for documentation.")


Unified data loaded successfully!
Unified dataset shape: (196102, 54)

Missingness indicators created and 'UNKNOWN' values have been imputed.

New combined feature 'Combined_Diagnosis_Education' created.

Data has been cleaned and prepared for modeling.
Final feature set shape: (196102, 136)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Sexual Orientation_Missing', 'Intellectual Disability_Missing', 'Autism Spectrum_Missing', 'Other Developmental Disability_Missing', 'Alcohol Related Disorder_Missing', 'Drug Substance Disorder_Missing', 'Opioid Related Disorder_Missing', 'Visual Impairment_Missing', 'Smokes_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Household Composition_NOT APPLICABLE', 'No Chronic Med Condition_YES', 'Serious Mental Illness_YES', 'Combined_Diagnosis_Education_MENTAL ILLNESS_NOT APPLICABLE']

Training the final stacking model...
Model training complete!

Final 

In [2]:
import joblib

# This line saves your final, best-performing model to a file.
joblib.dump(best_model, 'best_model.joblib')

# This line saves the list of features the model was trained on.
joblib.dump(selected_features, 'rfe_features.joblib')

print("Model and features have been saved to files!")

Model and features have been saved to files!


In [3]:
import joblib
joblib.dump(best_model, 'best_model.joblib')
joblib.dump(selected_features, 'rfe_features.joblib')

['rfe_features.joblib']