In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        # Create a new binary column to indicate if the value was originally 'UNKNOWN'
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        
        # Perform mode imputation on the original column
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 5. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 6. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 7. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 8. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot')
output_file_shap = "combined_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Transgender_Missing', 'Education Status_Missing', 'Intellectual Disability_Missing', 'Autism Spectrum_Missing', 'Other Developmental Disability_Missing', 'Alcohol Related Disorder_Missing', 'Drug Substance Disorder_Missing', 'Serious Mental Illness_Missing', 'Drug/Substance 12m Service_Missing', 'Principal Diagnosis Class_Missing', 'Race_WHITE ONLY', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'No Chronic Med Condition_YES', 'Serious Mental Illness_YES']

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Training the final BalancedRandomForestClassif

In [2]:
import pandas as pd
import warnings

# Suppress potential warnings from pandas and other libraries
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Segmentation Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# --- 3. Segment the data by Region Served ---

print("\nSegmenting data by Region Served...")

# List of all unique regions to create a dictionary of dataframes
regions = data['Region Served'].unique()
regional_dataframes = {}

for region in regions:
    # Filter the data for each region
    regional_dataframes[region] = data[data['Region Served'] == region].copy()
    
    # Drop the 'Region Served' column from the new dataframe as it's no longer needed
    if 'Region Served' in regional_dataframes[region].columns:
        regional_dataframes[region].drop(columns=['Region Served'], inplace=True)

# Print the shape of each new dataframe to confirm success
print("\nData segmentation complete. The shape of each regional dataframe is:")
for region, df in regional_dataframes.items():
    print(f"  - {region}: {df.shape}")

# You can now access each dataframe using its key, e.g., regional_dataframes['New York City Region']
# This output sets us up for the next phase of analysis for each region.


Data loaded successfully!

Segmenting data by Region Served...

Data segmentation complete. The shape of each regional dataframe is:
  - WESTERN REGION: (17813, 76)
  - LONG ISLAND REGION: (7582, 76)
  - HUDSON RIVER REGION: (13405, 76)
  - NEW YORK CITY REGION: (49337, 76)
  - CENTRAL NY REGION: (11107, 76)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served', # We need this to filter the data
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Filter for New York City Region ---

print("\nFiltering data for 'New York City Region'...")
data = data[data['Region Served'] == 'NEW YORK CITY REGION'].copy()
data.drop(columns=['Region Served'], inplace=True)
print(f"New York City Region data shape: {data.shape}")


# --- 4. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 5. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 6. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 7. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 8. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot for New York City Region')
output_file_shap = "nyc_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Filtering data for 'New York City Region'...
New York City Region data shape: (49337, 53)

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (49337, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Living Situation_Missing', 'Education Status_Missing', 'Special Education Services_Missing', 'Intellectual Disability_Missing', 'Other Developmental Disability_Missing', 'Alcohol Related Disorder_Missing', 'Drug Substance Disorder_Missing', 'Cannabis Recreational Use_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Hispanic Ethnicity_YES, HISPANIC/LATINO', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'No Chronic Med Condition_YES', 'Serious Mental Illness_YES']

Original class 

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served', # We need this to filter the data
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Filter for Western Region ---

print("\nFiltering data for 'Western Region'...")
data = data[data['Region Served'] == 'WESTERN REGION'].copy()
data.drop(columns=['Region Served'], inplace=True)
print(f"Western Region data shape: {data.shape}")


# --- 4. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 5. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 6. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 7. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 8. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot for Western Region')
output_file_shap = "western_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Filtering data for 'Western Region'...
Western Region data shape: (17813, 53)

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (17813, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Sexual Orientation_Missing', 'Living Situation_Missing', 'Intellectual Disability_Missing', 'Autism Spectrum_Missing', 'Alcohol Related Disorder_Missing', 'Hearing Impairment_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Race_WHITE ONLY', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'Number Of Hours Worked Each Week_NOT APPLICABLE', 'Education Status_MIDDLE SCHOOL TO HIGH SCHOOL', 'No Chronic Med Condition_YES', 'Serious Mental Illness_YES']

Original class distribution in training data

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served', # We need this to filter the data
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Filter for Central NY Region ---

print("\nFiltering data for 'Central NY Region'...")
data = data[data['Region Served'] == 'CENTRAL NY REGION'].copy()
data.drop(columns=['Region Served'], inplace=True)
print(f"Central NY Region data shape: {data.shape}")


# --- 4. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 5. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 6. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 7. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 8. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot for Central NY Region')
output_file_shap = "central_ny_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Filtering data for 'Central NY Region'...
Central NY Region data shape: (11107, 53)

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (11107, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Transgender_Missing', 'Intellectual Disability_Missing', 'Autism Spectrum_Missing', 'Drug Substance Disorder_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Race_WHITE ONLY', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Religious Preference_I CONSIDER MYSELF SPIRITUAL, BUT NOT RELIGIOUS', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'Education Status_MIDDLE SCHOOL TO HIGH SCHOOL', 'Special Education Services_NOT APPLICABLE', 'No Chronic Med Condition_YES', 'Smokes_YES', 'Serious Mental Illness_YES']

Original class distributi

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served', # We need this to filter the data
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Filter for Hudson River Region ---

print("\nFiltering data for 'Hudson River Region'...")
data = data[data['Region Served'] == 'HUDSON RIVER REGION'].copy()
data.drop(columns=['Region Served'], inplace=True)
print(f"Hudson River Region data shape: {data.shape}")


# --- 4. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 5. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 6. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 7. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 8. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 9. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot for Hudson River Region')
output_file_shap = "hudson_river_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Filtering data for 'Hudson River Region'...
Hudson River Region data shape: (13405, 53)

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (13405, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Transgender_Missing', 'Sexual Orientation_Missing', 'Intellectual Disability_Missing', 'Autism Spectrum_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Hispanic Ethnicity_YES, HISPANIC/LATINO', 'Race_WHITE ONLY', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Religious Preference_I CONSIDER MYSELF SPIRITUAL, BUT NOT RELIGIOUS', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'Education Status_MIDDLE SCHOOL TO HIGH SCHOOL', 'Special Education Services_NOT APPLICABLE', 'No Chronic Med Condition_YES', 'Serious Mental Illness_YES'

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served', # We need this to filter the data
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Filter for Long Island Region ---

print("\nFiltering data for 'Long Island Region'...")
data = data[data['Region Served'] == 'LONG ISLAND REGION'].copy()
data.drop(columns=['Region Served'], inplace=True)
print(f"Long Island Region data shape: {data.shape}")


# --- 4. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 5. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 6. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 7. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 8. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 9. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot for Long Island Region')
output_file_shap = "long_island_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Filtering data for 'Long Island Region'...
Long Island Region data shape: (7582, 53)

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (7582, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Education Status_Missing', 'Drug Substance Disorder_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Hispanic Ethnicity_YES, HISPANIC/LATINO', 'Race_OTHER', 'Race_WHITE ONLY', 'Living Situation_OTHER LIVING SITUATION', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'Education Status_MIDDLE SCHOOL TO HIGH SCHOOL', 'Special Education Services_NOT APPLICABLE', 'No Chronic Med Condition_YES', 'Smokes_YES', 'Serious Mental Illness_YES']

Original class distribution in training data: Counter

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The combined file path for all data.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served', # We need this to filter the data
    'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the master list of columns
data = data[master_columns]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Filter for Long Island Region ---

print("\nFiltering data for 'Long Island Region'...")
data = data[data['Region Served'] == 'LONG ISLAND REGION'].copy()
data.drop(columns=['Region Served'], inplace=True)
print(f"Long Island Region data shape: {data.shape}")


# --- 4. Strategically Handle 'UNKNOWN' values ---
# Create missingness indicators and then impute with the mode.
cols_with_unknown = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

for col in cols_with_unknown:
    if 'UNKNOWN' in data[col].unique():
        data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
        mode_val = data[col].mode()[0]
        data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert all categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 5. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 15 features...")
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=15, step=1)
rfe.fit(X_encoded, y)

selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

X_rfe = X_encoded[selected_features]

# --- 6. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 7. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 8. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 9. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

label_names = ['NO', 'UNKNOWN', 'YES']
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, class_names=label_names)
plt.title('SHAP Feature Importance Plot for Long Island Region')
output_file_shap = "long_island_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Filtering data for 'Long Island Region'...
Long Island Region data shape: (7582, 53)

Missingness indicators created and 'UNKNOWN' values have been imputed.

Data has been cleaned and prepared for modeling.
Final feature set shape: (7582, 128)

Performing Recursive Feature Elimination (RFE) to select top 15 features...
RFE selected the following features: ['Education Status_Missing', 'Drug Substance Disorder_Missing', 'Serious Mental Illness_Missing', 'Principal Diagnosis Class_Missing', 'Hispanic Ethnicity_YES, HISPANIC/LATINO', 'Race_OTHER', 'Race_WHITE ONLY', 'Living Situation_OTHER LIVING SITUATION', 'Religious Preference_I BELONG TO A FORMAL RELIGIOUS GROUP', 'Employment Status_NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK', 'Education Status_MIDDLE SCHOOL TO HIGH SCHOOL', 'Special Education Services_NOT APPLICABLE', 'No Chronic Med Condition_YES', 'Smokes_YES', 'Serious Mental Illness_YES']

Original class distribution in training data: Counter