In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from imblearn.over_sampling import SMOTE
from collections import Counter

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model. This list has been refined based on our analysis.
# We've removed 'Principal Diagnosis Class' and other highly correlated '_UNKNOWN' columns.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # Exit if the file is not found
    exit()

target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]
X_encoded = pd.get_dummies(X, drop_first=True)

# Remove multicollinear columns identified from our analysis
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]

for col in columns_to_drop:
    if col in X_encoded.columns:
        X_encoded = X_encoded.drop(columns=[col])

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 3. Address Class Imbalance with SMOTE ---

print("Applying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 4. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features (Cleaned Model with SMOTE):")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

top_10_features = feature_importances.head(10)
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Cleaned Model with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "feature_importance_plot_cleaned_model_smote.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 7. Visualize New Key Features ---

print("\nGenerating visualizations for new key features...")
visual_df = data.copy()
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')
axes[0].set_ylabel('Count')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')
axes[1].set_ylabel('Count')

plt.tight_layout()
output_file_visuals = "new_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"Visualizations for new key features saved as '{output_file_visuals}'")
plt.close()


Data loaded successfully!

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 93)

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})
Applying SMOTE to the training data to balance the classes...


TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.