In [1]:
# --- 1. Import Libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib

warnings.filterwarnings('ignore')

# --- 2. Load Data ---
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

try:
    data = pd.read_csv(file_path)
    print(" Data loaded successfully!")
except FileNotFoundError:
    print(f" Error: File not found at {file_path}")
    raise
# --- 3. Define Columns ---
target_column = 'Mental Illness'
columns_to_use = [  # your full list of features including target
    # [same list as before, including 'Mental Illness']
]

data.dropna(subset=[target_column], inplace=True)
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]
X_encoded = pd.get_dummies(X, drop_first=True)

# --- 4. Drop Multicollinear Columns ---
columns_to_drop = [
    # [same list of '_UNKNOWN' columns as before]
]
X_encoded.drop(columns=[col for col in columns_to_drop if col in X_encoded.columns], inplace=True)

print(f" Final feature set shape: {X_encoded.shape}")

# --- 5. Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print(" Original class distribution:", Counter(y_train))

# --- 6. Apply SMOTE ---
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print(" Resampled class distribution:", Counter(y_train_res))

# --- 7. Train Model ---
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_res, y_train_res)
print(" Model training complete!")

# Save model
joblib.dump(model, "random_forest_model.pkl")

# --- 8. Evaluate Model ---
y_pred = model.predict(X_test)
print(f" Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(" Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.tight_layout()
plt.savefig("confusion_matrix.png")
plt.close()

# --- 9. Feature Importance ---
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print(" Top 10 Features:\n", feature_importances.head(10))

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title("Top 10 Most Important Features")
plt.xlabel("Importance Score")
plt.tight_layout()
plt.savefig("feature_importance_plot.png")
plt.close()

# --- 10. Additional Visualizations ---
visual_df = data.copy()
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')

plt.tight_layout()
plt.savefig("new_feature_visualizations.png")
plt.close()


 Data loaded successfully!


KeyError: "['Mental Illness'] not found in axis"

In [None]:
# --- 1. Import Libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib

warnings.filterwarnings('ignore')

# --- 2. Load Data ---
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

try:
    data = pd.read_csv(file_path)
    print("✅ Data loaded successfully!")
except FileNotFoundError:
    print(f"❌ Error: File not found at {file_path}")
    raise

# --- 3. Define Columns ---
target_column = 'Mental Illness'
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 4. Prepare Data ---
data.dropna(subset=[target_column], inplace=True)
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]
X_encoded = pd.get_dummies(X, drop_first=True)

# --- 5. Drop Multicollinear Columns ---
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]
X_encoded.drop(columns=[col for col in columns_to_drop if col in X_encoded.columns], inplace=True)

print(f"✅ Final feature set shape: {X_encoded.shape}")

# --- 6. Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("📊 Original class distribution:", Counter(y_train))

# --- 7. Apply SMOTE ---
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("📊 Resampled class distribution:", Counter(y_train_res))

# --- 8. Train Model ---
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_res, y_train_res)
print("✅ Model training complete!")

# Save model
joblib.dump(model, "random_forest_model.pkl")

# --- 9. Evaluate Model ---
y_pred = model.predict(X_test)
print(f"🎯 Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("📋 Classification Report:\n", classification_report(y_test, y_pred))

#

In [None]:
# 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib
import warnings

warnings.filterwarnings('ignore')

# 2. Load Data
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    raise

# 3. Define Columns
target_column = 'Mental Illness'
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# 4. Prepare Data
data.dropna(subset=[target_column], inplace=True)
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]
X_encoded = pd.get_dummies(X, drop_first=True)

# 5. Drop Multicollinear Columns
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]
X_encoded.drop(columns=[col for col in columns_to_drop if col in X_encoded.columns], inplace=True)

print(f"Final feature set shape: {X_encoded.shape}")

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("Original class distribution:", Counter(y_train))

# 7. Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_train_res))

# 8. Train Model
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_res, y_train_res)
print("Model training complete.")

# Save model
joblib.dump(model, "random_forest_model.pkl")

# 9. Evaluate Model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.tight_layout()
plt.savefig("confusion_matrix.png")
plt.close()

# 10. Feature Importance
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("Top 10 Features:\n", feature_importances.head(10))

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title("Top 10 Most Important Features")
plt.xlabel("Importance Score")
plt.tight_layout()
plt.savefig("feature_importance_plot.png")
plt.close()

# 11. Additional Visualizations
visual_df = data.copy()
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')

plt.tight_layout()
plt.savefig("new_feature_visualizations.png")
plt.close()


In [None]:
# Ensure all features are numeric
X_train = X_train.astype(float)
X_test = X_test.astype(float)


In [None]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [None]:
print(X_train.dtypes.value_counts())


In [None]:
print(y_train.dtype)
print(y_train.unique())


In [None]:
# Convert target labels to numeric
y_train = y_train.map({'YES': 1, 'NO': 0}).dropna()
X_train = X_train.loc[y_train.index]  # align X with filtered y


In [None]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("Resampled class distribution:", Counter(y_train_res))


In [None]:
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_res, y_train_res)


In [None]:
# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()


In [None]:
# Convert y_test to numeric
y_test = y_test.map({'YES': 1, 'NO': 0}).dropna()
X_test = X_test.loc[y_test.index]  # align X_test with filtered y_test


In [None]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'  #  This is the key change
)

model.fit(X_train_res, y_train_res)


In [None]:
class_weight='balanced'


In [None]:
y_pred = model.predict(X_test)


In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()


In [None]:
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns)


In [None]:
# Sort and display top 10 features
top_features = feature_importances.sort_values(ascending=False).head(10)
print(top_features)

# Plot them
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
top_features.sort_values().plot(kind='barh')
plt.title("Top 10 Most Important Features")
plt.xlabel("Importance Score")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Drop rows with missing target values
df = df[df['Mental Illness'].isin(['YES', 'NO'])]

# Convert target to numeric
df['Mental Illness'] = df['Mental Illness'].map({'YES': 1, 'NO': 0})

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df)

# Drop multicollinear _UNKNOWN columns
columns_to_drop = [col for col in df_encoded.columns if '_UNKNOWN' in col]
df_encoded.drop(columns=columns_to_drop, inplace=True)

# Separate features and target
X = df_encoded.drop('Mental Illness', axis=1)
y = df_encoded['Mental Illness']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=42)
brf.fit(X_resampled, y_resampled)

# Predict on test set
y_pred = brf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.2%}")
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model. This list has been refined based on our analysis.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]
X_encoded = pd.get_dummies(X, drop_first=True)

# Remove multicollinear columns identified from our analysis
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]

for col in columns_to_drop:
    if col in X_encoded.columns:
        X_encoded = X_encoded.drop(columns=[col])

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 3. Run a baseline model to get feature importances ---
print("\nRunning a baseline model to identify top features...")
baseline_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
baseline_model.fit(X_train, y_train)
feature_importances = pd.Series(baseline_model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Select top 20 features to reduce the data size for SMOTE
top_features = feature_importances.head(20).index.tolist()
print(f"Selected top {len(top_features)} features for the final model.")
print("Top features are:", top_features)

# Filter the data to include only the top features
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# --- 4. Address Class Imbalance with SMOTE on a smaller dataset ---

print("\nApplying SMOTE to the training data with top features to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_top, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 5. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test_top)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
final_feature_importances = pd.Series(model.feature_importances_, index=X_train_top.columns).sort_values(ascending=False)
print("\nTop 10 most important features (Cleaned Model with SMOTE):")
print(final_feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
final_feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Cleaned Model with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "feature_importance_plot_cleaned_model_smote.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 8. Visualize New Key Features ---

print("\nGenerating visualizations for new key features...")
visual_df = data.copy()
# Create one-hot encoded columns for visualization
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')
axes[0].set_ylabel('Count')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')
axes[1].set_ylabel('Count')

plt.tight_layout()
output_file_visuals = "new_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"Visualizations for new key features saved as '{output_file_visuals}'")
plt.close()


In [None]:
# --- 4. Address Class Imbalance with SMOTE on a smaller dataset ---

print("\nApplying SMOTE to the training data with top features to balance the classes...")

# Ensure all features are numeric (convert bool to int)
X_train_top = X_train_top.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_top, y_train)

print("New class distribution in training data:", Counter(y_train_res))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
# We are not filtering for multicollinearity at this stage, as requested.
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 3. Address Class Imbalance with SMOTE ---

print("\nApplying SMOTE to the training data to balance the classes...")
# We will oversample the minority classes to a smaller, more manageable size
target_counts = Counter(y_train)
# Set the sampling strategy to oversample minority classes to a more balanced number
sampling_strategy = {
    'NO': int(target_counts['NO'] * 1.5),
    'UNKNOWN': int(target_counts['UNKNOWN'] * 1.5),
    'YES': target_counts['YES']
}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 4. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features for Predicting Mental Illness (with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "social_feature_importance_plot.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# 2. Clean and Prepare
data = data[columns_to_use]
data = data[data['Mental Illness'].isin(['YES', 'NO'])]  # Remove rare/unknown classes
target_column = 'Mental Illness'
X = data.drop(columns=[target_column])
y = data[target_column]

X_encoded = pd.get_dummies(X, drop_first=True)
print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=42, stratify=y
)
print("\nOriginal class distribution in training data:", Counter(y_train))

# 4. Apply SMOTE
print("\nApplying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# 5. Train Model
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# 6. Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# 7. Feature Importances
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))


In [None]:
# Convert boolean columns to integers before SMOTE
X_train = X_train.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

# Apply SMOTE
print("\nApplying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings

warnings.filterwarnings('ignore')

# 1. Load Data
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# 2. Clean and Prepare
data = data[columns_to_use]
data.dropna(subset=['Mental Illness'], inplace=True)
data = data[data['Mental Illness'].isin(['YES', 'NO'])]

X = data.drop(columns=['Mental Illness'])
y = data['Mental Illness']

X_encoded = pd.get_dummies(X, drop_first=True)
X_encoded = X_encoded.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=42, stratify=y
)
print("Original class distribution in training data:", Counter(y_train))

# 4. Apply SMOTE
print("\nApplying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# 5. Train Model
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# 6. Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 7. Feature Importances
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# The rest of the code for SMOTE, model training, and visualizations will go in the next steps.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# The rest of the code for SMOTE, model training, and visualizations will go in the next steps.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Address Class Imbalance with SMOTE ---

print("\nApplying SMOTE to the training data to balance the classes...")
# We will oversample the minority classes to a smaller, more manageable size
target_counts = Counter(y_train)
# Set the sampling strategy to oversample minority classes to a more balanced number
sampling_strategy = {
    'NO': int(target_counts['NO'] * 1.5),
    'UNKNOWN': int(target_counts['UNKNOWN'] * 1.5),
    'YES': target_counts['YES']
}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 5. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features for Predicting Mental Illness (with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "social_feature_importance_plot.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 8. Generate new visualizations for key features ---
print("\nGenerating new visualizations for key features...")

# Create a figure with a subplot for each plot
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Relationship Between Top Social Features and Mental Illness', fontsize=18)
axes = axes.flatten()

# Plot 1: Special Education Services vs. Mental Illness
sns.countplot(data=data, x='Special Education Services', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Special Education Services')
axes[0].set_xlabel('Special Education Services')
axes[0].set_ylabel('Count')

# Plot 2: Household Composition vs. Mental Illness
sns.countplot(data=data, x='Household Composition', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Household Composition')
axes[1].set_xlabel('Household Composition')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

# Plot 3: Race vs. Mental Illness
sns.countplot(data=data, x='Race', hue=target_column, ax=axes[2])
axes[2].set_title('Mental Illness by Race')
axes[2].set_xlabel('Race')
axes[2].set_ylabel('Count')
axes[2].tick_params(axis='x', rotation=45)

# Plot 4: Religious Preference vs. Mental Illness
sns.countplot(data=data, x='Religious Preference', hue=target_column, ax=axes[3])
axes[3].set_title('Mental Illness by Religious Preference')
axes[3].set_xlabel('Religious Preference')
axes[3].set_ylabel('Count')
axes[3].tick_params(axis='x', rotation=45)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
output_file_visuals = "social_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"New visualizations for key social features saved as '{output_file_visuals}'")
plt.close()


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Address Class Imbalance with SMOTE ---

print("\nApplying SMOTE to the training data to balance the classes...")
# We will oversample the minority classes to a smaller, more manageable size
target_counts = Counter(y_train)
# Set the sampling strategy to oversample minority classes to a more balanced number
sampling_strategy = {
    'NO': int(target_counts['NO'] * 1.5),
    'UNKNOWN': int(target_counts['UNKNOWN'] * 1.5),
    'YES': target_counts['YES']
}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 5. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features for Predicting Mental Illness (with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "social_feature_importance_plot.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 8. Generate new visualizations for key features ---
print("\nGenerating new visualizations for key features...")

# Create a figure with a subplot for each plot
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Relationship Between Top Social Features and Mental Illness', fontsize=18)
axes = axes.flatten()

# Plot 1: Special Education Services vs. Mental Illness
sns.countplot(data=data, x='Special Education Services', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Special Education Services')
axes[0].set_xlabel('Special Education Services')
axes[0].set_ylabel('Count')

# Plot 2: Household Composition vs. Mental Illness
sns.countplot(data=data, x='Household Composition', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Household Composition')
axes[1].set_xlabel('Household Composition')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

# Plot 3: Race vs. Mental Illness
sns.countplot(data=data, x='Race', hue=target_column, ax=axes[2])
axes[2].set_title('Mental Illness by Race')
axes[2].set_xlabel('Race')
axes[2].set_ylabel('Count')
axes[2].tick_params(axis='x', rotation=45)

# Plot 4: Religious Preference vs. Mental Illness
sns.countplot(data=data, x='Religious Preference', hue=target_column, ax=axes[3])
axes[3].set_title('Mental Illness by Religious Preference')
axes[3].set_xlabel('Religious Preference')
axes[3].set_ylabel('Count')
axes[3].tick_params(axis='x', rotation=45)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
output_file_visuals = "social_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"New visualizations for key social features saved as '{output_file_visuals}'")
plt.close()


Data loaded successfully!

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 48)

Checking for multicollinearity...

Correlation matrix heatmap plot saved as 'social_correlation_matrix.png'

Features with multicollinearity (correlation > 0.8):
['Household Composition_NOT APPLICABLE']

Removed multicollinear feature: Household Composition_NOT APPLICABLE

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Applying SMOTE to the training data to balance the classes...


TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.