In [None]:
!pip install pandas scikit-learn imbalanced-learn matplotlib seaborn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Address Class Imbalance with SMOTE ---

print("\nApplying SMOTE to the training data to balance the classes...")
# We will oversample the minority classes to a smaller, more manageable size
target_counts = Counter(y_train)
# Set the sampling strategy to oversample minority classes to a more balanced number
sampling_strategy = {
    'NO': int(target_counts['NO'] * 1.5),
    'UNKNOWN': int(target_counts['UNKNOWN'] * 1.5),
    'YES': target_counts['YES']
}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 5. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features for Predicting Mental Illness (with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "social_feature_importance_plot.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 8. Generate new visualizations for key features ---
print("\nGenerating new visualizations for key features...")

# Create a figure with a subplot for each plot
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Relationship Between Top Social Features and Mental Illness', fontsize=18)
axes = axes.flatten()

# Plot 1: Special Education Services vs. Mental Illness
sns.countplot(data=data, x='Special Education Services', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Special Education Services')
axes[0].set_xlabel('Special Education Services')
axes[0].set_ylabel('Count')

# Plot 2: Household Composition vs. Mental Illness
sns.countplot(data=data, x='Household Composition', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Household Composition')
axes[1].set_xlabel('Household Composition')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

# Plot 3: Race vs. Mental Illness
sns.countplot(data=data, x='Race', hue=target_column, ax=axes[2])
axes[2].set_title('Mental Illness by Race')
axes[2].set_xlabel('Race')
axes[2].set_ylabel('Count')
axes[2].tick_params(axis='x', rotation=45)

# Plot 4: Religious Preference vs. Mental Illness
sns.countplot(data=data, x='Religious Preference', hue=target_column, ax=axes[3])
axes[3].set_title('Mental Illness by Religious Preference')
axes[3].set_xlabel('Religious Preference')
axes[3].set_ylabel('Count')
axes[3].tick_params(axis='x', rotation=45)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
output_file_visuals = "social_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"New visualizations for key social features saved as '{output_file_visuals}'")
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
import numpy as np

warnings.filterwarnings('ignore')

# 1. Load Data
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

data = pd.read_csv(file_path)
print("Data loaded successfully!")

# 2. Clean and Prepare
data = data[columns_to_use]
data.dropna(subset=['Mental Illness'], inplace=True)
data = data[data['Mental Illness'].isin(['YES', 'NO'])]  # Remove 'UNKNOWN'

X = data.drop(columns=['Mental Illness'])
y = data['Mental Illness']

X_encoded = pd.get_dummies(X, drop_first=True)
X_encoded = X_encoded.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# 3. Check for Multicollinearity
corr_matrix = X_encoded.corr()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]

if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > 0.8):")
    print(to_drop)
    X_encoded.drop(columns=to_drop, inplace=True)
    print(f"\nRemoved multicollinear features: {to_drop}")
else:
    print("\nNo features with multicollinearity found.")

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=42, stratify=y
)
print("\nOriginal class distribution in training data:", Counter(y_train))

# 5. Apply SMOTE
print("\nApplying SMOTE to the training data to balance the classes...")
X_train = X_train.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# 6. Train Model
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_res, y_train_res)
print("\nModel training complete!")

# 7. Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# 8. Feature Importances
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier instead of SMOTE
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features (BalancedRandomForestClassifier)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "social_feature_importance_plot_brf.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social data Part 2 new .csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier instead of SMOTE
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features (BalancedRandomForestClassifier)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "social_feature_importance_plot_brf.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


In [None]:
import os; os.getcwd()


In [None]:
import os

file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
print("Exists:", os.path.exists(file_path))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import matplotlib.pyplot as plt

# 1. Load Data
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

data = pd.read_csv(file_path)
print("Data loaded successfully!")

# 2. Clean and Prepare
data = data[columns_to_use]
data.dropna(subset=['Mental Illness'], inplace=True)
data = data[data['Mental Illness'].isin(['YES', 'NO'])]

X = data.drop(columns=['Mental Illness'])
y = data['Mental Illness']

X_encoded = pd.get_dummies(X, drop_first=True)
X_encoded = X_encoded.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# 3. Check for Multicollinearity
corr_matrix = X_encoded.corr()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > 0.8):")
    print(to_drop)
    X_encoded.drop(columns=to_drop, inplace=True)

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=42, stratify=y
)
print("\nOriginal class distribution in training data:", Counter(y_train))

# 5. Train BalancedRandomForestClassifier
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("\nModel training complete!")

# 6. Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 7. Feature Importances
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
top_features = feature_importances.head(10)
print("\nTop 10 most important features:")
print(top_features)

# 8. Export Feature Importance Plot
plt.figure(figsize=(10, 6))
top_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features (BalancedRandomForestClassifier)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig("top_10_feature_importance.png")
plt.close()
print("\nFeature importance plot saved as 'top_10_feature_importance.png'")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
df = pd.read_csv(file_path)

# Drop the multicollinear feature
df = df.drop(columns=['Household Composition'])

# Define target and features
target = 'Mental Illness'
X = df.drop(columns=[target])
y = df[target]

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Preprocess the data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Apply SMOTE + Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_resampled_tomek, y_resampled_tomek = smote_tomek.fit_resample(X_train_encoded, y_train)

# Apply SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled_enn, y_resampled_enn = smote_enn.fit_resample(X_train_encoded, y_train)

# Train and evaluate BalancedRandomForestClassifier on SMOTE + Tomek Links
model_tomek = BalancedRandomForestClassifier(random_state=42)
model_tomek.fit(X_resampled_tomek, y_resampled_tomek)
y_pred_tomek = model_tomek.predict(X_test_encoded)

print("=== SMOTE + Tomek Links ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tomek))
print(classification_report(y_test, y_pred_tomek))
print(confusion_matrix(y_test, y_pred_tomek))

# Train and evaluate BalancedRandomForestClassifier on SMOTEENN
model_enn = BalancedRandomForestClassifier(random_state=42)
model_enn.fit(X_resampled_enn, y_resampled_enn)
y_pred_enn = model_enn.predict(X_test_encoded)

print("\n=== SMOTEENN ===")
print("Accuracy:", accuracy_score(y_test, y_pred_enn))
print(classification_report(y_test, y_pred_enn))
print(confusion_matrix(y_test, y_pred_enn))


In [None]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
df = pd.read_csv(file_path)

# Show basic info
print(df.shape)
print(df.columns)
print(df.head())


In [None]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
df = pd.read_csv(file_path)

# Show basic info
print(df.shape)
print(df.columns)
print(df.head())


In [None]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier

# Step 2: Load dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
df = pd.read_csv(file_path)

# Step 3: Drop irrelevant and multicollinear columns
columns_to_drop = [
    'Household Composition', 'Survey Year', 'Program Category',
    'Region Served', 'Patient ID', 'Age Group', 'Sex'
]
df = df.drop(columns=columns_to_drop)

# Step 4: Define features and target
target = 'Mental Illness'
X = df.drop(columns=[target])
y = df[target]

# Step 5: Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Step 6: One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Step 7: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Step 8: Preprocess data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Step 9: Apply SMOTE + Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_resampled_tomek, y_resampled_tomek = smote_tomek.fit_resample(X_train_encoded, y_train)

# Step 10: Apply SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled_enn, y_resampled_enn = smote_enn.fit_resample(X_train_encoded, y_train)

# Step 11: Train and evaluate BalancedRandomForestClassifier on SMOTE + Tomek Links
model_tomek = BalancedRandomForestClassifier(random_state=42)
model_tomek.fit(X_resampled_tomek, y_resampled_tomek)
y_pred_tomek = model_tomek.predict(X_test_encoded)

print("=== SMOTE + Tomek Links ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tomek))
print(classification_report(y_test, y_pred_tomek))
print(confusion_matrix(y_test, y_pred_tomek))

# Step 12: Train and evaluate BalancedRandomForestClassifier on SMOTEENN
model_enn = BalancedRandomForestClassifier(random_state=42)
model_enn.fit(X_resampled_enn, y_resampled_enn)
y_pred_enn = model_enn.predict(X_test_encoded)

print("\n=== SMOTEENN ===")
print("Accuracy:", accuracy_score(y_test, y_pred_enn))
print(classification_report(y_test, y_pred_enn))
print(confusion_matrix(y_test, y_pred_enn))


In [None]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
df = pd.read_csv(file_path)

# Inspect the data
print(df.shape)
print(df.columns)
print(df.head())
# Drop multicollinear and irrelevant columns
columns_to_drop = [
    'Household Composition', 'Survey Year', 'Program Category',
    'Region Served', 'Patient ID', 'Age Group', 'Sex'
]
df = df.drop(columns=columns_to_drop)
# Define target and features
target = 'Mental Illness'
X = df.drop(columns=[target])
y = df[target]
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Fit and transform
X_encoded = preprocessor.fit_transform(X)
from sklearn.model_selection import train_test_split

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, test_size=0.3, random_state=42)
from imblearn.combine import SMOTETomek, SMOTEENN

# SMOTE + Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_resampled_tomek, y_resampled_tomek = smote_tomek.fit_resample(X_train, y_train)

# SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled_enn, y_resampled_enn = smote_enn.fit_resample(X_train, y_train)


In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train on SMOTE + Tomek Links
model_tomek = BalancedRandomForestClassifier(random_state=42)
model_tomek.fit(X_resampled_tomek, y_resampled_tomek)
y_pred_tomek = model_tomek.predict(X_test)

print("=== SMOTE + Tomek Links ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tomek))
print(classification_report(y_test, y_pred_tomek))
print(confusion_matrix(y_test, y_pred_tomek))

# Train on SMOTEENN
model_enn = BalancedRandomForestClassifier(random_state=42)
model_enn.fit(X_resampled_enn, y_resampled_enn)
y_pred_enn = model_enn.predict(X_test)

print("\n=== SMOTEENN ===")
print("Accuracy:", accuracy_score(y_test, y_pred_enn))
print(classification_report(y_test, y_pred_enn))
print(confusion_matrix(y_test, y_pred_enn))


In [None]:
pip install imbalanced-learn scikit-learn pandas


In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train on SMOTE + Tomek Links
model_tomek = BalancedRandomForestClassifier(random_state=42)
model_tomek.fit(X_resampled_tomek, y_resampled_tomek)
y_pred_tomek = model_tomek.predict(X_test)

print("=== SMOTE + Tomek Links ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tomek))
print(classification_report(y_test, y_pred_tomek))
print(confusion_matrix(y_test, y_pred_tomek))


In [None]:
from imblearn.combine import SMOTETomek

# Try resampling only SMOTE + Tomek Links
try:
    smote_tomek = SMOTETomek(random_state=42)
    X_resampled_tomek, y_resampled_tomek = smote_tomek.fit_resample(X_train, y_train)
    print("SMOTE + Tomek Links resampling successful!")
    print("Resampled shape:", X_resampled_tomek.shape, y_resampled_tomek.shape)
except Exception as e:
    print("Error during SMOTE + Tomek Links:", e)


In [None]:
from sklearn.model_selection import train_test_split

# Split the encoded data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, test_size=0.3, random_state=42)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Fit and transform the data
X_encoded = preprocessor.fit_transform(X)


In [None]:
# Define target and features
target = 'Mental Illness'
X = df.drop(columns=[target])
y = df[target]


In [None]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
columns_to_drop = [
    'Household Composition', 'Survey Year', 'Program Category',
    'Region Served', 'Patient ID', 'Age Group', 'Sex'
]
df = df.drop(columns=columns_to_drop)


In [None]:
# Define target and features
target = 'Mental Illness'
X = df.drop(columns=[target])
y = df[target]


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Fit and transform the data
X_encoded = preprocessor.fit_transform(X)


In [None]:
from sklearn.model_selection import train_test_split

# Split the encoded data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, stratify=y, test_size=0.3, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


In [None]:
from imblearn.combine import SMOTETomek, SMOTEENN

# SMOTE + Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_resampled_tomek, y_resampled_tomek = smote_tomek.fit_resample(X_train, y_train)

# SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled_enn, y_resampled_enn = smote_enn.fit_resample(X_train, y_train)

print("SMOTE + Tomek Links shape:", X_resampled_tomek.shape)
print("SMOTEENN shape:", X_resampled_enn.shape)


In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train on SMOTE + Tomek Links
model_tomek = BalancedRandomForestClassifier(random_state=42)
model_tomek.fit(X_resampled_tomek, y_resampled_tomek)
y_pred_tomek = model_tomek.predict(X_test)

print("=== SMOTE + Tomek Links ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tomek))
print(classification_report(y_test, y_pred_tomek))
print(confusion_matrix(y_test, y_pred_tomek))

# Train on SMOTEENN
model_enn = BalancedRandomForestClassifier(random_state=42)
model_enn.fit(X_resampled_enn, y_resampled_enn)
y_pred_enn = model_enn.predict(X_test)

print("\n=== SMOTEENN ===")
print("Accuracy:", accuracy_score(y_test, y_pred_enn))
print(classification_report(y_test, y_pred_enn))
print(confusion_matrix(y_test, y_pred_enn))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Social Features')
plt.savefig("social_correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'social_correlation_matrix.png'")
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Social Features (BalancedRandomForestClassifier)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
# The plot will be saved in your project's main folder
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\social_feature_importance_plot_brf.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")
    print(f"New feature set shape after removing multicollinearity: {X_encoded.shape}")

# --- 3. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 10 features...")
# Use BalancedRandomForestClassifier as the estimator for RFE
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=10, step=1)
rfe.fit(X_encoded, y)

# Get the list of top 10 selected features
selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

# Filter the data to include only the selected features
X_rfe = X_encoded[selected_features]

# --- 4. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 7. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

# Using RFE-selected features for SHAP analysis
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Create and save a SHAP summary plot
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title('SHAP Feature Importance Plot')
output_file_shap = "social_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np
import shap

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your social data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Social_data_part2.csv"

# The columns relevant to this dataset, as requested.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# We need to make sure the data contains only the columns we need for this analysis
# and handle any missing values in the target variable.
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- Drop Multicollinear Features ---
if 'Household Composition_NOT APPLICABLE' in X_encoded.columns:
    X_encoded = X_encoded.drop(columns=['Household Composition_NOT APPLICABLE'])
    print("\nRemoved multicollinear feature: Household Composition_NOT APPLICABLE")
    print(f"New feature set shape after removing multicollinearity: {X_encoded.shape}")

# --- 3. Feature Selection with RFE ---

print("\nPerforming Recursive Feature Elimination (RFE) to select top 10 features...")
# Use BalancedRandomForestClassifier as the estimator for RFE
rfe_estimator = BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, sampling_strategy='not majority')
rfe = RFE(estimator=rfe_estimator, n_features_to_select=10, step=1)
rfe.fit(X_encoded, y)

# Get the list of top 10 selected features
selected_features = X_encoded.columns[rfe.support_].tolist()
print(f"RFE selected the following features: {selected_features}")

# Filter the data to include only the selected features
X_rfe = X_encoded[selected_features]

# --- 4. Train-Test Split on RFE-selected data ---
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier on RFE-selected data ---
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the final BalancedRandomForestClassifier on RFE-selected data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 7. Generate SHAP Summary Plot for Interpretability ---
print("\nGenerating SHAP summary plot for model interpretability...")

# Using RFE-selected features for SHAP analysis
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Create and save a SHAP summary plot
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title('SHAP Feature Importance Plot')
output_file_shap = "social_shap_summary_plot_brf.png"
plt.savefig(output_file_shap)
print(f"SHAP plot saved successfully as '{output_file_shap}'")
plt.close()


Data loaded successfully!

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 48)

Removed multicollinear feature: Household Composition_NOT APPLICABLE
New feature set shape after removing multicollinearity: (99244, 47)

Performing Recursive Feature Elimination (RFE) to select top 10 features...
RFE selected the following features: ['Transgender_UNKNOWN', 'Hispanic Ethnicity_UNKNOWN', 'Race_UNKNOWN RACE', 'Race_WHITE ONLY', 'Living Situation_PRIVATE RESIDENCE', 'Household Composition_UNKNOWN', 'Veteran Status_UNKNOWN', 'Employment Status_UNKNOWN EMPLOYMENT STATUS', 'Education Status_UNKNOWN', 'Special Education Services_UNKNOWN']

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Training the final BalancedRandomForestClassifier on RFE-selected data...
Model training complete!

Model Accuracy: 0.9646

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      