In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Define the columns to use
columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness', 'Intellectual Disability',
    'Autism Spectrum', 'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder', 'Mobility Impairment Disorder',
    'Hearing Impairment', 'Visual Impairment', 'Speech Impairment',
    'Hyperlipidemia', 'High Blood Pressure', 'Diabetes', 'Obesity',
    'Heart Attack', 'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition', 'Traumatic Brain Injury',
    'Joint Disease', 'Cancer', 'Other Chronic Med Condition',
    'No Chronic Med Condition', 'Unknown Chronic Med Condition',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Medication', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Principal Diagnosis Class',
    'Criminal Justice Status'
]

# Filter and clean the dataset
df_filtered = df[columns].dropna()

# Encode categorical variables
label_encoders = {}
for col in df_filtered.columns:
    if df_filtered[col].dtype == 'object':
        le = LabelEncoder()
        df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
        label_encoders[col] = le

# Separate features and target
X = df_filtered.drop('Mental Illness', axis=1)
y = df_filtered['Mental Illness']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Feature importances
importances = rf.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Plot top 20 feature importances
plt.figure(figsize=(12, 10))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='viridis')
plt.title('Top 20 Feature Importances for Predicting Mental Illness')
plt.tight_layout()
plt.show()


In [None]:
plt.savefig("C:/Users/arunc/OneDrive/Desktop/Python Project/RandomForest_Importance.png")
# Plot top 20 feature importances and save the chart
plt.figure(figsize=(12, 10))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='viridis')
plt.title('Top 20 Feature Importances for Predicting Mental Illness')
plt.tight_layout()
plt.savefig("C:/Users/arunc/OneDrive/Desktop/Python Project/RandomForest_Importance.png")
plt.show()


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV 1.csv"
df = pd.read_csv(file_path)

# Define the columns to use
columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness', 'Intellectual Disability',
    'Autism Spectrum', 'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder', 'Mobility Impairment Disorder',
    'Hearing Impairment', 'Visual Impairment', 'Speech Impairment',
    'Hyperlipidemia', 'High Blood Pressure', 'Diabetes', 'Obesity',
    'Heart Attack', 'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition', 'Traumatic Brain Injury',
    'Joint Disease', 'Cancer', 'Other Chronic Med Condition',
    'No Chronic Med Condition', 'Unknown Chronic Med Condition',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Medication', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Principal Diagnosis Class',
    'Criminal Justice Status'
]

# Filter and clean the dataset
df_filtered = df[columns].dropna()

# Encode categorical variables
label_encoders = {}
for col in df_filtered.columns:
    if df_filtered[col].dtype == 'object':
        le = LabelEncoder()
        df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
        label_encoders[col] = le

# Separate features and target
X = df_filtered.drop('Mental Illness', axis=1)
y = df_filtered['Mental Illness']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Feature importances
importances = rf.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Plot and save top 20 feature importances
plt.figure(figsize=(12, 10))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='viridis')
plt.title('Top 20 Feature Importances for Predicting Mental Illness')
plt.tight_layout()
plt.savefig(r"C:\Users\arunc\OneDrive\Desktop\Python Project\RandomForest_Importance.png")
plt.show()


In [None]:
import pandas as pd

# Correct and verified file path
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV 1.csv"

# Try loading the CSV file
try:
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
    display(df.head())  # Shows first few rows
except FileNotFoundError:
    print("Error: File not found. Please check the file path and name.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset. The 'r' before the string is a raw string literal,
# which is good practice for Windows paths to handle backslashes correctly.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model, based on the image you provided.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into a training set (70%) and a testing set (30%).
# random_state ensures that the split is the same every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print("\nData has been prepared and split into training and testing sets.")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# --- 3. Train the Random Forest Model ---

# Initialize the RandomForestClassifier.
# n_estimators: The number of decision trees in the forest. More trees can lead to better accuracy but take longer to run.
# random_state: Ensures the results are reproducible.
# n_jobs: Uses all available CPU cores for faster training (-1 means use all cores).
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Fit the model to the training data.
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 4. Evaluate the Model ---

# Use the trained model to make predictions on the test set.
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report, which includes precision, recall, and F1-score.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the importance of each feature in the model.
# This helps in understanding which variables contribute most to the prediction.
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Print the top 10 most important features.
print("\nTop 10 most important features:")
print(feature_importances.head(10))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset. The 'r' before the string is a raw string literal,
# which is good practice for Windows paths to handle backslashes correctly.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model, based on the image you provided.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into a training set (70%) and a testing set (30%).
# random_state ensures that the split is the same every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print("\nData has been prepared and split into training and testing sets.")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# --- 3. Train the Random Forest Model ---

# Initialize the RandomForestClassifier.
# n_estimators: The number of decision trees in the forest. More trees can lead to better accuracy but take longer to run.
# random_state: Ensures the results are reproducible.
# n_jobs: Uses all available CPU cores for faster training (-1 means use all cores).
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Fit the model to the training data.
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 4. Evaluate the Model ---

# Use the trained model to make predictions on the test set.
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report, which includes precision, recall, and F1-score.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the importance of each feature in the model.
# This helps in understanding which variables contribute most to the prediction.
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Print the top 10 most important features.
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 5. Plot and Save Feature Importances ---

# Get the top 10 features for plotting
top_10_features = feature_importances.head(10)

# Create a horizontal bar chart to visualize feature importance
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout() # Ensures all labels are visible

# Save the plot as a PNG file
output_file = "feature_importance_plot.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")

# Close the plot to free up memory
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset. The 'r' before the string is a raw string literal,
# which is good practice for Windows paths to handle backslashes correctly.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model, based on the image you provided.
# NOTE: The 'Principal Diagnosis Class' column has been removed to find new insights.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status' # This is the original list, but without 'Principal Diagnosis Class'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into a training set (70%) and a testing set (30%).
# random_state ensures that the split is the same every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print("\nData has been prepared and split into training and testing sets.")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# --- 3. Train the Random Forest Model ---

# Initialize the RandomForestClassifier.
# n_estimators: The number of decision trees in the forest. More trees can lead to better accuracy but take longer to run.
# random_state: Ensures the results are reproducible.
# n_jobs: Uses all available CPU cores for faster training (-1 means use all cores).
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Fit the model to the training data.
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 4. Evaluate the Model ---

# Use the trained model to make predictions on the test set.
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report, which includes precision, recall, and F1-score.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the importance of each feature in the model.
# This helps in understanding which variables contribute most to the prediction.
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Print the top 10 most important features.
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 5. Plot and Save Feature Importances ---

# Get the top 10 features for plotting
top_10_features = feature_importances.head(10)

# Create a horizontal bar chart to visualize feature importance
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Excluding Principal Diagnosis)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout() # Ensures all labels are visible

# Save the plot as a PNG file
output_file = "feature_importance_plot_no_principal_diagnosis.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")

# Close the plot to free up memory
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset. The 'r' before the string is a raw string literal,
# which is good practice for Windows paths to handle backslashes correctly.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model, based on the image you provided.
# NOTE: The 'Principal Diagnosis Class' column has been removed to find new insights.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status' # This is the original list, but without 'Principal Diagnosis Class'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been prepared and split into training and testing sets.")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Features')
plt.show()
plt.savefig("correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'correlation_matrix.png'")

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- 4. Split and Train the Random Forest Model ---

# Split the data into a training set (70%) and a testing set (30%).
# random_state ensures that the split is the same every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Initialize the RandomForestClassifier.
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Fit the model to the training data.
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---

# Use the trained model to make predictions on the test set.
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report, which includes precision, recall, and F1-score.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the importance of each feature in the model.
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Print the top 10 most important features.
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

# Get the top 10 features for plotting
top_10_features = feature_importances.head(10)

# Create a horizontal bar chart to visualize feature importance
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Excluding Principal Diagnosis)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout() # Ensures all labels are visible

# Save the plot as a PNG file
output_file = "feature_importance_plot_no_principal_diagnosis.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")

# Close the plot to free up memory
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset. The 'r' before the string is a raw string literal,
# which is good practice for Windows paths to handle backslashes correctly.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model, based on the image you provided.
# NOTE: The 'Principal Diagnosis Class' column has been removed to find new insights.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status' # This is the original list, but without 'Principal Diagnosis Class'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been prepared and split into training and testing sets.")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', mask=mask)
plt.title('Correlation Matrix of Features')
plt.savefig("correlation_matrix.png")
print("\nCorrelation matrix heatmap plot saved as 'correlation_matrix.png'")
# The plt.show() has been removed here to prevent interference with saving the PNG file.
plt.close()

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- 4. Split and Train the Random Forest Model ---

# Split the data into a training set (70%) and a testing set (30%).
# random_state ensures that the split is the same every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Initialize the RandomForestClassifier.
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Fit the model to the training data.
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---

# Use the trained model to make predictions on the test set.
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report, which includes precision, recall, and F1-score.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the importance of each feature in the model.
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Print the top 10 most important features.
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

# Get the top 10 features for plotting
top_10_features = feature_importances.head(10)

# Create a horizontal bar chart to visualize feature importance
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Excluding Principal Diagnosis)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout() # Ensures all labels are visible

# Save the plot as a PNG file
output_file = "feature_importance_plot_no_principal_diagnosis.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")

# Close the plot to free up memory
plt.close()


In [None]:
corr_matrix.to_csv('correlation_matrix_data.csv')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset. The 'r' before the string is a raw string literal,
# which is good practice for Windows paths to handle backslashes correctly.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model. Highly correlated 'UNKNOWN' columns have been removed.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

# Identify the columns to drop based on the multicollinearity check
# We will drop all '_UNKNOWN' columns except for 'Serious Mental Illness_UNKNOWN'
columns_to_drop = [
    'Household Composition_NOT APPLICABLE',
    'Autism Spectrum_UNKNOWN',
    'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN',
    'Opioid Related Disorder_UNKNOWN',
    'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN',
    'Visual Impairment_UNKNOWN',
    'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN',
    'High Blood Pressure_UNKNOWN',
    'Diabetes_UNKNOWN',
    'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN',
    'Stroke_UNKNOWN',
    'Other Cardiac_UNKNOWN',
    'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN',
    'Kidney Disease_UNKNOWN',
    'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN',
    'Neurological Condition_UNKNOWN',
    'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN',
    'Cancer_UNKNOWN',
    'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN',
    'Alcohol 12m Service_UNKNOWN',
    'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]

# Remove the identified columns from the features
# We must first check if the column exists in the dataframe before dropping it.
for col in columns_to_drop:
    if col in X_encoded.columns:
        X_encoded = X_encoded.drop(columns=[col])
        print(f"Removed multicollinear feature: {col}")

print("\nData has been prepared and split into training and testing sets.")

# --- 3. Split and Train the Random Forest Model ---

# Split the data into a training set (70%) and a testing set (30%).
# random_state ensures that the split is the same every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Initialize the RandomForestClassifier.
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Fit the model to the training data.
print("\nTraining the Random Forest model...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 4. Evaluate the Model ---

# Use the trained model to make predictions on the test set.
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report, which includes precision, recall, and F1-score.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the importance of each feature in the model.
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

# Print the top 10 most important features.
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 5. Plot and Save Feature Importances ---

# Get the top 10 features for plotting
top_10_features = feature_importances.head(10)

# Create a horizontal bar chart to visualize feature importance
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Cleaned Model)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout() # Ensures all labels are visible

# Save the plot as a PNG file
output_file = "feature_importance_plot_cleaned_model.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")

# Close the plot to free up memory
plt.close()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model. Highly correlated 'UNKNOWN' columns have been removed.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

# Identify and remove multicollinear columns (as per our previous analysis)
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]

for col in columns_to_drop:
    if col in X_encoded.columns:
        X_encoded = X_encoded.drop(columns=[col])

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
# We use stratify=y to ensure the class distribution is the same in both sets.

print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 3. Address Class Imbalance with SMOTE ---

print("Applying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("New class distribution in training data:", Counter(y_train_res))

# --- 4. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")

print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

print("\nTop 10 most important features (Cleaned Model with SMOTE):")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

top_10_features = feature_importances.head(10)
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Cleaned Model with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "feature_importance_plot_cleaned_model_smote.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 7. Visualize New Key Features ---

print("\nGenerating visualizations for new key features...")

# Create a dataframe for visualization of new key features
visual_df = data.copy()
# We need to one-hot encode these columns to visualize them
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

# Create a grouped bar chart to visualize the relationship between new features and the target
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')
axes[0].set_ylabel('Count')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')
axes[1].set_ylabel('Count')

plt.tight_layout()
output_file_visuals = "new_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"Visualizations for new key features saved as '{output_file_visuals}'")
plt.close()



In [None]:
pip install --upgrade imbalanced-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model. Highly correlated 'UNKNOWN' columns have been removed.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

# Load the dataset from the specified file path.
try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # You can add code here to handle the error, such as exiting the program.

# Assuming 'Mental Illness' is the target variable you want to predict.
target_column = 'Mental Illness'

# Drop any rows where the target variable is missing, as these cannot be used for training.
data.dropna(subset=[target_column], inplace=True)

# Separate the features (X) and the target variable (y).
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]

# Convert categorical features into a numerical format using one-hot encoding.
# 'drop_first=True' is used to avoid multicollinearity.
X_encoded = pd.get_dummies(X, drop_first=True)

# Identify and remove multicollinear columns (as per our previous analysis)
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]

for col in columns_to_drop:
    if col in X_encoded.columns:
        X_encoded = X_encoded.drop(columns=[col])

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
# We use stratify=y to ensure the class distribution is the same in both sets.

print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 3. Address Class Imbalance with SMOTE ---

print("Applying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("New class distribution in training data:", Counter(y_train_res))

# --- 4. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")

print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)

print("\nTop 10 most important features (Cleaned Model with SMOTE):")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

top_10_features = feature_importances.head(10)
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Cleaned Model with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "feature_importance_plot_cleaned_model_smote.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 7. Visualize New Key Features ---

print("\nGenerating visualizations for new key features...")

# Create a dataframe for visualization of new key features
visual_df = data.copy()
# We need to one-hot encode these columns to visualize them
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

# Create a grouped bar chart to visualize the relationship between new features and the target
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')
axes[0].set_ylabel('Count')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')
axes[1].set_ylabel('Count')

plt.tight_layout()
output_file_visuals = "new_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"Visualizations for new key features saved as '{output_file_visuals}'")
plt.close()



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from imblearn.over_sampling import SMOTE
from collections import Counter

# Suppress potential warnings from scikit-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The file path to your dataset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

# The columns to be used in the model. This list has been refined based on our analysis.
# We've removed 'Principal Diagnosis Class' and other highly correlated '_UNKNOWN' columns.
columns_to_use = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services', 'Mental Illness',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Criminal Justice Status'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    # Exit if the file is not found
    exit()

target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)
X = data[columns_to_use].drop(columns=[target_column])
y = data[target_column]
X_encoded = pd.get_dummies(X, drop_first=True)

# Remove multicollinear columns identified from our analysis
columns_to_drop = [
    'Household Composition_NOT APPLICABLE', 'Autism Spectrum_UNKNOWN', 'Other Developmental Disability_UNKNOWN',
    'Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Mobility Impairment Disorder_UNKNOWN',
    'Hearing Impairment_UNKNOWN', 'Visual Impairment_UNKNOWN', 'Speech Impairment_UNKNOWN',
    'Hyperlipidemia_UNKNOWN', 'High Blood Pressure_UNKNOWN', 'Diabetes_UNKNOWN', 'Obesity_UNKNOWN',
    'Heart Attack_UNKNOWN', 'Stroke_UNKNOWN', 'Other Cardiac_UNKNOWN', 'Pulmonary Asthma_UNKNOWN',
    'Alzheimer or Dementia_UNKNOWN', 'Kidney Disease_UNKNOWN', 'Liver Disease_UNKNOWN',
    'Endocrine Condition_UNKNOWN', 'Neurological Condition_UNKNOWN', 'Traumatic Brain Injury_UNKNOWN',
    'Joint Disease_UNKNOWN', 'Cancer_UNKNOWN', 'Other Chronic Med Condition_UNKNOWN',
    'No Chronic Med Condition_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN',
    'Drug/Substance 12m Service_UNKNOWN'
]

for col in columns_to_drop:
    if col in X_encoded.columns:
        X_encoded = X_encoded.drop(columns=[col])

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 3. Address Class Imbalance with SMOTE ---

print("Applying SMOTE to the training data to balance the classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("New class distribution in training data:", Counter(y_train_res))

# --- 4. Train the Random Forest Model on Balanced Data ---

model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the Random Forest model on the new, balanced data...")
model.fit(X_train_res, y_train_res)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy after SMOTE: {accuracy:.4f}")
print("\nClassification Report after SMOTE:")
print(classification_report(y_test, y_pred))

feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features (Cleaned Model with SMOTE):")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

top_10_features = feature_importances.head(10)
plt.figure(figsize=(12, 8))
top_10_features.sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Features for Predicting Mental Illness (Cleaned Model with SMOTE)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = "feature_importance_plot_cleaned_model_smote.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()

# --- 7. Visualize New Key Features ---

print("\nGenerating visualizations for new key features...")
visual_df = data.copy()
visual_df['Alcohol Related Disorder_UNKNOWN'] = visual_df['Alcohol Related Disorder'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)
visual_df['Intellectual Disability_UNKNOWN'] = visual_df['Intellectual Disability'].apply(lambda x: 1 if x == 'UNKNOWN' else 0)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(data=visual_df, x='Alcohol Related Disorder_UNKNOWN', hue=target_column, ax=axes[0])
axes[0].set_title('Mental Illness by Alcohol Related Disorder Status')
axes[0].set_xlabel('Alcohol Related Disorder Status (0=Known, 1=Unknown)')
axes[0].set_ylabel('Count')

sns.countplot(data=visual_df, x='Intellectual Disability_UNKNOWN', hue=target_column, ax=axes[1])
axes[1].set_title('Mental Illness by Intellectual Disability Status')
axes[1].set_xlabel('Intellectual Disability Status (0=Known, 1=Unknown)')
axes[1].set_ylabel('Count')

plt.tight_layout()
output_file_visuals = "new_feature_visualizations.png"
plt.savefig(output_file_visuals)
print(f"Visualizations for new key features saved as '{output_file_visuals}'")
plt.close()


Data loaded successfully!

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 93)

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})
Applying SMOTE to the training data to balance the classes...


TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.