In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your substance history data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns relevant to this dataset.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the specified columns
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 3. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if to_drop:
    X_encoded = X_encoded.drop(columns=to_drop)
    print(f"\nRemoved multicollinear features: {to_drop}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 4. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 6. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Substance History Features')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
# The plot will be saved in your project's main folder
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\substance_history_feature_importance_plot.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 20)

Checking for multicollinearity...

Features with multicollinearity (correlation > 0.8):
['Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Removed multicollinear features: ['Drug Substance Disorder_UNKNOWN', 'Opioid Related Disorder_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Training the BalancedRandomForestClassifier on the data...
Model training complete!

Model Accuracy: 0.9642

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      0.00      0.00       769
     UNKNOWN       0.00      0.00      0.00       296
         YES       0.96      

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your substance history data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns relevant to this dataset.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the specified columns
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Perform Sensitivity Analysis by removing UNKNOWN from key feature ---
# We will drop all rows where 'Alcohol Related Disorder' is 'UNKNOWN'
# This is our sensitivity analysis to see how the model performs without this feature.
data = data[data['Alcohol Related Disorder'] != 'UNKNOWN']
print("\nRemoved all rows where 'Alcohol Related Disorder' was 'UNKNOWN'.")
print(f"New dataset shape: {data.shape}")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if to_drop:
    X_encoded = X_encoded.drop(columns=to_drop)
    print(f"\nRemoved multicollinear features: {to_drop}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Substance History Features (Sensitivity Analysis)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
# The plot will be saved in your project's main folder
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\substance_history_feature_importance_plot_sensitivity.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Removed all rows where 'Alcohol Related Disorder' was 'UNKNOWN'.
New dataset shape: (93211, 11)

Data has been cleaned and prepared for modeling.
Final feature set shape: (93211, 19)

Checking for multicollinearity...

Features with multicollinearity (correlation > 0.8):
['Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Removed multicollinear features: ['Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Original class distribution in training data: Counter({'YES': 63316, 'NO': 1774, 'UNKNOWN': 157})

Training the BalancedRandomForestClassifier on the data...
Model training complete!

Model Accuracy: 0.9703

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      0.00      0.00       760
     UNKNOWN       0.00      0.00      0.00        68
         YES       0.97      1.00      0.98     27136

    accuracy     

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your substance history data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns relevant to this dataset.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the specified columns
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Perform Sensitivity Analysis by removing UNKNOWN from key feature ---
# We will drop all rows where 'Alcohol Related Disorder' is 'UNKNOWN'
# This is our sensitivity analysis to see how the model performs without this feature.
data = data[data['Alcohol Related Disorder'] != 'UNKNOWN']
print("\nRemoved all rows where 'Alcohol Related Disorder' was 'UNKNOWN'.")
print(f"New dataset shape: {data.shape}")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if to_drop:
    X_encoded = X_encoded.drop(columns=to_drop)
    print(f"\nRemoved multicollinear features: {to_drop}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Substance History Features (Sensitivity Analysis)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
# The plot will be saved in your project's main folder
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\substance_history_feature_importance_plot_sensitivity.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Removed all rows where 'Alcohol Related Disorder' was 'UNKNOWN'.
New dataset shape: (93211, 11)

Data has been cleaned and prepared for modeling.
Final feature set shape: (93211, 19)

Checking for multicollinearity...

Features with multicollinearity (correlation > 0.8):
['Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Removed multicollinear features: ['Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Original class distribution in training data: Counter({'YES': 63316, 'NO': 1774, 'UNKNOWN': 157})

Training the BalancedRandomForestClassifier on the data...
Model training complete!

Model Accuracy: 0.9703

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      0.00      0.00       760
     UNKNOWN       0.00      0.00      0.00        68
         YES       0.97      1.00      0.98     27136

    accuracy     

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your substance history data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns relevant to this dataset.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the specified columns
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Perform Mode Imputation on a key feature ---
# We will impute the 'UNKNOWN' values in 'Alcohol Related Disorder' with the mode.
# This will force the model to learn from the actual values rather than the missingness.
alcohol_mode = data['Alcohol Related Disorder'].mode()[0]
data['Alcohol Related Disorder'] = data['Alcohol Related Disorder'].replace('UNKNOWN', alcohol_mode)
print(f"\nReplaced 'UNKNOWN' in 'Alcohol Related Disorder' with the mode: {alcohol_mode}")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Check for Multicollinearity ---

print("\nChecking for multicollinearity...")
# Calculate the correlation matrix
corr_matrix = X_encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Identify highly correlated features (e.g., correlation > 0.8)
threshold = 0.8
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
if to_drop:
    print(f"\nFeatures with multicollinearity (correlation > {threshold}):")
    print(to_drop)
else:
    print(f"\nNo features with multicollinearity found above the {threshold} threshold.")

# --- Drop Multicollinear Features ---
if to_drop:
    X_encoded = X_encoded.drop(columns=to_drop)
    print(f"\nRemoved multicollinear features: {to_drop}")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Substance History Features (Mode Imputation)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
# The plot will be saved in your project's main folder
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\substance_history_feature_importance_plot_imputation.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Replaced 'UNKNOWN' in 'Alcohol Related Disorder' with the mode: NO

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 19)

Checking for multicollinearity...

Features with multicollinearity (correlation > 0.8):
['Opioid Related Disorder_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Removed multicollinear features: ['Opioid Related Disorder_UNKNOWN', 'Alcohol 12m Service_UNKNOWN', 'Opioid 12m Service_UNKNOWN', 'Drug/Substance 12m Service_UNKNOWN']

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Training the BalancedRandomForestClassifier on the data...
Model training complete!

Model Accuracy: 0.9641

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      0.00      0.00       769
     UNKNOWN       0.00      0.00      0.00       296
         YES       0.96      1.

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your substance history data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns relevant to this dataset.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the specified columns
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Implement Missingness Indicators and Mode Imputation ---
# This approach creates a new binary column for each 'UNKNOWN' feature
# and then uses mode imputation to fill the original column.

# Identify columns with 'UNKNOWN' values
cols_with_unknown = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service'
]

for col in cols_with_unknown:
    # Create a new binary column to indicate if the value was originally 'UNKNOWN'
    data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
    
    # Perform mode imputation on the original column
    mode_val = data[col].mode()[0]
    data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed with the mode.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Check for Multicollinearity ---
# We will skip the full multicollinearity check to prevent the code from becoming too long.
# The `_Missing` columns are a form of multicollinearity, but we want to test their impact.
print("\nSkipping a full multicollinearity check as we are testing missingness indicators.")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Substance History Features (Missingness Indicators)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\substance_history_feature_importance_plot_missingness.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Missingness indicators created and 'UNKNOWN' values have been imputed with the mode.

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 20)

Skipping a full multicollinearity check as we are testing missingness indicators.

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Training the BalancedRandomForestClassifier on the data...
Model training complete!

Model Accuracy: 0.9649

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      0.00      0.00       769
     UNKNOWN       0.79      0.09      0.16       296
         YES       0.97      1.00      0.98     28709

    accuracy                           0.96     29774
   macro avg       0.59      0.36      0.38     29774
weighted avg       0.94      0.96      0.95     29774


Top 10 most important features:
Alcohol Related Disorder_Missing       0.186163
Opioid 12m Service_Missi

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import seaborn as sns
import numpy as np

# Suppress potential warnings from scikit-learn and imbalanced-learn
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Columns ---

# The new file path to your substance history data subset.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns relevant to this dataset.
# The `Mental Illness` column is our target.
columns_to_use = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service', 'Mental Illness'
]

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the specified columns
data = data[columns_to_use]
target_column = 'Mental Illness'
data.dropna(subset=[target_column], inplace=True)

# --- 3. Implement Missingness Indicators and Mode Imputation ---
# This approach creates a new binary column for each 'UNKNOWN' feature
# and then uses mode imputation to fill the original column.

# Identify columns with 'UNKNOWN' values
cols_with_unknown = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service'
]

for col in cols_with_unknown:
    # Create a new binary column to indicate if the value was originally 'UNKNOWN'
    data[f'{col}_Missing'] = (data[col] == 'UNKNOWN').astype(int)
    
    # Perform mode imputation on the original column
    mode_val = data[col].mode()[0]
    data[col] = data[col].replace('UNKNOWN', mode_val)

print("\nMissingness indicators created and 'UNKNOWN' values have been imputed with the mode.")

# Separate features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Convert categorical features to dummy/indicator variables
X_encoded = pd.get_dummies(X, drop_first=True)

print("\nData has been cleaned and prepared for modeling.")
print(f"Final feature set shape: {X_encoded.shape}")

# --- 4. Check for Multicollinearity ---
# We will skip the full multicollinearity check to prevent the code from becoming too long.
# The `_Missing` columns are a form of multicollinearity, but we want to test their impact.
print("\nSkipping a full multicollinearity check as we are testing missingness indicators.")

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42, stratify=y)
print("\nOriginal class distribution in training data:", Counter(y_train))

# --- 5. Train the BalancedRandomForestClassifier ---

# Using BalancedRandomForestClassifier for handling class imbalance
model = BalancedRandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, sampling_strategy='not majority')
print("\nTraining the BalancedRandomForestClassifier on the data...")
model.fit(X_train, y_train)
print("Model training complete!")

# --- 6. Evaluate the Model ---

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get the feature importances from the final model
feature_importances = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print("\nTop 10 most important features:")
print(feature_importances.head(10))

# --- 7. Plot and Save Feature Importances ---

plt.figure(figsize=(12, 8))
feature_importances.head(10).sort_values().plot(kind='barh')
plt.title('Top 10 Most Important Substance History Features (Missingness Indicators)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\substance_history_feature_importance_plot_missingness.png"
plt.savefig(output_file)
print(f"\nPlot saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Missingness indicators created and 'UNKNOWN' values have been imputed with the mode.

Data has been cleaned and prepared for modeling.
Final feature set shape: (99244, 20)

Skipping a full multicollinearity check as we are testing missingness indicators.

Original class distribution in training data: Counter({'YES': 66984, 'NO': 1794, 'UNKNOWN': 692})

Training the BalancedRandomForestClassifier on the data...
Model training complete!

Model Accuracy: 0.9649

Classification Report:
              precision    recall  f1-score   support

          NO       0.00      0.00      0.00       769
     UNKNOWN       0.79      0.09      0.16       296
         YES       0.97      1.00      0.98     28709

    accuracy                           0.96     29774
   macro avg       0.59      0.36      0.38     29774
weighted avg       0.94      0.96      0.95     29774


Top 10 most important features:
Alcohol Related Disorder_Missing       0.186163
Opioid 12m Service_Missi

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress potential warnings from pandas and other libraries
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Segmentation Columns ---

# The file path to your substance history data.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Substance History Female.csv"

# The columns to use for segmentation.
segment_cols = ['Region Served', 'Age Group']

# --- 2. Load and Prepare Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Identify all substance history columns that are categorical and have 'UNKNOWN' values.
substance_history_cols = [
    'Alcohol Related Disorder', 'Drug Substance Disorder', 'Opioid Related Disorder',
    'Cannabis Recreational Use', 'Cannabis Medicinal Use', 'Smokes',
    'Received Smoking Counseling', 'Alcohol 12m Service', 'Opioid 12m Service',
    'Drug/Substance 12m Service'
]

# --- 3. Segment Analysis of 'UNKNOWN' Values ---

print("\nPerforming segment analysis on 'UNKNOWN' values...")

# Create a new DataFrame to hold the counts of 'UNKNOWN' for each segment.
unknown_counts = {}
for col in substance_history_cols:
    unknown_counts[col] = data[data[col] == 'UNKNOWN'].groupby(segment_cols).size()

# Convert the dictionary to a DataFrame for easier visualization.
unknown_df = pd.DataFrame(unknown_counts).fillna(0).astype(int)

print("\n--- Distribution of 'UNKNOWN' Values by Region and Age Group ---")
print(unknown_df)

# --- 4. Generate Visualizations ---

print("\nGenerating visualizations...")
# Reset the index to make 'Region Served' and 'Age Group' columns
unknown_df_flat = unknown_df.reset_index()

# Set up the plot layout
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(len(substance_history_cols), 1, figsize=(15, 6 * len(substance_history_cols)))
fig.suptitle('Distribution of UNKNOWN Substance History by Region and Age', fontsize=18, y=0.99)

# Create a bar plot for each substance history column
for i, col in enumerate(substance_history_cols):
    sns.barplot(data=unknown_df_flat, x='Region Served', y=col, hue='Age Group', ax=axes[i], palette='viridis')
    axes[i].set_title(f"UNKNOWN in '{col}'")
    axes[i].set_xlabel("Region Served")
    axes[i].set_ylabel("Count of UNKNOWN")
    axes[i].tick_params(axis='x', rotation=45)
    axes[i].legend(title='Age Group')

plt.tight_layout()
output_file = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\unknown_segment_analysis.png"
plt.savefig(output_file)
print(f"\nVisualizations saved successfully as '{output_file}'")
plt.close()


Data loaded successfully!

Performing segment analysis on 'UNKNOWN' values...

--- Distribution of 'UNKNOWN' Values by Region and Age Group ---
                                Alcohol Related Disorder  \
Region Served        Age Group                             
CENTRAL NY REGION    ADULT                           474   
                     CHILD                           122   
HUDSON RIVER REGION  ADULT                           723   
                     CHILD                           251   
                     UNKNOWN                           2   
LONG ISLAND REGION   ADULT                           176   
                     CHILD                            29   
                     UNKNOWN                           1   
NEW YORK CITY REGION ADULT                          2975   
                     CHILD                           550   
                     UNKNOWN                           2   
WESTERN REGION       ADULT                           639   
                