In [1]:
import pandas as pd
import warnings

# Suppress potential warnings from pandas
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The file path for your combined dataset.
file_path = r"Female complete Data CSV.csv" # The updated file path

# A master list of all relevant columns for the analysis.
master_columns = [
    'Gender', # We need this to filter for female data
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served',
    'Mental Illness'
]

# --- 2. Load Data and Filter for Female Records ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# Filter data to include only the female records
female_data = data[data['Gender'] == 'FEMALE'].copy()
female_data.drop(columns=['Gender'], inplace=True)
female_data = female_data[master_columns] # Filter to master list

print(f"Female data shape: {female_data.shape}")
print("\nFemale data filtered successfully!")

# --- 3. Exploratory Data Analysis (EDA) ---

print("\n--- EDA: Analysis of Missing Values (UNKNOWN) ---")
# Count the number of 'UNKNOWN' values for each column
for col in female_data.columns:
    unknown_count = (female_data[col] == 'UNKNOWN').sum()
    if unknown_count > 0:
        print(f"Column '{col}': {unknown_count} UNKNOWN values")

print("\n--- EDA: Geographic Disparities in UNKNOWN values ---")
# Analyze UNKNOWN values by region
regions = female_data['Region Served'].unique()
for col in female_data.columns:
    unknown_by_region = female_data[female_data[col] == 'UNKNOWN'].groupby('Region Served')[col].count()
    if not unknown_by_region.empty:
        print(f"\nUNKNOWN values in column '{col}' by Region:")
        print(unknown_by_region)

print("\n--- EDA: Class Distribution of the Target Variable ---")
# Check the distribution of the target variable 'Mental Illness'
target_distribution = female_data['Mental Illness'].value_counts()
print(target_distribution)

print("\n--- EDA Complete ---")
print("The EDA has been performed, and the results are ready for interpretation.")


Data loaded successfully!


KeyError: 'Gender'

In [2]:
import pandas as pd
import warnings

# Suppress potential warnings from pandas
warnings.filterwarnings('ignore')

# --- 1. Define File Path and Master Columns ---

# The file path for your female-only dataset.
file_path = r"Female complete Data CSV.csv"

# A master list of all relevant columns for the analysis.
# The 'Gender' column has been removed since the data is already filtered.
master_columns = [
    'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race',
    'Living Situation', 'Household Composition', 'Preferred Language',
    'Religious Preference', 'Veteran Status', 'Employment Status',
    'Number Of Hours Worked Each Week', 'Education Status',
    'Special Education Services',
    'Intellectual Disability', 'Autism Spectrum',
    'Other Developmental Disability', 'Alcohol Related Disorder',
    'Drug Substance Disorder', 'Opioid Related Disorder',
    'Mobility Impairment Disorder', 'Hearing Impairment',
    'Visual Impairment', 'Speech Impairment', 'Hyperlipidemia',
    'High Blood Pressure', 'Diabetes', 'Obesity', 'Heart Attack',
    'Stroke', 'Other Cardiac', 'Pulmonary Asthma',
    'Alzheimer or Dementia', 'Kidney Disease', 'Liver Disease',
    'Endocrine Condition', 'Neurological Condition',
    'Traumatic Brain Injury', 'Joint Disease', 'Cancer',
    'Other Chronic Med Condition', 'No Chronic Med Condition',
    'Unknown Chronic Med Condition', 'Cannabis Recreational Use',
    'Cannabis Medicinal Use', 'Smokes', 'Received Smoking Counseling',
    'Serious Mental Illness', 'Alcohol 12m Service',
    'Opioid 12m Service', 'Drug/Substance 12m Service',
    'Principal Diagnosis Class', 'Criminal Justice Status',
    'Region Served',
    'Mental Illness'
]

# --- 2. Load Data ---

try:
    data = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    exit()

# The data is already filtered for females, so we just select the master columns
female_data = data[master_columns].copy()

print(f"Female data shape: {female_data.shape}")
print("\nFemale data is ready for analysis!")

# --- 3. Exploratory Data Analysis (EDA) ---

print("\n--- EDA: Analysis of Missing Values (UNKNOWN) ---")
# Count the number of 'UNKNOWN' values for each column
for col in female_data.columns:
    unknown_count = (female_data[col] == 'UNKNOWN').sum()
    if unknown_count > 0:
        print(f"Column '{col}': {unknown_count} UNKNOWN values")

print("\n--- EDA: Geographic Disparities in UNKNOWN values ---")
# Analyze UNKNOWN values by region
regions = female_data['Region Served'].unique()
for col in female_data.columns:
    unknown_by_region = female_data[female_data[col] == 'UNKNOWN'].groupby('Region Served')[col].count()
    if not unknown_by_region.empty:
        print(f"\nUNKNOWN values in column '{col}' by Region:")
        print(unknown_by_region)

print("\n--- EDA: Class Distribution of the Target Variable ---")
# Check the distribution of the target variable 'Mental Illness'
target_distribution = female_data['Mental Illness'].value_counts()
print(target_distribution)

print("\n--- EDA Complete ---")
print("The EDA has been performed, and the results are ready for interpretation.")


Data loaded successfully!
Female data shape: (99244, 54)

Female data is ready for analysis!

--- EDA: Analysis of Missing Values (UNKNOWN) ---
Column 'Transgender': 6740 UNKNOWN values
Column 'Sexual Orientation': 12598 UNKNOWN values
Column 'Hispanic Ethnicity': 2902 UNKNOWN values
Column 'Living Situation': 4493 UNKNOWN values
Column 'Household Composition': 5226 UNKNOWN values
Column 'Preferred Language': 1460 UNKNOWN values
Column 'Veteran Status': 3882 UNKNOWN values
Column 'Education Status': 10759 UNKNOWN values
Column 'Special Education Services': 2282 UNKNOWN values
Column 'Intellectual Disability': 8506 UNKNOWN values
Column 'Autism Spectrum': 7819 UNKNOWN values
Column 'Other Developmental Disability': 7889 UNKNOWN values
Column 'Alcohol Related Disorder': 6033 UNKNOWN values
Column 'Drug Substance Disorder': 5983 UNKNOWN values
Column 'Opioid Related Disorder': 7548 UNKNOWN values
Column 'Mobility Impairment Disorder': 7351 UNKNOWN values
Column 'Hearing Impairment': 7726 