In [1]:
pip install pandas factor_analyzer scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os

# Location of the survey CSV; this is machine-specific, so adjust it before running elsewhere.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female Social data .csv"

# Sanity check: report a missing file, otherwise load it and list its column names.
if not os.path.exists(file_path):
    print("Error: File not found. Please double-check the path.")
else:
    print("Success: File found!")
    df_check = pd.read_csv(file_path)
    # One call with a newline separator prints the heading and the list on separate lines.
    print("\nColumns in your file:", df_check.columns.tolist(), sep="\n")

Success: File found!

Columns in your file:
['Survey Year', 'Program Category', 'Region Served', 'Patient ID', 'Age Group', 'Sex', 'Transgender', 'Sexual Orientation', 'Hispanic Ethnicity', 'Race', 'Living Situation', 'Household Composition', 'Preferred Language', 'Religious Preference', 'Veteran Status', 'Employment Status', 'Number Of Hours Worked Each Week', 'Education Status', 'Special Education Services', 'Mental Illness']


In [None]:
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# --- Part 1: Data Preparation ---
# 1. Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female Social data .csv"
df = pd.read_csv(file_path)

# 2. Split off the target; every remaining column becomes a Factor Analysis input.
# NOTE(review): the column listing printed by the file-check cell includes
# 'Patient ID' and 'Survey Year'. They look like bookkeeping fields rather than
# social characteristics -- confirm they really belong in the factor analysis.
target_variable = 'Mental Illness'
df_features = df.drop(columns=[target_variable])

# 3. One-hot encode the non-numeric columns. dtype=float keeps the indicator
# columns numeric: recent pandas releases emit bool indicators by default, and
# bool columns mixed with numeric ones convert to an object array, which
# numeric routines may not handle.
df_processed = pd.get_dummies(df_features, drop_first=True, dtype=float)

# 4. Replace any remaining missing values with the mean of their column
df_processed = df_processed.fillna(df_processed.mean(numeric_only=True))

# --- Part 2: Factor Analysis ---
print("--- Running Factor Analysis Prerequisites ---")

# 5. Prerequisite checks. The regression further down in this cell needs the
# fitted model, so with fewer than two columns we stop here instead of printing
# a message and then attempting the fit anyway.
if df_processed.shape[1] < 2:
    raise ValueError("Not enough variables for Factor Analysis.")

# Kaiser-Meyer-Olkin statistic; this cell treats values below 0.6 as too low.
kmo_all, kmo_model = calculate_kmo(df_processed)
print(f"KMO Test Statistic: {kmo_model:.4f}")
if kmo_model < 0.6:
    print("Interpretation: KMO is too low. Factor Analysis may not be appropriate.\n")
else:
    print("Interpretation: KMO is acceptable.\n")

# Bartlett's test of sphericity; a p-value below 0.05 is reported as suitable.
chi_square_value, p_value = calculate_bartlett_sphericity(df_processed)
print(f"Bartlett's Test Chi-Square: {chi_square_value:.2f}")
print(f"Bartlett's Test p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Interpretation: The p-value is significant. Data is suitable.\n")
else:
    print("Interpretation: The p-value is not significant. Data is not suitable.\n")

# 6. Fit an exploratory model with one factor per variable.
# NOTE(review): this first fit is used only for its eigenvalues (its loadings
# are never read), so the varimax rotation here is probably unnecessary work.
fa = FactorAnalyzer(n_factors=df_processed.shape[1], rotation="varimax")
fa.fit(df_processed)

# 7. Retain the factors whose eigenvalue exceeds 1
eigenvalues, _ = fa.get_eigenvalues()
# Cast to a built-in int so the count is a plain Python value everywhere it is used.
num_factors = int((eigenvalues > 1).sum())
print(f"\nNumber of factors to retain (Eigenvalues > 1): {num_factors}")

# 8. Re-fit the model with the retained number of factors
fa_final = FactorAnalyzer(n_factors=num_factors, rotation="varimax")
fa_final.fit(df_processed)

# 9. Display Factor Loadings: one row per input column, one column per factor
print("\n--- Factor Loadings (Variable correlations with each factor) ---")
loadings_df = pd.DataFrame(
    fa_final.loadings_,
    index=df_processed.columns,
    columns=[f'Factor {i+1}' for i in range(num_factors)],
)
print(loadings_df)

# --- Part 3: Regression Analysis ---
print("\n--- Running Regression Analysis on the Factors ---")

# 10. Get the factor scores for each observation.
# The scores come back as a bare array; reuse the input frame's index so the
# target column below is joined by row label rather than by an implicit
# positional match.
factor_scores = fa_final.transform(df_processed)
factor_scores_df = pd.DataFrame(
    factor_scores,
    index=df_processed.index,
    columns=[f'Factor {i+1}' for i in range(num_factors)],
)

# 11. Prepare the data for regression: YES/NO become 1/0, and UNKNOWN (or any
# other value) becomes missing.
regression_df = factor_scores_df.copy()
regression_df[target_variable] = df[target_variable].map({'YES': 1, 'NO': 0, 'UNKNOWN': None})

# Drop rows with unknown values for the target variable (re-assigned rather
# than mutated in place, so re-running the cell is safe).
regression_df = regression_df.dropna(subset=[target_variable])
X = regression_df.drop(columns=[target_variable])
# After the drop only 0/1 remain, so the labels can be stored as integers.
y = regression_df[target_variable].astype(int)

# 12. Train and fit a Logistic Regression model on 70% of the rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Use the held-out 30%, which was previously created but never evaluated.
test_accuracy = model.score(X_test, y_test)
print(f"\nAccuracy on the held-out test split: {test_accuracy:.4f}")

# 13. Interpret the regression coefficients, largest magnitude first
print("\n--- Regression Coefficients of Factors Predicting Mental Illness ---")
regression_coefficients = pd.DataFrame(model.coef_.T, index=X.columns, columns=['Coefficient'])
regression_coefficients['Absolute Value'] = regression_coefficients['Coefficient'].abs()
regression_coefficients = regression_coefficients.sort_values(by='Absolute Value', ascending=False)

print("Higher absolute coefficient values indicate a stronger impact on the probability of mental illness.")
print(regression_coefficients)

In [None]:
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
import warnings
warnings.filterwarnings('ignore')

# --- Part 1: Data Preparation ---
# 1. Where the survey CSV lives, and which of its fields go into the analysis.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female Social data .csv"

# Only these fields are kept; everything else in the file is ignored.
columns_for_analysis = [
    'Living Situation',
    'Household Composition',
    'Employment Status',
    'Number Of Hours Worked Each Week',
    'Education Status',
    'Mental Illness',
]

# 2. Read the file and narrow it to the chosen fields.
try:
    df = pd.read_csv(file_path)
    df_selected = df[columns_for_analysis]
    print("Columns loaded successfully.")
except KeyError as missing_column:
    # A name in the list above does not exist in the file: explain, then
    # re-raise so the cell stops instead of continuing with missing data.
    print(f"Error: A column name was not found. Please check your spelling.")
    print(f"Details: {missing_column}")
    raise

# 3 + 4. One-hot encode the categorical fields (dropping the first level of
# each), then fill any remaining gaps with the mean of the affected column.
df_processed = (
    pd.get_dummies(df_selected, drop_first=True)
    .pipe(lambda encoded: encoded.fillna(encoded.mean(numeric_only=True)))
)

# --- Part 2: Factor Analysis ---
print("\n--- Running Factor Analysis Prerequisites ---")

# 5. Run prerequisite tests (KMO and Bartlett's Test).
# Everything that needs at least two variables lives in the else-branch, so the
# "not enough variables" case no longer falls through into the model fit.
if df_processed.shape[1] < 2:
    print("Not enough variables for Factor Analysis.")
else:
    # Kaiser-Meyer-Olkin statistic; this cell treats values below 0.6 as too low.
    kmo_all, kmo_model = calculate_kmo(df_processed)
    print(f"KMO Test Statistic: {kmo_model:.4f}")
    if kmo_model < 0.6:
        print("Interpretation: KMO is too low. Factor Analysis may not be appropriate.\n")
    else:
        print("Interpretation: KMO is acceptable.\n")

    # Bartlett's test of sphericity; a p-value below 0.05 is reported as suitable.
    chi_square_value, p_value = calculate_bartlett_sphericity(df_processed)
    print(f"Bartlett's Test Chi-Square: {chi_square_value:.2f}")
    print(f"Bartlett's Test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("Interpretation: The p-value is significant. Data is suitable.\n")
    else:
        print("Interpretation: The p-value is not significant. Data is not suitable.\n")

    # 6. Fit the Factor Analysis model with one factor per variable; this fit
    # is used only to obtain the eigenvalues.
    fa = FactorAnalyzer(n_factors=df_processed.shape[1], rotation="varimax")
    fa.fit(df_processed)

    # 7. Get Eigenvalues and retain the factors whose eigenvalue exceeds 1.
    eigenvalues, _ = fa.get_eigenvalues()
    # Cast to a built-in int so the count is a plain Python value everywhere it is used.
    num_factors = int((eigenvalues > 1).sum())
    print(f"\nNumber of factors to retain (Eigenvalues > 1): {num_factors}")

    # 8. Re-fit the model with the determined number of factors.
    fa_final = FactorAnalyzer(n_factors=num_factors, rotation="varimax")
    fa_final.fit(df_processed)

    # 9. Display Factor Loadings: one row per input column, one column per factor.
    print("\n--- Factor Loadings (Variable correlations with each factor) ---")
    loadings_df = pd.DataFrame(
        fa_final.loadings_,
        index=df_processed.columns,
        columns=[f'Factor {i+1}' for i in range(num_factors)],
    )
    print(loadings_df)

In [None]:
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
import warnings

warnings.filterwarnings('ignore')

# --- Part 1: Data Preparation ---
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female Social data .csv"

# Survey fields kept for the analysis.
columns_for_analysis = [
    'Living Situation',
    'Household Composition',
    'Employment Status',
    'Number Of Hours Worked Each Week',
    'Education Status',
    'Mental Illness',
]

# Read the CSV and keep only the listed fields. A name that is not present in
# the file is reported and the KeyError is re-raised so the cell stops.
try:
    df = pd.read_csv(file_path)
    df_selected = df[columns_for_analysis]
except KeyError as missing_column:
    print(" Error: A column name was not found. Please check your spelling.")
    print(f"Details: {missing_column}")
    raise
else:
    print(" Columns loaded successfully.")

# Turn categorical fields into indicator columns, dropping the first level of each.
df_processed = pd.get_dummies(df_selected, drop_first=True)

# Replace any missing values with the mean of their column.
df_processed = df_processed.fillna(value=df_processed.mean(numeric_only=True))

# --- Part 2: Factor Analysis ---
print("\n--- Running Factor Analysis Prerequisites ---")

# The tests and the model need at least two variables; with fewer, only the
# message is printed and nothing is fitted.
if df_processed.shape[1] < 2:
    print(" Not enough variables for Factor Analysis.")
else:
    # KMO Test: this cell treats a statistic of 0.6 or more as acceptable.
    kmo_all, kmo_model = calculate_kmo(df_processed)
    print(f"KMO Test Statistic: {kmo_model:.4f}")
    print("Interpretation:", " Acceptable" if kmo_model >= 0.6 else " Too low for Factor Analysis")

    # Bartlett's Test: a p-value below 0.05 is reported as suitable.
    chi_square_value, p_value = calculate_bartlett_sphericity(df_processed)
    print(f"Bartlett's Test Chi-Square: {chi_square_value:.2f}")
    print(f"Bartlett's Test p-value: {p_value:.4f}")
    # The opening quote of the else-branch string was missing, which made the
    # whole cell a SyntaxError; restored to match the sibling messages.
    print("Interpretation:", " Suitable for Factor Analysis" if p_value < 0.05 else " Not suitable")

    # Initial Factor Analysis (one factor per variable) to get eigenvalues
    fa = FactorAnalyzer(n_factors=df_processed.shape[1], rotation="varimax")
    fa.fit(df_processed)
    eigenvalues, _ = fa.get_eigenvalues()
    # Retain the factors whose eigenvalue exceeds 1.
    num_factors = sum(eigenvalues > 1)
    print(f"\nNumber of factors to retain (Eigenvalues > 1): {num_factors}")

    # Final Factor Analysis with the retained number of factors
    fa_final = FactorAnalyzer(n_factors=num_factors, rotation="varimax")
    fa_final.fit(df_processed)

    # Display Factor Loadings: one row per input column, one column per factor
    print("\n--- Factor Loadings (Variable correlations with each factor) ---")
    loadings_df = pd.DataFrame(
        fa_final.loadings_,
        index=df_processed.columns,
        columns=[f'Factor {i+1}' for i in range(num_factors)]
    )
    print(loadings_df)


In [None]:
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
import warnings

warnings.filterwarnings('ignore')

# --- Part 1: Data Preparation ---
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female Social data .csv"

# Fields of the survey file that take part in the analysis.
columns_for_analysis = [
    'Living Situation',
    'Household Composition',
    'Employment Status',
    'Number Of Hours Worked Each Week',
    'Education Status',
    'Mental Illness',
]

# Load the file and restrict it to the fields above; an unknown field name is
# explained to the user and then re-raised.
try:
    df = pd.read_csv(file_path)
    df_selected = df[columns_for_analysis]
    print("Columns loaded successfully.")
except KeyError as lookup_error:
    print("Error: A column name was not found. Please check your spelling.")
    print(f"Details: {lookup_error}")
    raise

# Indicator-encode the categorical fields (first level of each dropped).
df_processed = df_selected.pipe(pd.get_dummies, drop_first=True)

# Any value still missing is replaced by its column mean.
df_processed = df_processed.fillna(df_processed.mean(numeric_only=True))

# --- Part 2: Factor Analysis ---
print("\n--- Running Factor Analysis Prerequisites ---")

# The tests and the model are only run when there are at least two variables.
if df_processed.shape[1] >= 2:
    # Kaiser-Meyer-Olkin statistic: 0.6 or more is reported as acceptable.
    kmo_all, kmo_model = calculate_kmo(df_processed)
    print(f"KMO Test Statistic: {kmo_model:.4f}")
    if kmo_model >= 0.6:
        print("Interpretation:", "Acceptable")
    else:
        print("Interpretation:", "Too low for Factor Analysis")

    # Bartlett's test of sphericity: a p-value below 0.05 is reported as suitable.
    chi_square_value, p_value = calculate_bartlett_sphericity(df_processed)
    print(f"Bartlett's Test Chi-Square: {chi_square_value:.2f}")
    print(f"Bartlett's Test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("Interpretation:", "Suitable for Factor Analysis")
    else:
        print("Interpretation:", "Not suitable")

    # First pass: one factor per variable, fitted only to read the eigenvalues.
    fa = FactorAnalyzer(n_factors=len(df_processed.columns), rotation="varimax")
    fa.fit(df_processed)
    eigenvalues, _ = fa.get_eigenvalues()

    # Keep the factors whose eigenvalue is above 1.
    num_factors = (eigenvalues > 1).sum()
    print(f"\nNumber of factors to retain (Eigenvalues > 1): {num_factors}")

    # Second pass: the model that is actually reported.
    fa_final = FactorAnalyzer(n_factors=num_factors, rotation="varimax")
    fa_final.fit(df_processed)

    # Loadings table: one row per input column, one column per retained factor.
    print("\n--- Factor Loadings (Variable correlations with each factor) ---")
    loadings_df = pd.DataFrame(
        fa_final.loadings_,
        index=df_processed.columns,
        columns=[f'Factor {i}' for i in range(1, num_factors + 1)],
    )
    print(loadings_df)
else:
    print("Not enough variables for Factor Analysis.")
