In [67]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
from scipy.stats import chi2_contingency
from data_loader import data_loader

import warnings
warnings.filterwarnings("ignore")

In [36]:
data = data_loader()

In [40]:
def normality_test(group1, group2):

    """
    This function takes in two groups and returns the shapiro and levene test results
    The shapiro test is used to test for normality (null hypothesis: data is normally distributed)
    The levene test is used to test for homogeneity of variance (null hypothesis: variances are equal)

    If the data is normally distributed we should see p-values above 0.05
    """

    mask1, mask2 =  ~np.isnan(group1), ~np.isnan(group2)
    group1_clean, group2_clean = group1[mask1], group2[mask2]

    shapiro_1, shap_p_1 = stats.shapiro(group1)
    shapiro_2, shap_p_2 = stats.shapiro(group2)

    levene_1, levene_p_1 = stats.levene(group1_clean, group2_clean)
    levene_2, levene_p_2 = stats.levene(group1_clean, group2_clean)

    print(f"Group 1 (shapiro p-value): {shap_p_1}")
    print(f"Group 2 (shapiro p-value): {shap_p_2}")

    print(f"Group 1 (levene p-value): {levene_p_1}")
    print(f"Group 2 (levene p-value): {levene_p_2}")


In [64]:
def statistical_test(group1, group2, normality=True):

    """
    This function takes in two groups and returns the t-test or mannwhitneyu test results
    The t-test is used to test for normality (null hypothesis: data is normally distributed)
    The mannwhitneyu test is used to test for homogeneity of variance (null hypothesis: variances are equal)
    """

    mask1, mask2 =  ~np.isnan(group1), ~np.isnan(group2)
    group1_clean, group2_clean = group1[mask1], group2[mask2]

    if normality:

        t_stat, p_val = stats.ttest_ind(group1_clean, group2_clean)
        print(f"t-statistic: {t_stat}")
        print(f"p-value (t-test): {p_val}")

    else:

        u_stat, p_val = stats.mannwhitneyu(group1_clean, group2_clean)
        print(f"u-statistic: {u_stat}")
        print(f"p-value (mannwhitneyu): {p_val}")

In [65]:
male_income = data[data.applicant_sex == 1]['applicant_income_000s']
female_income = data[data.applicant_sex == 2]['applicant_income_000s']

normality_test(male_income, female_income)
statistical_test(male_income, female_income, normality=True)

Group 1 (shapiro p-value): 1.0
Group 2 (shapiro p-value): 1.0
Group 1 (levene p-value): 0.2184017198539339
Group 2 (levene p-value): 0.2184017198539339
t-statistic: 1.6525441831389538
p-value (t-test): 0.09846373881805694


In [66]:
male_income = data[data.applicant_co_applicant_sex == '1_5']['applicant_income_000s']
female_income = data[data.applicant_co_applicant_sex == '2_5']['applicant_income_000s']

normality_test(male_income, female_income)
statistical_test(male_income, female_income, normality=False)

Group 1 (shapiro p-value): 1.0
Group 2 (shapiro p-value): 1.0
Group 1 (levene p-value): 3.9273910348713235e-06
Group 2 (levene p-value): 3.9273910348713235e-06
u-statistic: 2784332.5
p-value (mannwhitneyu): 1.3126951013388942e-30


In [69]:
loan_outcomes = ['Loan originated', 'Application denied by financial institution', 'Application approved but not accepted']
cont_table = pd.crosstab(data['applicant_sex'], data['action_taken'].isin(loan_outcomes))

# Perform a chi-squared test of independence
chi2_stat, p_val, dof, expected = chi2_contingency(cont_table)

# Print the test results
print("Chi-squared statistic:", chi2_stat)
print("p-value:", p_val)
print("Degrees of freedom:", dof)
print("Expected frequencies:\n", expected)

Chi-squared statistic: 0.0
p-value: 1.0
Degrees of freedom: 0
Expected frequencies:
 [[5594.]
 [2601.]
 [ 711.]
 [1094.]]


In [74]:
print(cont_table)

action_taken   False
applicant_sex       
1               5594
2               2601
3                711
4               1094
