# Run the data cleaning and functions notebook 

In [18]:
%run 02_User_Defined_Functions.ipynb


# import libraries in case needed
import pandas as pd
from scipy import stats
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.stats.multicomp as multi
from scipy.stats import levene
from scipy.stats import mannwhitneyu
from scipy.stats import f_oneway
import scikit_posthocs as sp


# Call functions to remove extreme outliers

In [19]:
# Call the function that calculates each school's average retention rate
df_cleaned = calculate_school_retention_percentage(df_cleaned)

# Call function to remove extreme outliers before any statistical analysis
df_cleaned = remove_outliers(df_cleaned)

In [20]:
#df_cleaned.columns.tolist()

# Gender (Female vs. Male)

## Understand the format 

Pasted here as a reminder.

In [21]:
# Each school's enrollment broken down by grade are in these cols.
grades = ['KG', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12']

# These are the 7 racial groups in the data.
races = ['AM', 'AS', 'BL', 'HP', 'HI', 'TR', 'WH']

# Gender categories. It has the letter F or M in the enrollment and retention count cols. 
genders = ['F','M']

# Each school's enrollment broken down by race and gender are in these cols. 
enrollment_cols = ['AMALM','AMALF','ASALM','ASALF','BLALM','BLALF','HPALM','HPALF','HIALM','HIALF',
'TRALM','TRALF','WHALM','WHALF']
# Col name ormat: race+AL+F (for female) or race+AL+M (for male)


# Each school's retention count broken down by grade, race, and gender in these cols:
#'SCH_RET_KG_HI_M','SCH_RET_KG_HI_F','SCH_RET_KG_AM_M','SCH_RET_KG_AM_F', ...'SCH_RET_G01_AM_F'...
# Col name format: SCH_RET_grade_race_gender

## Step 1: Calculate female and male retention rate 

In [22]:
# Call function that calculate gender retention rates 
df_cleaned=calculate_gender_retention_rate(df_cleaned)

print (df_cleaned['ret_rate_female'].mean())
print (df_cleaned['ret_rate_male'].mean())

2.8581509697755854
3.552411454787485


## Step 2: Check assumption_normal distribution 

In [23]:
# The charts show roughly normal distribution. Commented out to save graphic space. 
'''
# Distribution of female retention rates
plt.hist(df_cleaned['ret_rate_female'].dropna(),label='Female Retention Rates')
plt.title('Histogram of Female Retention Rates')
plt.xlabel('Retention Rate (%)')
plt.ylabel('Frequency')
plt.show()

# Distribution of  male retention rates
plt.hist(df_cleaned['ret_rate_male'].dropna(),label='Male Retention Rates')
plt.title('Histogram of Male Retention Rates')
plt.xlabel('Retention Rate (%)')
plt.ylabel('Frequency')
plt.show()

'''

"\n# Distribution of female retention rates\nplt.hist(df_cleaned['ret_rate_female'].dropna(),label='Female Retention Rates')\nplt.title('Histogram of Female Retention Rates')\nplt.xlabel('Retention Rate (%)')\nplt.ylabel('Frequency')\nplt.show()\n\n# Distribution of  male retention rates\nplt.hist(df_cleaned['ret_rate_male'].dropna(),label='Male Retention Rates')\nplt.title('Histogram of Male Retention Rates')\nplt.xlabel('Retention Rate (%)')\nplt.ylabel('Frequency')\nplt.show()\n\n"

--> The data have roughly normal distribution, although there is a longer tail on the right.

## Step 2: Check assumption_equal variance

In [24]:
stat, p = levene(df_cleaned['ret_rate_female'].dropna(), df_cleaned['ret_rate_male'].dropna())

print('Levene’s test statistic:', stat)
print('p-value:', p)

Levene’s test statistic: 112.72661291662762
p-value: 2.577312193585689e-26


## Step 3: Run Mann Whitney U test 

Because we don't have equal variance, run Mann u test - the non-parametric version of standard t test.

In [25]:
stat, p = mannwhitneyu(df_cleaned['ret_rate_female'].dropna(), df_cleaned['ret_rate_male'].dropna())
print(f"U statistic: {stat}")
print(f"P-value: {p}")

U statistic: 717720726.5
P-value: 5.864798031661489e-221


# Race (7 categories)

In [26]:
races = ['AM', 'AS', 'BL', 'HP', 'HI', 'TR', 'WH']

## Step 1: Calculate retention rate for racial groups

In [27]:
# Call function that calculates race retention rates 
df_cleaned=calculate_race_retention_rate(df_cleaned)

# Print the mean retentio rate for each race (just to confirm)
for race in races:
    print(f"{race}: {df_cleaned[f'ret_rate_{race}'].mean():.2f}%")

AM: 3.66%
AS: 1.62%
BL: 3.85%
HP: 2.71%
HI: 3.47%
TR: 3.25%
WH: 2.97%


## Step 2: Check assumption_normal distribution

In [28]:
# Graphics shows roughtly normal distribution. Commented out due to graphics taking a lot of space. 

'''
def plot_histogram(data, title, xlabel):
    plt.hist(data, bins=20)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.show()

for race in races:
    column_name = f'ret_rate_{race}'
    df_cleaned[column_name] = df_cleaned[column_name].replace([np.inf, -np.inf], np.nan)
    retention_rates = df_cleaned[column_name].dropna()
    
    plot_histogram(retention_rates, f'Retention Rates Distribution for {race}', 'Retention Rate (%)')

'''

"\ndef plot_histogram(data, title, xlabel):\n    plt.hist(data, bins=20)\n    plt.title(title)\n    plt.xlabel(xlabel)\n    plt.ylabel('Frequency')\n    plt.show()\n\nfor race in races:\n    column_name = f'ret_rate_{race}'\n    df_cleaned[column_name] = df_cleaned[column_name].replace([np.inf, -np.inf], np.nan)\n    retention_rates = df_cleaned[column_name].dropna()\n    \n    plot_histogram(retention_rates, f'Retention Rates Distribution for {race}', 'Retention Rate (%)')\n\n"

--> The data is roughly normally distributed, although not perfect. 

## Step 2: Check assumption_equal variances 

In [29]:
# Extract retention rates for each race and add to a list
retention_rates_by_race = []

# Loop through each race 
for race in races:
    retention_rates = df_cleaned[f'ret_rate_{race}'].dropna()
    retention_rates_by_race.append(retention_rates)


# Perform Levene's test
stat, p_value = levene(*retention_rates_by_race) # unpack all items in the list with *

print(f"Levene's test statistic: {stat}")
print(f"P-value: {p_value}")

Levene's test statistic: 148.89540386399136
P-value: 2.4503042818742784e-189


## Step 3: Perform Kruskal-Wallis H test 

Since we don't have equal variance, we do this test - the non-parametric version of ANOVA.

In [30]:
from scipy.stats import kruskal

stat, p = kruskal(*retention_rates_by_race)

print(f"Kruskal-Wallis H Test statistic: {stat}")
print(f"P-value: {p}")

Kruskal-Wallis H Test statistic: 34859.019484470096
P-value: 0.0


### Conduct post hoc Dunn's test: compare MEDIAN 

Since there is a significant difference, we do follow up tests to see which groups differ. 

In [31]:
# For each race, calculate their median. Use 2 decimal places.
for race in races:
    print (f"{race}: {df_cleaned[f'ret_rate_{race}'].median():.2f}%") 


AM: 0.00%
AS: 0.00%
BL: 0.00%
HP: 0.00%
HI: 0.52%
TR: 0.00%
WH: 0.83%


In [32]:
# Conduct Dunn's test 
data = [df_cleaned[f'ret_rate_{race}'].dropna() for race in races]

# Conduct Dunn's test for pairwise comparisons
p_values = sp.posthoc_dunn(data, p_adjust='bonferroni')

print("Dunn's test p-values to compare medians:")
print(p_values)


Dunn's test p-values to compare medians:
              1             2             3             4              5    6   
1  1.000000e+00  1.000000e+00  0.000000e+00  2.020958e-31   0.000000e+00  0.0  \
2  1.000000e+00  1.000000e+00  0.000000e+00  2.897803e-30   0.000000e+00  0.0   
3  0.000000e+00  0.000000e+00  1.000000e+00  0.000000e+00   1.058164e-93  0.0   
4  2.020958e-31  2.897803e-30  0.000000e+00  1.000000e+00   0.000000e+00  0.0   
5  0.000000e+00  0.000000e+00  1.058164e-93  0.000000e+00   1.000000e+00  0.0   
6  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   0.000000e+00  1.0   
7  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  6.732532e-180  0.0   

               7  
1   0.000000e+00  
2   0.000000e+00  
3   0.000000e+00  
4   0.000000e+00  
5  6.732532e-180  
6   0.000000e+00  
7   1.000000e+00  


-> Some racial groups' MEDIAN retention rates differ from each other  (p<.05), but some don't (p>.05).

# Limited English Proficiency (LEP)

In [33]:

# Filter for LEP (Limited English Proficiency) columns 
df_cleaned_lep = df_cleaned.filter(like='LEP')

# Add school total retention to the filtered DataFrame
df_cleaned_lep['SCH_TOT_RET'] = df_cleaned['SCH_TOT_RET']  # from df_cleaned earlier

# Calcuate total male LEP students
df_cleaned_lep_male = df_cleaned_lep.filter(like='M')
df_cleaned_lep_male_total = df_cleaned_lep_male.sum(axis=1)

# Calcuate total female LEP students
df_cleaned_lep_female = df_cleaned_lep.filter(like='F')
df_cleaned_lep_female_total = df_cleaned_lep_female.sum(axis=1)

# Calculate total LEP students by summing up male and female totals
df_cleaned_lep_total = df_cleaned_lep_male_total + df_cleaned_lep_female_total
df_cleaned_lep['LEP_TOTAL'] = df_cleaned_lep_total

# Drop rows with NA values 
df_cleaned_lep = df_cleaned_lep.dropna(subset=['LEP_TOTAL', 'SCH_TOT_RET'])


In [34]:
# Set up data for x and y axis
x = df_cleaned_lep['LEP_TOTAL']  
y = df_cleaned_lep['SCH_TOT_RET']  

# Perform regression 
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

# Print regression results including r-squared (effect size)
(slope, intercept, r_value**2, p_value, std_err)  


(3.0625863145987386,
 10.342875899961307,
 0.5397470937544588,
 0.0,
 0.014029644382435008)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=edfebf59-8cf0-4b7e-b1cf-d80b21ef0191' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>