In [1]:
%run 01_US_Public_School_Dataset.ipynb

### Create Color Palette

In [2]:
# Ten hex colours used as the plotting palette for this notebook.
mich_palette = ['#2F65A7', '#FFCB05', '#00274C', '#CFC096', '#75988d', '#575294', '#989C97', '#00B2A9', '#702082', '#655A52']
# Register the colours as the current seaborn palette.
# NOTE(review): `sns` is not imported in this notebook; presumably it is brought
# into scope by the %run of 01_US_Public_School_Dataset.ipynb above - confirm.
sns.set_palette(mich_palette)

### Function to calculate school retention rate

In [3]:
def calculate_school_retention_percentage(df_cleaned):
    """Compute each school's (row's) overall retention percentage over KG-G12.

    A grade contributes to a row only when its male retention count
    (``TOT_RET_<grade>_M``), female retention count (``TOT_RET_<grade>_F``)
    and enrollment (``<grade>``) are all non-null for that row. The
    percentage is ``retained / enrolled * 100`` summed over the contributing
    grades. A row with at least one contributing grade but zero summed
    enrollment gets 0; a row with no contributing grade gets NaN.

    The computation is positional and vectorized, so rows are handled
    independently even when the index contains duplicate labels (a
    label-based ``df.at[index, ...]`` write-back would overwrite every row
    sharing a label).

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        One row per school. **Mutated in place**: a ``RETENTION_PERCENTAGE``
        column is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``df_cleaned`` whose ``RETENTION_PERCENTAGE``
        is not null.
    """
    # Grade columns
    grade_enroll_cols = ['KG', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12']

    n_rows = len(df_cleaned)
    total_retention = np.zeros(n_rows, dtype=float)
    total_enrollment = np.zeros(n_rows, dtype=float)
    grades_with_retention = np.zeros(n_rows, dtype=int)

    for grade in grade_enroll_cols:
        male_col = f'TOT_RET_{grade}_M'
        female_col = f'TOT_RET_{grade}_F'

        # A grade without both retention columns can never contribute.
        if male_col not in df_cleaned.columns or female_col not in df_cleaned.columns:
            continue

        retention_male = df_cleaned[male_col].to_numpy(dtype=float, na_value=np.nan)
        retention_female = df_cleaned[female_col].to_numpy(dtype=float, na_value=np.nan)

        # A missing enrollment column is treated as zero enrollment for the grade.
        if grade in df_cleaned.columns:
            enrollment = df_cleaned[grade].to_numpy(dtype=float, na_value=np.nan)
        else:
            enrollment = np.zeros(n_rows, dtype=float)

        # Only rows where all three values are present contribute for this grade.
        usable = ~(np.isnan(retention_male) | np.isnan(retention_female) | np.isnan(enrollment))

        total_retention += np.where(usable, retention_male + retention_female, 0.0)
        total_enrollment += np.where(usable, enrollment, 0.0)
        grades_with_retention += usable

    # np.where evaluates both branches, so silence the 0/0 and x/0 warnings
    # for rows that end up taking the fallback value anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        retention_percentage = np.where(total_enrollment > 0,
                                        (total_retention / total_enrollment) * 100,
                                        0.0)

    # Rows with no usable grade stay NaN and are filtered out below.
    retention_percentage = np.where(grades_with_retention > 0, retention_percentage, np.nan)
    df_cleaned['RETENTION_PERCENTAGE'] = retention_percentage

    # Filter out rows with null retention percentages
    return df_cleaned[df_cleaned['RETENTION_PERCENTAGE'].notnull()].copy()

### Function to create individual grade dataframe

In [4]:
def create_grade_df(df_cleaned):
    """Build one DataFrame per grade (KG through G12) from the cleaned data.

    Each per-grade frame holds a fixed set of shared columns plus every
    column of ``df_cleaned`` whose name contains the grade label. Rows with
    a NaN in any of those grade-specific columns are dropped.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Source data; it is not modified (each per-grade frame is a copy).

    Returns
    -------
    dict
        Maps grade label -> per-grade DataFrame, in KG..G12 order.
    """
    # Columns carried into every per-grade frame
    shared_columns = ['NCES_SCH_ID','CHARTER','MAGNET','VIRTUAL','SCHOOL_LEVEL',
                      'STITLEI','SCHOOL_TYPE','LOCAL_CATEGORY','TOT_FREE_LUNCH',
                      'TOT_ENROLL','TOT_TEACHERS','STU_TEA_RATIO','AMALM','AMALF','ASALM','ASALF',
                      'BLALM','BLALF','HPALM','HPALF','HIALM','HIALF','TRALM',
                      'TRALF','WHALM','WHALF','LATCOD','LONCOD','LEA_STATE_NAME']

    # Grades for which a separate dataframe is created
    grade_labels = ['KG', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12']

    def _frame_for(grade_label):
        # Every column whose name contains the grade label
        grade_specific = list(df_cleaned.filter(like=grade_label).columns)

        # Copy so the original dataframe is left untouched
        per_grade = df_cleaned[shared_columns + grade_specific].copy()

        # Keep only rows complete in the grade-specific columns
        return per_grade.dropna(subset=grade_specific)

    return {grade_label: _frame_for(grade_label) for grade_label in grade_labels}

### Function to calculate grade retention rate

In [5]:
def calculate_grade_retention_rate(df):
    """Compute the combined (male + female) retention rate for each grade.

    Parameters
    ----------
    df : dict
        Maps grade label -> DataFrame containing the columns
        ``TOT_RET_<grade>_M``, ``TOT_RET_<grade>_F`` and ``<grade>``
        (the grade's enrollment).

    Returns
    -------
    list
        One percentage per grade, in the mapping's iteration order. A grade
        is skipped entirely (no entry appended) when any of its three
        columns has no non-null value, so the list can be shorter than the
        number of grades. A grade whose summed enrollment is not positive
        contributes 0.
    """
    rates = []

    for grade, grade_df in df.items():
        male_col = f'TOT_RET_{grade}_M'
        female_col = f'TOT_RET_{grade}_F'

        # Require at least one non-null value in each of the three columns
        has_data = (
            grade_df[[male_col]].notnull().sum().sum() > 0
            and grade_df[[female_col]].notnull().sum().sum() > 0
            and grade_df[grade].notnull().sum() > 0
        )
        if not has_data:
            continue

        # Retained students (male + female) and enrollment for the grade
        retained = grade_df[male_col].sum() + grade_df[female_col].sum()
        enrolled = grade_df[grade].sum()

        if enrolled > 0:
            rates.append((retained / enrolled) * 100)
        else:
            rates.append(0)

    return rates

### Function to calculate state retention rate

In [6]:
def calculate_state_retention_rate(df_cleaned):
    """Compute the combined retention rate for each state.

    For every distinct ``LEA_STATE_NAME`` value, the male and female
    ``TOT_RET_<grade>_<M|F>`` counts (KG-G12) are summed over the state's
    rows and divided by the state's summed ``TOT_ENROLL``.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Must contain ``LEA_STATE_NAME``, ``TOT_ENROLL`` and all 26
        ``TOT_RET_<grade>_<M|F>`` columns. It is only read.

    Returns
    -------
    list
        One percentage per state, in order of first appearance. A state is
        skipped when its male retention columns, female retention columns
        or ``TOT_ENROLL`` contain no non-null value; a state whose summed
        enrollment is not positive contributes 0.
    """
    grade_labels = ['KG', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12']
    gender_codes = ['M', 'F']

    # Keep only the state name, the retention counts and the total enrollment
    wanted_columns = ['LEA_STATE_NAME']
    for grade in grade_labels:
        for code in gender_codes:
            wanted_columns.append(f'TOT_RET_{grade}_{code}')
    wanted_columns.append('TOT_ENROLL')
    retention_view = df_cleaned[wanted_columns]

    rates = []
    for state in retention_view['LEA_STATE_NAME'].unique():
        state_rows = retention_view[retention_view['LEA_STATE_NAME'] == state]

        male_counts = state_rows.filter(like='_M')
        female_counts = state_rows.filter(like='_F')
        enrollment = state_rows['TOT_ENROLL']

        # Skip states lacking any male, female or enrollment data
        has_data = (
            male_counts.notnull().sum().sum() > 0
            and female_counts.notnull().sum().sum() > 0
            and enrollment.notnull().sum() > 0
        )
        if not has_data:
            continue

        retained = male_counts.sum().sum() + female_counts.sum().sum()
        enrolled = enrollment.sum()

        if enrolled > 0:
            rates.append((retained / enrolled) * 100)
        else:
            rates.append(0)

    return rates

### Function to calculate gender retention rate

In [7]:
def calculate_gender_retention_rate(df_cleaned):
    """Add per-row male and female enrollment, retention and rate columns.

    The frame is **modified in place** and also returned. Columns added, in
    this order: ``total_enroll_female``, ``total_enroll_male``,
    ``total_ret_male``, ``total_ret_female``, ``ret_rate_female``,
    ``ret_rate_male``.

    * Enrollment per gender is the row-wise sum of the seven
      ``<race>AL<F|M>`` columns (NaN skipped).
    * Retention per gender is the sum of ``TOT_RET_<grade>_<M|F>`` over
      KG-G12 with NaN counted as 0.
    * A rate is ``retained / enrolled * 100``, or NaN when the enrollment
      is not positive.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        One row per school with the race/gender enrollment and
        ``TOT_RET_*`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df_cleaned`` object.
    """
    grade_labels = ['KG', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12']
    race_codes = ['AM', 'AS', 'BL', 'HP', 'HI', 'TR', 'WH']

    # Enrollment per gender, summed across all races (female column first)
    for label, suffix in (('female', 'F'), ('male', 'M')):
        enroll_columns = [f'{race}AL{suffix}' for race in race_codes]
        df_cleaned[f'total_enroll_{label}'] = df_cleaned[enroll_columns].sum(axis=1, skipna=True)

    # Retention per gender, accumulated grade by grade (male column first)
    retention_targets = (('male', 'M'), ('female', 'F'))
    for label, _ in retention_targets:
        df_cleaned[f'total_ret_{label}'] = 0

    for grade in grade_labels:
        for label, suffix in retention_targets:
            df_cleaned[f'total_ret_{label}'] += df_cleaned[f'TOT_RET_{grade}_{suffix}'].fillna(0)

    # Drop rows with NaN in any of the four totals before computing rates
    df_cleaned.dropna(
        subset=['total_enroll_female', 'total_enroll_male', 'total_ret_female', 'total_ret_male'],
        inplace=True,
    )

    # np.where keeps zero-enrollment rows as NaN instead of dividing by zero
    for label in ('female', 'male'):
        enrolled = df_cleaned[f'total_enroll_{label}']
        retained = df_cleaned[f'total_ret_{label}']
        df_cleaned[f'ret_rate_{label}'] = np.where(enrolled > 0,
                                                   (retained / enrolled) * 100,
                                                   np.nan)

    return df_cleaned

### Function to calculate race retention rate

In [8]:
def calculate_race_retention_rate(df_cleaned):
    """Add per-row enrollment, retention and retention-rate columns per race.

    The frame is **modified in place** and also returned:

    * NaN in the race enrollment columns (``<race>ALF`` / ``<race>ALM``) and
      in the ``SCH_RET_<grade>_<race>_<M|F>`` columns is replaced by 0.
    * For each race code, ``total_enroll_<race>`` (female + male enrollment)
      and ``total_ret_<race>`` (retention summed over KG-G12 and both
      genders) are added.
    * ``ret_rate_<race>`` is ``total_ret / total_enroll * 100``, or NaN when
      the enrollment is not positive.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        One row per school with the race enrollment and ``SCH_RET_*``
        columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df_cleaned`` object.
    """
    grade_labels = ['KG', 'G01', 'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10', 'G11', 'G12']
    race_codes = ['AM', 'AS', 'BL', 'HP', 'HI', 'TR', 'WH']

    # Zero-fill every input column so the sums below cannot turn into NaN
    female_enroll_cols = [f'{race}ALF' for race in race_codes]
    male_enroll_cols = [f'{race}ALM' for race in race_codes]
    male_ret_cols = [f'SCH_RET_{grade}_{race}_M' for race in race_codes for grade in grade_labels]
    female_ret_cols = [f'SCH_RET_{grade}_{race}_F' for race in race_codes for grade in grade_labels]
    input_cols = female_enroll_cols + male_enroll_cols + male_ret_cols + female_ret_cols
    df_cleaned[input_cols] = df_cleaned[input_cols].fillna(0)

    # Per school: total enrollment and total retention count for each race
    for race in race_codes:
        enroll_total = f'total_enroll_{race}'
        ret_total = f'total_ret_{race}'

        # Race enrollment columns are named "<race>ALF" and "<race>ALM"
        df_cleaned[enroll_total] = df_cleaned[f'{race}ALF'] + df_cleaned[f'{race}ALM']

        # Accumulate both genders' retention for this race, KG to G12
        df_cleaned[ret_total] = 0
        for grade in grade_labels:
            grade_retained = df_cleaned[f'SCH_RET_{grade}_{race}_M'] + df_cleaned[f'SCH_RET_{grade}_{race}_F']
            df_cleaned[ret_total] = df_cleaned[ret_total] + grade_retained

    # Drop rows where any per-race total is NaN
    total_cols = [f'total_enroll_{race}' for race in race_codes] + [f'total_ret_{race}' for race in race_codes]
    df_cleaned.dropna(subset=total_cols, inplace=True)

    # Retention rate per race; zero-enrollment rows stay NaN
    for race in race_codes:
        enrolled = df_cleaned[f'total_enroll_{race}']
        retained = df_cleaned[f'total_ret_{race}']
        df_cleaned[f'ret_rate_{race}'] = np.where(enrolled > 0,
                                                  (retained / enrolled) * 100,
                                                  np.nan)

    return df_cleaned

### Function to calculate grade retention rate by gender

In [9]:
def grade_retention_by_gender(df):
    """Compute male and female retention percentages for each grade.

    For a grade, the numerator is the sum of ``TOT_RET_<grade>_M`` (or
    ``_F``) and the denominator is the sum of the seven ``<race>ALM`` (or
    ``<race>ALF``) enrollment columns over the grade's frame.

    Parameters
    ----------
    df : dict
        Maps grade label -> DataFrame holding the retention and race/gender
        enrollment columns for that grade.

    Returns
    -------
    tuple of (list, list)
        ``(male_percentages, female_percentages)`` in the mapping's
        iteration order. A grade is skipped in both lists when either
        retention column has no non-null value or either gender's summed
        enrollment is not positive.
    """
    male_enroll_cols = ['AMALM','ASALM','BLALM','HPALM','HIALM','TRALM','WHALM']
    female_enroll_cols = ['AMALF','ASALF','BLALF','HPALF','HIALF','TRALF','WHALF']

    male_percentages = []
    female_percentages = []

    for grade, grade_df in df.items():
        male_ret_col = f'TOT_RET_{grade}_M'
        female_ret_col = f'TOT_RET_{grade}_F'

        # Need retention data for both genders and positive enrollment for both
        usable = (
            grade_df[[male_ret_col]].notnull().sum().sum() > 0
            and grade_df[[female_ret_col]].notnull().sum().sum() > 0
            and grade_df[male_enroll_cols].sum().sum() > 0
            and grade_df[female_enroll_cols].sum().sum() > 0
        )
        if not usable:
            continue

        # Retained students per gender
        retained_male = grade_df[male_ret_col].sum()
        retained_female = grade_df[female_ret_col].sum()

        # Enrollment per gender; the guard above guarantees both are positive
        enrolled_male = grade_df[male_enroll_cols].sum().sum()
        enrolled_female = grade_df[female_enroll_cols].sum().sum()

        male_percentages.append((retained_male / enrolled_male) * 100)
        female_percentages.append((retained_female / enrolled_female) * 100)

    return male_percentages, female_percentages


### Function to calculate grade retention rate by race

In [10]:
def grade_retention_by_race(df):
    """Compute the retention percentage of each race for each grade.

    For a grade and race, the numerator is the summed
    ``SCH_RET_<grade>_<race>_M`` + ``SCH_RET_<grade>_<race>_F`` and the
    denominator is the summed ``<race>ALM`` + ``<race>ALF`` over the grade's
    frame. When that enrollment is not positive the percentage is 0
    (matching the other retention helpers in this notebook) rather than the
    ``inf``/``nan`` an unguarded division would produce.

    Parameters
    ----------
    df : dict
        Maps grade label -> DataFrame holding the per-race retention and
        enrollment columns for that grade.

    Returns
    -------
    dict
        Maps race code -> list of percentages, one per grade in the
        mapping's iteration order.
    """
    races = ['AM', 'AS', 'BL', 'HP', 'HI', 'TR', 'WH']
    total_retention_race_percent = {race: [] for race in races}

    for grade, grade_df in df.items():
        # Column-wise totals over all rows of this grade's frame
        total_enroll_by_race_male = grade_df[[f'{race}ALM' for race in races]].sum()
        total_enroll_by_race_female = grade_df[[f'{race}ALF' for race in races]].sum()
        tot_ret_male = grade_df[[f'SCH_RET_{grade}_{race}_M' for race in races]].sum()
        tot_ret_female = grade_df[[f'SCH_RET_{grade}_{race}_F' for race in races]].sum()

        for race in races:
            retained = tot_ret_male[f'SCH_RET_{grade}_{race}_M'] + tot_ret_female[f'SCH_RET_{grade}_{race}_F']
            enrolled = total_enroll_by_race_male[f'{race}ALM'] + total_enroll_by_race_female[f'{race}ALF']

            # Guard against races with no enrollment in this grade's frame
            percent = retained / enrolled * 100 if enrolled > 0 else 0
            total_retention_race_percent[race].append(percent)

    return total_retention_race_percent

### Function to remove extreme outliers

In [11]:
def remove_outliers(df, max_enroll=5000, max_free_lunch=4000, max_teachers=300,
                    min_teachers=1, max_stu_tea_ratio=500, max_sch_tot_ret=750,
                    max_retention_percentage=100):
    """Drop rows whose key metrics fall outside the given bounds.

    Side effect: a ``SCH_TOT_RET`` column (row-wise sum of every column
    whose name starts with ``TOT_RET_``) is added to ``df`` in place before
    filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TOT_ENROLL``, ``TOT_FREE_LUNCH``, ``TOT_TEACHERS``,
        ``STU_TEA_RATIO`` and ``RETENTION_PERCENTAGE``.
    max_enroll : number, default 5000
        Exclusive upper bound on ``TOT_ENROLL``.
    max_free_lunch : number, default 4000
        Exclusive upper bound on ``TOT_FREE_LUNCH``.
    max_teachers : number, default 300
        Exclusive upper bound on ``TOT_TEACHERS``.
    min_teachers : number, default 1
        Inclusive lower bound on ``TOT_TEACHERS``.
    max_stu_tea_ratio : number, default 500
        Exclusive upper bound on ``STU_TEA_RATIO``.
    max_sch_tot_ret : number, default 750
        Exclusive upper bound on ``SCH_TOT_RET``.
    max_retention_percentage : number, default 100
        Exclusive upper bound on ``RETENTION_PERCENTAGE``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` satisfying every bound. Rows with NaN in any
        compared column are dropped, since NaN comparisons are False.
    """
    # Columns named 'TOT_RET_<grade>_F' / 'TOT_RET_<grade>_M'.
    # 'SCH_TOT_RET' itself does not start with 'TOT_RET_', so calling this
    # function again on the same frame does not double count.
    grade_retention_columns = [col for col in df.columns if col.startswith('TOT_RET_')]

    # Calculate total retention count for each school
    df['SCH_TOT_RET'] = df[grade_retention_columns].sum(axis=1)

    within_bounds = (
        (df['TOT_ENROLL'] < max_enroll) &
        (df['TOT_FREE_LUNCH'] < max_free_lunch) &
        (df['TOT_TEACHERS'] < max_teachers) &
        (df['TOT_TEACHERS'] >= min_teachers) &
        (df['STU_TEA_RATIO'] < max_stu_tea_ratio) &
        (df['SCH_TOT_RET'] < max_sch_tot_ret) &
        (df['RETENTION_PERCENTAGE'] < max_retention_percentage)
    )

    return df[within_bounds]

### Function to create double bar chart

In [12]:
def create_double_barchart(data=None, var1=None, var2=None, var1_label=None, var2_label=None, xlabel=None, ylabel=None, title=None, ymax=100):
    """Draw a grouped (side-by-side) bar chart of two series and show it.

    Parameters
    ----------
    data : object with ``len()`` and ``.index``
        Supplies the number of bar groups and the x tick labels.
    var1, var2 : sequence of numbers
        Bar heights for the first and second series.
    var1_label, var2_label : str
        Legend labels for the two series.
    xlabel, ylabel, title : str
        Axis labels and chart title.
    ymax : number, default 100
        Upper limit of the y axis (the lower limit is 0).

    Returns
    -------
    The ``plt`` object used for drawing, returned after ``plt.show()``.
    NOTE(review): ``plt`` is not imported in this notebook; presumably it is
    matplotlib.pyplot from the %run notebook - confirm.
    """
    bar_width = 0.35

    # The second series sits one bar width to the right of the first;
    # ticks are centred between the two bars of each group.
    first_positions = range(len(data))
    second_positions = [x + bar_width for x in first_positions]
    tick_positions = [x + bar_width/2 for x in first_positions]

    # Create the bar chart
    first_bars = plt.bar(first_positions, var1, width=bar_width, label=var1_label)
    second_bars = plt.bar(second_positions, var2, width=bar_width, label=var2_label)

    # Set the x-axis labels
    plt.xticks(tick_positions, data.index)

    # Write the value as a percentage just above every bar (first series, then second)
    for bar in (*first_bars, *second_bars):
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2
        plt.text(label_x, bar_height, f'{bar_height:.2f}%', ha='center', va='bottom', size=8)

    # Set labels and legend
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.ylim([0,ymax])
    plt.title(title)
    plt.legend(loc='best')

    # Show the plot
    plt.show()
    return plt

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=edfebf59-8cf0-4b7e-b1cf-d80b21ef0191' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>