In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import os

# Files to load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the school and student CSV files into pandas DataFrames
school_sum = pd.read_csv(school_data_to_load)
students_sum = pd.read_csv(student_data_to_load)

# Combine the two tables into one student-level dataset. The left join keeps
# every row of students_sum and attaches the columns of the school_sum row with
# the same "school_name". The key is named once; listing it twice in `on` (as
# before) adds nothing.
merged_data = pd.merge(students_sum, school_sum, how="left", on="school_name")

# merged_data1 used to be produced by running the identical merge a second
# time. An independent copy gives the same frame without repeating the join.
merged_data1 = merged_data.copy()

In [2]:
# District-wide headline figures, computed from the student-level merged_data
# frame. The budget is summed from school_sum rather than merged_data, because
# the left merge repeats each school's budget on every matching student row.
Total_Schools = merged_data["School ID"].nunique()
Total_Students = merged_data["Student ID"].count()
Total_Budget = school_sum["budget"].sum()
Average_Math_Score = round(merged_data["math_score"].mean(), 3)
Average_Reading_Score = round(merged_data["reading_score"].mean(), 3)

# A score of 70 or more counts as passing. Comparing a single column yields one
# boolean per student, so .sum() is a scalar count of passers. The previous
# version called .count() on the filtered DataFrame, which returns one count per
# column; that turned the percentages into Series and the summary into a
# multi-row table that then had to be sliced back to its first row.
Perc_Passing_Math = round((merged_data["math_score"] >= 70).sum() / Total_Students * 100, 3)
Perc_Passing_Reading = round((merged_data["reading_score"] >= 70).sum() / Total_Students * 100, 3)

# NOTE: this is the mean of the two passing percentages, not the share of
# students who passed both subjects.
overall_passing_rate = round((Perc_Passing_Math + Perc_Passing_Reading) / 2, 3)

# Build the one-row summary table. Each scalar is wrapped in a list so pandas
# creates exactly one row.
district_Summary = pd.DataFrame(
    {
        "Total Schools": [Total_Schools],
        "Total Students": [Total_Students],
        "Total Budget": [Total_Budget],
        "Average Math Score": [Average_Math_Score],
        "Average Reading Score": [Average_Reading_Score],
        "% Passing Math": [Perc_Passing_Math],
        "% Passing Reading": [Perc_Passing_Reading],
        "Overall Passing Rate": [overall_passing_rate],
    }
)

# Display formatting: thousands separators, plus a dollar sign on the budget.
# Both columns hold strings from here on.
district_Summary['Total Budget'] = district_Summary['Total Budget'].map("${:,}".format)
district_Summary['Total Students'] = district_Summary['Total Students'].map("{:,}".format)
district_Summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
0,15,39170,"$24,649,428",78.985,81.878,74.981,85.805,80.393


In [4]:
# Per-school summary: every Series below is indexed by school_name, so the
# DataFrame constructor lines them up into one row per school.
school_type = school_sum.set_index(['school_name'])["type"]
merged_data_grp_school = merged_data.groupby(['school_name'])
Total_Students2 = merged_data_grp_school["Student ID"].count()

# The merge copied each school's budget onto every one of its student rows, so
# the per-school mean of "budget" recovers the school budget. This is the same
# value the earlier sum() / count() produced, stated directly.
Total_Budget2 = merged_data_grp_school["budget"].mean()
Per_Strudent_Budget2 = Total_Budget2 / Total_Students2
Average_Math_Score2 = round(merged_data_grp_school["math_score"].mean(), 3)
Average_Reading_Score2 = round(merged_data_grp_school["reading_score"].mean(), 3)

# Count the students scoring 70 or more in each school. A school with no passing
# student is absent from the filtered groupby; without the reindex it would
# become NaN (instead of 0%) once divided by Total_Students2, because the
# division aligns on the school index.
math_pass_counts = (
    merged_data[merged_data["math_score"] >= 70]
    .groupby(['school_name'])["math_score"]
    .count()
    .reindex(Total_Students2.index, fill_value=0)
)
reading_pass_counts = (
    merged_data[merged_data["reading_score"] >= 70]
    .groupby(['school_name'])["reading_score"]
    .count()
    .reindex(Total_Students2.index, fill_value=0)
)
merger_math_70_gry_by_schl_avg_math = round(math_pass_counts / Total_Students2 * 100, 3)
merger_read_70_gry_by_schl_avg_read = round(reading_pass_counts / Total_Students2 * 100, 3)

# Mean of the two passing percentages (not the share of students passing both).
overall_passing_rate2 = round(
    (merger_math_70_gry_by_schl_avg_math + merger_read_70_gry_by_schl_avg_read) / 2, 3
)

# Create a data frame with the above information
district_Summary2 = pd.DataFrame(
    {
        "School Type": school_type,
        "Total Students": Total_Students2,
        "Total School Budget": Total_Budget2,
        "Per Student Budget": Per_Strudent_Budget2,
        "Average Math Score": Average_Math_Score2,
        "Average Reading Score": Average_Reading_Score2,
        "% Passing Math": merger_math_70_gry_by_schl_avg_math,
        "% Passing Reading": merger_read_70_gry_by_schl_avg_read,
        "Overall Passing Rate": overall_passing_rate2,
    }
)

# Format the two budget columns as dollar strings (they are no longer numeric
# afterwards), then rank the schools from highest to lowest overall passing
# rate and show the top five.
district_Summary2['Total School Budget'] = district_Summary2['Total School Budget'].map("${:,}".format)
district_Summary2['Per Student Budget'] = district_Summary2['Per Student Budget'].map("${:,}".format)
district_Summary2 = district_Summary2.sort_values(by='Overall Passing Rate', ascending=False)
district_Summary2.head()


Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Cabrera High School,Charter,1858,"$1,081,356.0",$582.0,83.062,83.976,94.133,97.04,95.586
Thomas High School,Charter,1635,"$1,043,130.0",$638.0,83.418,83.849,93.272,97.309,95.291
Pena High School,Charter,962,"$585,858.0",$609.0,83.84,84.045,94.595,95.946,95.27
Griffin High School,Charter,1468,"$917,500.0",$625.0,83.351,83.817,93.392,97.139,95.266
Wilson High School,Charter,2283,"$1,319,574.0",$578.0,83.274,83.989,93.868,96.54,95.204


In [5]:
# Re-rank the per-school summary so the lowest overall passing rates come
# first, then show the five bottom-ranked schools.
district_Summary2 = district_Summary2.sort_values('Overall Passing Rate', ascending=True)
district_Summary2.head(5)

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Rodriguez High School,District,3999,"$2,547,363.0",$637.0,76.843,80.745,66.367,80.22,73.294
Figueroa High School,District,2949,"$1,884,411.0",$639.0,76.712,81.158,65.988,80.739,73.364
Huang High School,District,2917,"$1,910,635.0",$655.0,76.629,81.183,65.684,81.316,73.5
Johnson High School,District,4761,"$3,094,650.0",$650.0,77.072,80.966,66.058,81.222,73.64
Ford High School,District,2739,"$1,763,916.0",$644.0,77.103,80.746,68.31,79.299,73.804


In [5]:
# Average math and reading score per school for each grade. The same
# filter -> group -> mean -> round sequence was previously copy-pasted four
# times; the two helpers below state it once.
def _rows_for_grade(frame, grade):
    """Return the rows of `frame` whose "grade" column equals `grade`."""
    return frame[frame["grade"] == grade]


def _school_mean(grouped, score_column):
    """Return the per-group mean of `score_column`, rounded to 3 decimals."""
    return round(grouped[score_column].mean(), 3)


grade9 = _rows_for_grade(merged_data, '9th')
grade9_grp = grade9.groupby('school_name')
Average_Math_Score3 = _school_mean(grade9_grp, "math_score")
Average_Read_Score3 = _school_mean(grade9_grp, "reading_score")

grade10 = _rows_for_grade(merged_data, '10th')
grade10_grp = grade10.groupby('school_name')
Average_Math_Score4 = _school_mean(grade10_grp, "math_score")
Average_Read_Score4 = _school_mean(grade10_grp, "reading_score")

grade11 = _rows_for_grade(merged_data, '11th')
grade11_grp = grade11.groupby('school_name')
Average_Math_Score5 = _school_mean(grade11_grp, "math_score")
Average_Read_Score5 = _school_mean(grade11_grp, "reading_score")

grade12 = _rows_for_grade(merged_data, '12th')
grade12_grp = grade12.groupby('school_name')
Average_Math_Score6 = _school_mean(grade12_grp, "math_score")
Average_Read_Score6 = _school_mean(grade12_grp, "reading_score")

# Math scores by grade: one row per school, one column per grade. The reading
# averages computed above are kept for the reading-by-grade table.
summary3 = pd.DataFrame(
    {
        "9th Grade": Average_Math_Score3,
        "10th Grade": Average_Math_Score4,
        "11th Grade": Average_Math_Score5,
        "12th Grade": Average_Math_Score6,
    }
)
summary3.index.name = 'School Name'
summary3

Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.084,76.997,77.516,76.492
Cabrera High School,83.095,83.155,82.766,83.277
Figueroa High School,76.403,76.54,76.884,77.151
Ford High School,77.361,77.672,76.918,76.18
Griffin High School,82.044,84.229,83.842,83.356
Hernandez High School,77.438,77.337,77.136,77.187
Holden High School,83.787,83.43,85.0,82.855
Huang High School,77.027,75.909,76.447,77.226
Johnson High School,77.188,76.691,77.492,76.863
Pena High School,83.625,83.372,84.328,84.122


In [6]:
# Reading scores by grade: one row per school, one column per grade, built from
# the per-grade reading averages computed alongside the math averages.
summary4 = pd.DataFrame(
    {
        "9th Grade": Average_Read_Score3,
        "10th Grade": Average_Read_Score4,
        "11th Grade": Average_Read_Score5,
        "12th Grade": Average_Read_Score6,
    }
)
# Label the index the same way summary3 (math scores by grade) does, instead of
# leaving the raw "school_name" column name in the rendered table.
summary4.index.name = 'School Name'

summary4

Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303,80.907,80.946,80.912
Cabrera High School,83.676,84.253,83.788,84.288
Figueroa High School,81.199,81.409,80.64,81.385
Ford High School,80.633,81.263,80.404,80.662
Griffin High School,83.369,83.707,84.288,84.014
Hernandez High School,80.867,80.66,81.396,80.857
Holden High School,83.677,83.325,83.816,84.699
Huang High School,81.29,81.512,81.417,80.306
Johnson High School,81.261,80.773,80.616,81.228
Pena High School,83.807,83.612,84.336,84.591


In [8]:
# The per-student budget is already available in district_Summary2, so that
# frame could have been cut and grouped directly. This cell deliberately takes
# a different route to practise joins: it recomputes the per-student budget per
# school and merges it back onto the student-level data.

# Per-student budget for each school
merged_data_grp_school11 = merged_data.groupby(['school_name'])
total_studentsx = merged_data_grp_school11["Student ID"].count()
Total_BudgetX = merged_data_grp_school11["budget"].sum() / merged_data_grp_school11["budget"].count()
Per_Strudent_BudgetX = Total_BudgetX / total_studentsx

# School-level lookup table with columns school_name and Average_Student_Spend.
# The column is named when the frame is built instead of renaming the implicit
# column label 0 afterwards, and reset_index() returns a new frame rather than
# mutating in place.
new_df = pd.DataFrame({"Average_Student_Spend": Per_Strudent_BudgetX}).reset_index()

# Append Average_Student_Spend to every student row. No `suffixes` argument is
# passed: the previous value ("_2") was a plain string rather than a
# (left, right) pair, and the two frames share no column besides the join key.
merged_data_new = pd.merge(merged_data, new_df, on="school_name", how="left")

# Bucket each student row by its school's per-student spend. The bin edges and
# labels are defined once and passed to pd.cut (the literals used to be
# repeated in the call). pd.cut's default intervals are closed on the right,
# so a spend of exactly 585 falls in the "<$585" bucket.
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]
merged_data_new["Spending Ranges (Per Student)"] = pd.cut(
    merged_data_new["Average_Student_Spend"], spending_bins, labels=group_names
)

# Group the student rows by spending range
merged_data_new_grp_range = merged_data_new.groupby("Spending Ranges (Per Student)")

# Averages and passing percentages (score of 70 or more) per spending range
total_studentsy = merged_data_new_grp_range["Student ID"].count()
Average_Math_Score_new1 = round(merged_data_new_grp_range["math_score"].mean(), 3)
Average_Reading_Score_new1 = round(merged_data_new_grp_range["reading_score"].mean(), 3)

math_passers = merged_data_new[merged_data_new["math_score"] >= 70]
reading_passers = merged_data_new[merged_data_new["reading_score"] >= 70]
Passing_Math_new1 = round(
    math_passers.groupby(['Spending Ranges (Per Student)'])["math_score"].count()
    / total_studentsy * 100,
    3,
)
Passing_Read_new1 = round(
    reading_passers.groupby(['Spending Ranges (Per Student)'])["reading_score"].count()
    / total_studentsy * 100,
    3,
)
# Mean of the two passing percentages (not the share of students passing both).
overall_passing_rate_new1 = round((Passing_Math_new1 + Passing_Read_new1) / 2, 3)

# Create a data frame with the above information
district_Summary3 = pd.DataFrame(
    {
        "Average Math Score": Average_Math_Score_new1,
        "Average Reading Score": Average_Reading_Score_new1,
        "% Passing Math": Passing_Math_new1,
        "% Passing Reading": Passing_Read_new1,
        "Overall Passing Rate": overall_passing_rate_new1,
    }
)

district_Summary3


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.363,83.964,93.703,96.687,95.195
$585-615,83.529,83.838,94.124,95.887,95.006
$615-645,78.062,81.434,71.4,83.615,77.508
$645-675,77.049,81.006,66.231,81.109,73.67


In [9]:
# Total students per school. The count is taken from the groupby created on the
# line above; it previously read merged_data_grp_school11, a variable left over
# from the spending cell, which made this cell depend on that one having run.
merged_data_grp_school22 = merged_data.groupby(['school_name'])
total_studentsx2 = merged_data_grp_school22["Student ID"].count()

# School-level lookup table with columns school_name and
# "Total Student Count In School". The column is named on construction and
# reset_index() returns a new frame rather than mutating in place.
new_df2 = pd.DataFrame({"Total Student Count In School": total_studentsx2}).reset_index()

# Append the school's student count to every student row. No `suffixes`
# argument is passed: the previous value ("_2") was a plain string rather than
# a (left, right) pair, and the two frames share no column besides the join key.
merged_data_new2 = pd.merge(merged_data, new_df2, on="school_name", how="left")

# Bucket each student row by the size of its school, using size_bins as the bin
# edges and group_names as the labels. pd.cut's default intervals are closed on
# the right, so a school of exactly 1000 students is labelled "Small (<1000)".
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]
merged_data_new2["School Size"] = pd.cut(
    merged_data_new2["Total Student Count In School"], size_bins, labels=group_names
)

# Group the student rows by school size
merged_data_new_grp_range2 = merged_data_new2.groupby("School Size")

# Averages and passing percentages (score of 70 or more) per school size
total_studentsy2 = merged_data_new_grp_range2["Student ID"].count()
Average_Math_Score_new2 = round(merged_data_new_grp_range2["math_score"].mean(), 3)
Average_Reading_Score_new2 = round(merged_data_new_grp_range2["reading_score"].mean(), 3)

math_passers2 = merged_data_new2[merged_data_new2["math_score"] >= 70]
reading_passers2 = merged_data_new2[merged_data_new2["reading_score"] >= 70]
Passing_Math_new2 = round(
    math_passers2.groupby(["School Size"])["math_score"].count()
    / total_studentsy2 * 100,
    3,
)
Passing_Read_new2 = round(
    reading_passers2.groupby(['School Size'])["reading_score"].count()
    / total_studentsy2 * 100,
    3,
)
# Mean of the two passing percentages (not the share of students passing both).
overall_passing_rate_new2 = round((Passing_Math_new2 + Passing_Read_new2) / 2, 3)

# Create a data frame with the above information
district_Summary4 = pd.DataFrame(
    {
        "Average Math Score": Average_Math_Score_new2,
        "Average Reading Score": Average_Reading_Score_new2,
        "% Passing Math": Passing_Math_new2,
        "% Passing Reading": Passing_Read_new2,
        "Overall Passing Rate": overall_passing_rate_new2,
    }
)

district_Summary4

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.829,83.974,93.952,96.04,94.996
Medium (1000-2000),83.373,83.868,93.617,96.773,95.195
Large (2000-5000),77.478,81.199,68.652,82.125,75.388


In [10]:
# Scores by school type: every Series below is indexed by the "type" column, so
# the resulting table has one row per school type.
merged_data_grp_school2 = merged_data.groupby(['type'])
Total_Students7 = merged_data_grp_school2["Student ID"].count()
Average_Math_Score7 = round(merged_data_grp_school2["math_score"].mean(), 3)
Average_Reading_Score7 = round(merged_data_grp_school2["reading_score"].mean(), 3)

# Count the students scoring 70 or more in each school type. A type with no
# passing student is absent from the filtered groupby; without the reindex it
# would become NaN (instead of 0%) once divided by Total_Students7, because the
# division aligns on the type index.
math_pass_by_type = (
    merged_data[merged_data["math_score"] >= 70]
    .groupby(['type'])["math_score"]
    .count()
    .reindex(Total_Students7.index, fill_value=0)
)
reading_pass_by_type = (
    merged_data[merged_data["reading_score"] >= 70]
    .groupby(['type'])["reading_score"]
    .count()
    .reindex(Total_Students7.index, fill_value=0)
)
merger_math_70_gry_by_schl_avg_math_2 = round(math_pass_by_type / Total_Students7 * 100, 3)
merger_read_70_gry_by_schl_avg_read_2 = round(reading_pass_by_type / Total_Students7 * 100, 3)

# Mean of the two passing percentages (not the share of students passing both).
overall_passing_rate3 = round(
    (merger_math_70_gry_by_schl_avg_math_2 + merger_read_70_gry_by_schl_avg_read_2) / 2, 3
)

# Create a data frame with the above information
district_Summary7 = pd.DataFrame(
    {
        "Average Math Score": Average_Math_Score7,
        "Average Reading Score": Average_Reading_Score7,
        "% Passing Math": merger_math_70_gry_by_schl_avg_math_2,
        "% Passing Reading": merger_read_70_gry_by_schl_avg_read_2,
        "Overall Passing Rate": overall_passing_rate3,
    }
)
district_Summary7.index.name = 'School Type'

district_Summary7

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.406,83.903,93.702,96.646,95.174
District,76.987,80.962,66.518,80.905,73.712
