In [2]:
# Dependencies and Setup
import pandas as pd

# Input CSV locations, relative to the notebook's working directory (change as needed)
school_data_to_load = "schools_complete.csv"
student_data_to_load = "students_complete.csv"

# Read the school-level and student-level files into DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Attach each student's school attributes (type, size, budget, ...).
# A left join keeps every student row; the join key only needs to be named once.
school_data_complete = pd.merge(student_data, school_data, how="left", on="school_name")

In [3]:
# District-wide headline numbers, all derived from the merged student/school frame.

# Number of distinct schools
schools = school_data_complete["school_name"].nunique()

# Number of students - each row is a unique student (rows with a missing name are not counted)
students = school_data_complete["student_name"].count()

# Total district budget.
# The budget value is repeated on every student row, so keep one row per school and
# sum those. (Summing the distinct budget *values* would silently merge two schools
# that happen to share the same budget, and relies on the column naming of
# value_counts().reset_index(), which changed in pandas 2.0.)
budget_total = (
    school_data_complete
    .drop_duplicates(subset="school_name")["budget"]
    .sum()
)

# Average math and reading scores across all students
avg_math = round(school_data_complete["math_score"].mean(), 5)
avg_read = round(school_data_complete["reading_score"].mean(), 5)

# % passing math: students scoring above 69, as a percentage of all students
math_passing = len(school_data_complete[school_data_complete["math_score"] > 69])
percent_pass_math = round(math_passing / students * 100, 5)

# % passing reading: same threshold
read_passing = len(school_data_complete[school_data_complete["reading_score"] > 69])
percent_pass_read = round(read_passing / students * 100, 5)

# Overall passing rate is defined here as the mean of the two subject pass percentages
# (not the share of students who passed both subjects)
overall_pass = round((percent_pass_read + percent_pass_math) / 2, 5)


In [7]:
# District summary: a single-row table of the headline metrics computed above
district_values = {
    "Total schools": schools,
    "Total students": students,
    "Total budget": budget_total,
    "Average Math Score": avg_math,
    "Average Reading Score": avg_read,
    "% Passed Math": percent_pass_math,
    "% Passed Reading": percent_pass_read,
    "Overall Passing Rate": overall_pass,
}
Summary_District = pd.DataFrame(district_values, index=[0])

# Render the large counts as strings: thousands separators, plus a $ prefix for the budget
for column, template in (("Total budget", "${:,}"), ("Total students", "{:,}")):
    Summary_District[column] = Summary_District[column].map(template.format)
Summary_District

Unnamed: 0,Total schools,Total students,Total budget,Average Math Score,Average Reading Score,% Passed Math,% Passed Reading,Overall Passing Rate
0,15,39170,"$24,649,428",78.98537,81.87784,74.98085,85.80546,80.39316


In [8]:
#begin school summary
# Seed the per-school summary with name, type, size and budget.
# Take an explicit copy: columns are added to this frame in later cells, and writing
# into a plain column slice of school_data triggers pandas' SettingWithCopyWarning.
school_summary_df = school_data[["school_name", "type", "size", "budget"]].copy()
school_summary_df

Unnamed: 0,school_name,type,size,budget
0,Huang High School,District,2917,1910635
1,Figueroa High School,District,2949,1884411
2,Shelton High School,Charter,1761,1056600
3,Hernandez High School,District,4635,3022020
4,Griffin High School,Charter,1468,917500
5,Wilson High School,Charter,2283,1319574
6,Cabrera High School,Charter,1858,1081356
7,Bailey High School,District,4976,3124928
8,Holden High School,Charter,427,248087
9,Pena High School,Charter,962,585858


In [9]:
# Per-student budget = school budget divided by school size
school_summary_df["Per Student Budget"] = school_summary_df["budget"].div(school_summary_df["size"])
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget
0,Huang High School,District,2917,1910635,655.0
1,Figueroa High School,District,2949,1884411,639.0
2,Shelton High School,Charter,1761,1056600,600.0
3,Hernandez High School,District,4635,3022020,652.0
4,Griffin High School,Charter,1468,917500,625.0
5,Wilson High School,Charter,2283,1319574,578.0
6,Cabrera High School,Charter,1858,1081356,582.0
7,Bailey High School,District,4976,3124928,628.0
8,Holden High School,Charter,427,248087,581.0
9,Pena High School,Charter,962,585858,609.0


In [10]:
# Average math score per school.
# Student-level slice (school name, school size, math score); kept under its own name
# because the math pass-rate cell reuses it.
s_math_df = school_data_complete[["school_name", "size", "math_score"]]

# Mean math score per school, labelled for the summary table
s_math = (
    s_math_df.groupby("school_name")["math_score"]
    .mean()
    .rename("Avg Math Score")
    .reset_index()
)

# Append the new column to the school summary
school_summary_df = school_summary_df.merge(s_math, on="school_name")
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget,Avg Math Score
0,Huang High School,District,2917,1910635,655.0,76.629414
1,Figueroa High School,District,2949,1884411,639.0,76.711767
2,Shelton High School,Charter,1761,1056600,600.0,83.359455
3,Hernandez High School,District,4635,3022020,652.0,77.289752
4,Griffin High School,Charter,1468,917500,625.0,83.351499
5,Wilson High School,Charter,2283,1319574,578.0,83.274201
6,Cabrera High School,Charter,1858,1081356,582.0,83.061895
7,Bailey High School,District,4976,3124928,628.0,77.048432
8,Holden High School,Charter,427,248087,581.0,83.803279
9,Pena High School,Charter,962,585858,609.0,83.839917


In [11]:
# Average reading score per school.
# Student-level slice (school name, school size, reading score); kept under its own name
# because the reading pass-rate cell reuses it.
s_read_df = school_data_complete[["school_name", "size", "reading_score"]]

# Mean reading score per school, labelled for the summary table
s_read = (
    s_read_df.groupby("school_name")["reading_score"]
    .mean()
    .rename("Avg Reading Score")
    .reset_index()
)

# Append the new column to the school summary
school_summary_df = school_summary_df.merge(s_read, on="school_name")
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score
0,Huang High School,District,2917,1910635,655.0,76.629414,81.182722
1,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802
2,Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724
3,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757
5,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488
6,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578
7,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963
8,Holden High School,Charter,427,248087,581.0,83.803279,83.814988
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699


In [12]:
# Count of students passing math (score above 69) for each school.
# Summing the boolean "passed" flag per school keeps schools with zero passing
# students (count 0). Filtering to passing rows first and inner-merging the counts
# back would drop such a school from every later summary table.
math_pass_counts = (
    s_math_df["math_score"].gt(69)
    .groupby(s_math_df["school_name"])
    .sum()
    .rename("passing_rate")  # holds a COUNT here; converted to a percentage in the next cell
    .reset_index()
)

# Put the count next to the school size, then keep one row per school.
# The surviving math_score value is just the first student's score and is not used again.
# .copy() so that the next cell can assign into this frame without a SettingWithCopyWarning.
s_math_df = (
    s_math_df
    .merge(math_pass_counts, on="school_name")
    .drop_duplicates(subset="school_name", keep="first")
    .copy()
)
s_math_df

Unnamed: 0,school_name,size,math_score,passing_rate
0,Huang High School,2917,79,1916
2917,Figueroa High School,2949,87,1946
5866,Shelton High School,1761,91,1653
7627,Hernandez High School,4635,88,3094
12262,Griffin High School,1468,68,1371
13730,Wilson High School,2283,71,2143
16013,Cabrera High School,1858,94,1749
17871,Bailey High School,4976,59,3318
22847,Holden High School,427,92,395
23274,Pena High School,962,75,910


In [13]:
# Turn the passing count into a percentage of school size, then keep only the merge key
# (school_name) and the percentage for the school summary table
s_math_df = s_math_df.assign(
    passing_rate=s_math_df["passing_rate"] / s_math_df["size"] * 100
)[["school_name", "passing_rate"]]
s_math_df

Unnamed: 0,school_name,passing_rate
0,Huang High School,65.683922
2917,Figueroa High School,65.988471
5866,Shelton High School,93.867121
7627,Hernandez High School,66.752967
12262,Griffin High School,93.392371
13730,Wilson High School,93.867718
16013,Cabrera High School,94.133477
17871,Bailey High School,66.680064
22847,Holden High School,92.505855
23274,Pena High School,94.594595


In [14]:
# Add the math pass percentage to the school summary under its display header
school_summary_df = (
    school_summary_df
    .merge(s_math_df, on="school_name")
    .rename(columns={"passing_rate": "% Passing Math"})
)


In [15]:
# Count of students passing reading (score above 69) for each school.
# Summing the boolean "passed" flag per school keeps schools with zero passing
# students (count 0). Filtering to passing rows first and inner-merging the counts
# back would drop such a school from every later summary table.
read_pass_counts = (
    s_read_df["reading_score"].gt(69)
    .groupby(s_read_df["school_name"])
    .sum()
    .rename("passing_rate")  # holds a COUNT here; converted to a percentage in the next cell
    .reset_index()
)

# Put the count next to the school size, then keep one row per school.
# The surviving reading_score value is just the first student's score and is not used again.
# .copy() so that the next cell can assign into this frame without a SettingWithCopyWarning.
s_read_df = (
    s_read_df
    .merge(read_pass_counts, on="school_name")
    .drop_duplicates(subset="school_name", keep="first")
    .copy()
)
s_read_df

Unnamed: 0,school_name,size,reading_score,passing_rate
0,Huang High School,2917,66,2372
2917,Figueroa High School,2949,85,2381
5866,Shelton High School,1761,70,1688
7627,Hernandez High School,4635,70,3748
12262,Griffin High School,1468,79,1426
13730,Wilson High School,2283,84,2204
16013,Cabrera High School,1858,94,1803
17871,Bailey High School,4976,75,4077
22847,Holden High School,427,86,411
23274,Pena High School,962,91,923


In [16]:
# Turn the passing count into a percentage of school size, then keep only the merge key
# (school_name) and the percentage for the school summary table
s_read_df = s_read_df.assign(
    passing_rate=s_read_df["passing_rate"] / s_read_df["size"] * 100
)[["school_name", "passing_rate"]]
s_read_df

Unnamed: 0,school_name,passing_rate
0,Huang High School,81.316421
2917,Figueroa High School,80.739234
5866,Shelton High School,95.854628
7627,Hernandez High School,80.862999
12262,Griffin High School,97.138965
13730,Wilson High School,96.539641
16013,Cabrera High School,97.039828
17871,Bailey High School,81.93328
22847,Holden High School,96.252927
23274,Pena High School,95.945946


In [17]:
# Add the reading pass percentage to the school summary under its display header
school_summary_df = (
    school_summary_df
    .merge(s_read_df, on="school_name")
    .rename(columns={"passing_rate": "% Passing Reading"})
)
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading
0,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421
1,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234
2,Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628
3,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965
5,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641
6,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828
7,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328
8,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946


In [18]:
# Overall passing rate per school = mean of the math and reading pass percentages
school_summary_df["% Overall Passing Rate"] = (
    school_summary_df["% Passing Math"] + school_summary_df["% Passing Reading"]
) / 2
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
1,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852
2,Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628,94.860875
3,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,73.807983
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
5,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,95.203679
6,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652
7,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,74.306672
8,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,94.379391
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027


In [24]:
# Highest performing schools: rank by overall passing rate (best first), index by school name
top_schools = (
    school_summary_df
    .sort_values(by="% Overall Passing Rate", ascending=False)
    .set_index("school_name")
)
top_schools.head(5)

Unnamed: 0_level_0,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652
Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,93.272171,97.308869,95.29052
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,95.203679


In [27]:
# Lowest performing schools: re-sort by overall passing rate in ascending order
# (worst first) and take the first five rows
bottom_schools = (
    school_summary_df
    .sort_values(by="% Overall Passing Rate", ascending=True)
    .set_index("school_name")
)
bottom_schools.head(5)

Unnamed: 0_level_0,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363,637.0,76.842711,80.744686,66.366592,80.220055,73.293323
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,73.639992
Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308


In [33]:
#table of average math score per grade for each school
# Student-level frame limited to the columns needed: school name, grade and math score.
# The 10th/11th/12th grade cells below filter this same frame.
grade_and_math = school_data_complete[["school_name", "grade", "math_score"]]

# 9th grade only: mean math score per school, with the value column titled "9th"
ninth = grade_and_math[grade_and_math["grade"] == "9th"]
ninth_math_mean = (
    ninth.groupby("school_name")["math_score"]
    .mean()
    .rename("9th")
    .reset_index()
)
ninth_math_mean

Unnamed: 0,school_name,9th
0,Bailey High School,77.083676
1,Cabrera High School,83.094697
2,Figueroa High School,76.403037
3,Ford High School,77.361345
4,Griffin High School,82.04401
5,Hernandez High School,77.438495
6,Holden High School,83.787402
7,Huang High School,77.027251
8,Johnson High School,77.187857
9,Pena High School,83.625455


In [34]:
# 10th grade only: mean math score per school, with the value column titled "10th"
tenth = grade_and_math[grade_and_math["grade"].eq("10th")]
tenth_math_mean = (
    tenth.groupby("school_name")["math_score"]
    .mean()
    .rename("10th")
    .reset_index()
)
tenth_math_mean

Unnamed: 0,school_name,10th
0,Bailey High School,76.996772
1,Cabrera High School,83.154506
2,Figueroa High School,76.539974
3,Ford High School,77.672316
4,Griffin High School,84.229064
5,Hernandez High School,77.337408
6,Holden High School,83.429825
7,Huang High School,75.908735
8,Johnson High School,76.691117
9,Pena High School,83.372


In [35]:
# 11th grade only: mean math score per school, with the value column titled "11th"
eleventh = grade_and_math[grade_and_math["grade"].eq("11th")]
eleventh_math_mean = (
    eleventh.groupby("school_name")["math_score"]
    .mean()
    .rename("11th")
    .reset_index()
)
eleventh_math_mean

Unnamed: 0,school_name,11th
0,Bailey High School,77.515588
1,Cabrera High School,82.76556
2,Figueroa High School,76.884344
3,Ford High School,76.918058
4,Griffin High School,83.842105
5,Hernandez High School,77.136029
6,Holden High School,85.0
7,Huang High School,76.446602
8,Johnson High School,77.491653
9,Pena High School,84.328125


In [38]:
# 12th grade only: mean math score per school, with the value column titled "12th"
twelfth = grade_and_math[grade_and_math["grade"].eq("12th")]
twelfth_math_mean = (
    twelfth.groupby("school_name")["math_score"]
    .mean()
    .rename("12th")
    .reset_index()
)
twelfth_math_mean

Unnamed: 0,school_name,12th
0,Bailey High School,76.492218
1,Cabrera High School,83.277487
2,Figueroa High School,77.151369
3,Ford High School,76.179963
4,Griffin High School,83.356164
5,Hernandez High School,77.186567
6,Holden High School,82.855422
7,Huang High School,77.225641
8,Johnson High School,76.863248
9,Pena High School,84.121547


In [45]:
# Combine the four per-grade math tables into one table, one column per grade,
# indexed by school name.
# NOTE: this rebinds grade_and_math from the student-level frame to the summary table,
# so the per-grade cells above cannot be re-run after this cell without re-running
# the cell that builds the student-level frame first.
grade_and_math = (
    ninth_math_mean
    .merge(tenth_math_mean, on="school_name")
    .merge(eleventh_math_mean, on="school_name")
    .merge(twelfth_math_mean, on="school_name")
    .set_index("school_name")
)
grade_and_math

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


In [46]:
#table of average reading score per grade for each school
# Student-level frame limited to the columns needed: school name, grade and reading score.
# The 10th/11th/12th grade cells below filter this same frame.
grade_and_read = school_data_complete[["school_name", "grade", "reading_score"]]

# 9th grade only: mean reading score per school, with the value column titled "9th"
ninth = grade_and_read[grade_and_read["grade"].eq("9th")]
ninth_read_mean = (
    ninth.groupby("school_name")["reading_score"]
    .mean()
    .rename("9th")
    .reset_index()
)
ninth_read_mean

Unnamed: 0,school_name,9th
0,Bailey High School,81.303155
1,Cabrera High School,83.676136
2,Figueroa High School,81.198598
3,Ford High School,80.632653
4,Griffin High School,83.369193
5,Hernandez High School,80.86686
6,Holden High School,83.677165
7,Huang High School,81.290284
8,Johnson High School,81.260714
9,Pena High School,83.807273


In [47]:
# 10th grade only: mean reading score per school, with the value column titled "10th"
tenth = grade_and_read[grade_and_read["grade"].eq("10th")]
tenth_read_mean = (
    tenth.groupby("school_name")["reading_score"]
    .mean()
    .rename("10th")
    .reset_index()
)
tenth_read_mean

Unnamed: 0,school_name,10th
0,Bailey High School,80.907183
1,Cabrera High School,84.253219
2,Figueroa High School,81.408912
3,Ford High School,81.262712
4,Griffin High School,83.706897
5,Hernandez High School,80.660147
6,Holden High School,83.324561
7,Huang High School,81.512386
8,Johnson High School,80.773431
9,Pena High School,83.612


In [48]:
# 11th grade only: mean reading score per school, with the value column titled "11th"
eleventh = grade_and_read[grade_and_read["grade"].eq("11th")]
eleventh_read_mean = (
    eleventh.groupby("school_name")["reading_score"]
    .mean()
    .rename("11th")
    .reset_index()
)
eleventh_read_mean

Unnamed: 0,school_name,11th
0,Bailey High School,80.945643
1,Cabrera High School,83.788382
2,Figueroa High School,80.640339
3,Ford High School,80.403642
4,Griffin High School,84.288089
5,Hernandez High School,81.39614
6,Holden High School,83.815534
7,Huang High School,81.417476
8,Johnson High School,80.616027
9,Pena High School,84.335938


In [50]:
# 12th grade only: mean reading score per school, with the value column titled "12th"
twelfth = grade_and_read[grade_and_read["grade"].eq("12th")]
twelfth_read_mean = (
    twelfth.groupby("school_name")["reading_score"]
    .mean()
    .rename("12th")
    .reset_index()
)
twelfth_read_mean

Unnamed: 0,school_name,12th
0,Bailey High School,80.912451
1,Cabrera High School,84.287958
2,Figueroa High School,81.384863
3,Ford High School,80.662338
4,Griffin High School,84.013699
5,Hernandez High School,80.857143
6,Holden High School,84.698795
7,Huang High School,80.305983
8,Johnson High School,81.227564
9,Pena High School,84.59116


In [51]:
# Combine the four per-grade reading tables into one table, one column per grade,
# indexed by school name.
# NOTE: this rebinds grade_and_read from the student-level frame to the summary table,
# so the per-grade cells above cannot be re-run after this cell without re-running
# the cell that builds the student-level frame first.
grade_and_read = (
    ninth_read_mean
    .merge(tenth_read_mean, on="school_name")
    .merge(eleventh_read_mean, on="school_name")
    .merge(twelfth_read_mean, on="school_name")
    .set_index("school_name")
)
grade_and_read

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


In [53]:
# Scores by school spending (per student).
# Bucket each school by its per-student budget. pd.cut uses right-closed intervals by
# default, so e.g. a budget of exactly 585 falls in the first bucket, and a value
# above 675 gets no bucket (NaN).
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]
school_summary_df["spending"] = pd.cut(
    x=school_summary_df["Per Student Budget"],
    bins=spending_bins,
    labels=group_names,
)
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate,spending
0,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171,$645-675
1,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852,$615-645
2,Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628,94.860875,$585-615
3,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,73.807983,$645-675
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668,$615-645
5,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,95.203679,<$585
6,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652,<$585
7,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,74.306672,$615-645
8,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,94.379391,<$585
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027,$585-615


In [66]:
# Average of each per-school metric within every spending bucket.
# One grouped aggregation over all five metric columns replaces five separate
# single-column groupbys stitched back together with four merges.

# summary-table column -> display header used in the report
spending_metric_headers = {
    "Avg Math Score": "Average Math Score",
    "Avg Reading Score": "Average Reading Score",
    "% Passing Math": "% Passing Math",
    "% Passing Reading": "% Passing Reading",
    "% Overall Passing Rate": "% Overall Passing Rate",
}

spending_summary = (
    school_summary_df
    # "spending" is categorical; observed=False keeps every bucket as a row (NaN if no
    # school falls in it) regardless of the installed pandas version's default
    .groupby("spending", observed=False)[list(spending_metric_headers)]
    .mean()
    .rename(columns=spending_metric_headers)
    .rename_axis("Spending Ranges(Per Student)")
)
spending_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Spending Ranges(Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.455399,83.933814,93.460096,96.610877,95.035486
$585-615,83.599686,83.885211,94.230858,95.900287,95.065572
$615-645,79.079225,81.891436,75.668212,86.106569,80.887391
$645-675,76.99721,81.027843,66.164813,81.133951,73.649382


In [67]:
# Scores by school size (grouped).
# Bucket each school by enrolment. pd.cut uses right-closed intervals by default, so a
# school of exactly 1000 students is "Small", and a size above 5000 gets no bucket (NaN).
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]
school_summary_df["School_Size"] = pd.cut(
    x=school_summary_df["size"],
    bins=size_bins,
    labels=group_names,
)
school_summary_df

Unnamed: 0,school_name,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate,spending,School_Size
0,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171,$645-675,Large (2000-5000)
1,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852,$615-645,Large (2000-5000)
2,Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,93.867121,95.854628,94.860875,$585-615,Medium (1000-2000)
3,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,73.807983,$645-675,Large (2000-5000)
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668,$615-645,Medium (1000-2000)
5,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,95.203679,<$585,Large (2000-5000)
6,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652,<$585,Medium (1000-2000)
7,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,74.306672,$615-645,Large (2000-5000)
8,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,94.379391,<$585,Small (<1000)
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027,$585-615,Small (<1000)


In [68]:
#create df for us to use in calculations with just columns we are interested in.
reduced_ss = school_summary_df[["School_Size", "Avg Math Score", "Avg Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]

#rename columns so we have handles to use that work with syntax
reduced_ss = reduced_ss.rename(columns={"Avg Math Score": "avg_math_score", "Avg Reading Score": "avg_read_score",
                                       "% Passing Math": "pass_math", "% Passing Reading": "pass_read",
                                        "% Overall Passing Rate": "overall_pass"})

#create dfs for each column we are interested in means by spending
math_size = reduced_ss.groupby("School_Size").avg_math_score.mean().reset_index()
read_size = reduced_ss.groupby("School_Size").avg_read_score.mean().reset_index()
pass_math_size = reduced_ss.groupby("School_Size").pass_math.mean().reset_index()
pass_read_size = reduced_ss.groupby("School_Size").pass_read.mean().reset_index()
overall_pass_size = reduced_ss.groupby("School_Size").overall_pass.mean().reset_index()

#merge into single df
size_summary = pd.merge(math_size, read_size, on="School_Size")
size_summary = pd.merge(size_summary, pass_math_size, on="School_Size")
size_summary = pd.merge(size_summary, pass_read_size, on="School_Size")
size_summary = pd.merge(size_summary, overall_pass_size, on="School_Size")
size_summary

#rename columns
size_summary = size_summary.rename(columns={"School_Size": "School Size",
                                                   "avg_math_score": "Average Math Score",
                                                   "avg_read_score": "Average Reading Score",
                                                   "pass_math": "% Passing Math",
                                                   "pass_read": "% Passing Reading",
                                                   "overall_pass": "% Overall Passing Rate"})
size_summary = size_summary.set_index("School Size")
size_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,93.550225,96.099437,94.824831
Medium (1000-2000),83.374684,83.864438,93.599695,96.79068,95.195187
Large (2000-5000),77.746417,81.344493,69.963361,82.766634,76.364998


In [69]:
#create df for us to use in calculations with just columns we are interested in.
reduced_ss = school_summary_df[["type", "Avg Math Score", "Avg Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]

#rename columns so we have handles to use that work with syntax
reduced_ss = reduced_ss.rename(columns={"Avg Math Score": "avg_math_score", "Avg Reading Score": "avg_read_score",
                                       "% Passing Math": "pass_math", "% Passing Reading": "pass_read",
                                        "% Overall Passing Rate": "overall_pass"})

#create dfs for each column we are interested in means by spending
math_type = reduced_ss.groupby("type").avg_math_score.mean().reset_index()
read_type = reduced_ss.groupby("type").avg_read_score.mean().reset_index()
pass_math_type = reduced_ss.groupby("type").pass_math.mean().reset_index()
pass_read_type = reduced_ss.groupby("type").pass_read.mean().reset_index()
overall_pass_type = reduced_ss.groupby("type").overall_pass.mean().reset_index()

#merge into single df
type_summary = pd.merge(math_type, read_type, on="type")
type_summary = pd.merge(type_summary, pass_math_type, on="type")
type_summary = pd.merge(type_summary, pass_read_type, on="type")
type_summary = pd.merge(type_summary, overall_pass_type, on="type")
type_summary

#rename columns
type_summary = type_summary.rename(columns={"type": "School Type",
                                                   "avg_math_score": "Average Math Score",
                                                   "avg_read_score": "Average Reading Score",
                                                   "pass_math": "% Passing Math",
                                                   "pass_read": "% Passing Reading",
                                                   "overall_pass": "% Overall Passing Rate"})
type_summary = type_summary.set_index("School Type")
type_summary

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,96.586489,95.10366
District,76.956733,80.966636,66.548453,80.799062,73.673757
