## Get packages and data

In [1]:
import re

import pandas as pd

In [2]:
# Locations of the two input CSV files, relative to the notebook's working directory.
resources_dir = "Resources"
school_data_to_load = f"{resources_dir}/schools_complete.csv"
student_data_to_load = f"{resources_dir}/students_complete.csv"

In [3]:
# Read both CSV inputs into pandas DataFrames (schools first, then students).
school_data_df, student_data_df = (
    pd.read_csv(csv_path) for csv_path in (school_data_to_load, student_data_to_load)
)

## Clean up stray name prefixes and suffixes

In [4]:
# Titles and credentials to strip from student names.
# The embedded spaces matter: each prefix carries a trailing space, each suffix a leading one.
prefixes_suffixes = [
    "Dr. ", "Mr. ", "Ms. ", "Mrs. ", "Miss ",  # prefixes
    " MD", " DDS", " DVM", " PhD",             # suffixes
]

In [5]:
# Strip honorifics and credentials from student names in one anchored regex pass.
# Prefixes (entries ending in a space) are removed only at the START of a name and
# suffixes (entries starting with a space) only at the END, so a fragment such as " MD"
# that happens to occur in the middle of a name is left untouched.
# re.escape keeps the "." in "Dr. " literal, and regex=True is passed explicitly so
# pandas does not warn about the changing default of the `regex` argument.
title_prefixes = [fix for fix in prefixes_suffixes if fix.endswith(" ")]
title_suffixes = [fix for fix in prefixes_suffixes if fix.startswith(" ")]
prefix_alternatives = "|".join(map(re.escape, title_prefixes))
suffix_alternatives = "|".join(map(re.escape, title_suffixes))
student_data_df["student_name"] = student_data_df["student_name"].str.replace(
    f"^(?:{prefix_alternatives})|(?:{suffix_alternatives})$", "", regex=True
)

## Merge into a master dataset

In [6]:
# Combine student and school records into a single per-student dataset.
# One key name is sufficient (it was previously listed twice), and
# validate="many_to_one" makes the merge fail loudly if a school name is duplicated in
# school_data_df, which would otherwise silently duplicate student rows.
school_data_complete_df = pd.merge(
    student_data_df, school_data_df, on="school_name", validate="many_to_one"
)
# remember that type, size, and budget are school properties - maybe rename?
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [7]:
# Boolean pass/fail flags per student (a score of 70 or more passes). Many later
# computations need them, so they are stored as columns on the main dataframe.
passing_math = school_data_complete_df["math_score"].ge(70)
passing_reading = school_data_complete_df["reading_score"].ge(70)
school_data_complete_df["pass_math"] = passing_math
school_data_complete_df["pass_reading"] = passing_reading
# "pass_both" is True only when the student passed math AND reading
school_data_complete_df["pass_both"] = passing_math & passing_reading
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass_math,pass_reading,pass_both
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True,False,False
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,False,True,False
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False,True,False
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False,False,False
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True,True,True


## Prepare District Summary

In [8]:
# District-wide headline numbers. Row counts are used instead of .count(); the two only
# differ when a column contains NaNs, which (per the author's note) this data does not.
student_count = school_data_complete_df.shape[0]
school_count = school_data_df.shape[0]
total_budget = school_data_df["budget"].sum()
average_math_score = school_data_complete_df["math_score"].mean()
average_reading_score = school_data_complete_df["reading_score"].mean()

district_overview = (
    f"These {school_count} schools have {student_count:,} students.\n"
    f"Their total budget is ${total_budget:,}.\n"
    f"The average math score is {average_math_score:.2f}, "
    f"and the average reading score is {average_reading_score:.2f}."
)
print(district_overview)

These 15 schools have 39,170 students.
Their total budget is $24,649,428.
The average math score is 78.99, and the average reading score is 81.88.


In [9]:
# Summing a boolean column counts its True values, i.e. the number of passing students.
math_pass_count, reading_pass_count, both_pass_count = (
    school_data_complete_df[flag].sum() for flag in ("pass_math", "pass_reading", "pass_both")
)

# Convert the counts into percentages of all students.
passing_math_percentage = math_pass_count / student_count * 100
passing_reading_percentage = reading_pass_count / student_count * 100
overall_passing_percentage = both_pass_count / student_count * 100

# Quick human-readable check of the three rates.
pass_rate_report = (
    f"{passing_math_percentage:.2f}% of students passed math.\n"
    f"{passing_reading_percentage:.2f}% of students passed reading.\n"
    f"{overall_passing_percentage:.2f}% of students passed both."
)
print(pass_rate_report)

74.98% of students passed math.
85.81% of students passed reading.
65.17% of students passed both.


In [10]:
# One-row DataFrame holding the requested district summary; each key becomes a column.
district_summary_df = pd.DataFrame({
    "Total Schools": [school_count],
    "Total Students": [student_count],
    "Total Budget": [total_budget],
    "Average Math Score": [average_math_score],
    "Average Reading Score": [average_reading_score],
    "% Passing Math": [passing_math_percentage],
    "% Passing Reading": [passing_reading_percentage],
    "% Overall Passing": [overall_passing_percentage],
})
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [11]:
# Format string per district-summary column (single-use lookup table).
# Columns not listed here ("Total Schools") are left unformatted.
one_decimal_fmt = "{:.1f}"
whole_number_fmt = "{:.0f}"
disumcolforms = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Math Score": one_decimal_fmt,
    "Average Reading Score": one_decimal_fmt,
    "% Passing Math": whole_number_fmt,
    "% Passing Reading": whole_number_fmt,
    "% Overall Passing": whole_number_fmt,
}
# i disagree with requested choices: percentages should be formatted to end with "%" symbol

In [12]:
# Apply each column's format string for readability. This turns the numeric values into
# strings, so the formatted columns can no longer be used for arithmetic afterwards.
for col_name, col_format in disumcolforms.items():
    district_summary_df[col_name] = district_summary_df[col_name].map(col_format.format)
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


## Generate back-end school dataframe.

In [13]:
# Per-school values of the 5 performance stats: average reading/math scores and the
# math/reading/both passing rates (the mean of a boolean column is the fraction of Trues).
# The columns are selected BEFORE aggregating: since pandas 2.0, groupby().mean() no
# longer silently drops non-numeric columns (student_name, gender, grade, type) and
# raises a TypeError instead.
school_metric_cols = ["reading_score", "math_score", "pass_math", "pass_reading", "pass_both"]
school_avgs_df = school_data_complete_df.groupby("school_name")[school_metric_cols].mean()
school_avgs_df

Unnamed: 0_level_0,reading_score,math_score,pass_math,pass_reading,pass_both
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bailey High School,81.033963,77.048432,0.666801,0.819333,0.546423
Cabrera High School,83.97578,83.061895,0.941335,0.970398,0.913348
Figueroa High School,81.15802,76.711767,0.659885,0.807392,0.532045
Ford High School,80.746258,77.102592,0.683096,0.79299,0.542899
Griffin High School,83.816757,83.351499,0.933924,0.97139,0.905995
Hernandez High School,80.934412,77.289752,0.66753,0.80863,0.535275
Holden High School,83.814988,83.803279,0.925059,0.962529,0.892272
Huang High School,81.182722,76.629414,0.656839,0.813164,0.535139
Johnson High School,80.966394,77.072464,0.660576,0.812224,0.535392
Pena High School,84.044699,83.839917,0.945946,0.959459,0.905405


In [14]:
# Attach the per-school averages to the data from the schools file.
# "school_name" is a column of school_data_df and the index name of school_avgs_df;
# a single key is enough (it was previously listed twice).
school_summary_df = pd.merge(school_data_df, school_avgs_df, on="school_name")
school_summary_df

Unnamed: 0,School ID,school_name,type,size,budget,reading_score,math_score,pass_math,pass_reading,pass_both
0,0,Huang High School,District,2917,1910635,81.182722,76.629414,0.656839,0.813164,0.535139
1,1,Figueroa High School,District,2949,1884411,81.15802,76.711767,0.659885,0.807392,0.532045
2,2,Shelton High School,Charter,1761,1056600,83.725724,83.359455,0.938671,0.958546,0.898921
3,3,Hernandez High School,District,4635,3022020,80.934412,77.289752,0.66753,0.80863,0.535275
4,4,Griffin High School,Charter,1468,917500,83.816757,83.351499,0.933924,0.97139,0.905995
5,5,Wilson High School,Charter,2283,1319574,83.989488,83.274201,0.938677,0.965396,0.905826
6,6,Cabrera High School,Charter,1858,1081356,83.97578,83.061895,0.941335,0.970398,0.913348
7,7,Bailey High School,District,4976,3124928,81.033963,77.048432,0.666801,0.819333,0.546423
8,8,Holden High School,Charter,427,248087,83.814988,83.803279,0.925059,0.962529,0.892272
9,9,Pena High School,Charter,962,585858,84.044699,83.839917,0.945946,0.959459,0.905405


In [15]:
# Derive per-student spending (budget / size), index the frame by school name (dropping
# the numeric index and School ID), and put the columns in the order wanted downstream.
summary_columns = ["type", "size", "budget", "per_cap",
                   "math_score", "reading_score", "pass_math", "pass_reading", "pass_both"]
school_summary_df = (
    school_summary_df
    .assign(per_cap=school_summary_df["budget"] / school_summary_df["size"])
    .set_index("school_name")
    .loc[:, summary_columns]
)
school_summary_df

Unnamed: 0_level_0,type,size,budget,per_cap,math_score,reading_score,pass_math,pass_reading,pass_both
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,0.656839,0.813164,0.535139
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,0.659885,0.807392,0.532045
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,0.938671,0.958546,0.898921
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,0.66753,0.80863,0.535275
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,0.933924,0.97139,0.905995
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,0.938677,0.965396,0.905826
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,0.941335,0.970398,0.913348
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,0.666801,0.819333,0.546423
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,0.925059,0.962529,0.892272
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,0.945946,0.959459,0.905405


## Prepare school performance report

In [17]:
# Work on a separate copy: the presentation formatting applied below would interfere
# with later numeric use of school_summary_df. rename_axis returns a new object, and
# passing None drops the index name, as the requested output has an unnamed index column.
pretty_school_summary_df = school_summary_df.rename_axis(None)
# Explicit old-name -> new-name mapping, so it is visible exactly which column becomes
# what (the shortcut would be assigning df.columns = new_col_names directly).
old_col_names = ['type', 'size', 'budget', 'per_cap', 'math_score', 'reading_score',
                 'pass_math', 'pass_reading', 'pass_both']
new_col_names = ["School Type", "Total Students", "Total School Budget", "Per Student Budget",
                 "Average Math Score", "Average Reading Score", "% Passing Math",
                 "% Passing Reading", "% Overall Passing"]
# pair the two lists up position by position
col_rename_dic = dict(zip(old_col_names, new_col_names))
col_rename_dic

{'type': 'School Type',
 'size': 'Total Students',
 'budget': 'Total School Budget',
 'per_cap': 'Per Student Budget',
 'math_score': 'Average Math Score',
 'reading_score': 'Average Reading Score',
 'pass_math': '% Passing Math',
 'pass_reading': '% Passing Reading',
 'pass_both': '% Overall Passing'}

In [18]:
# Swap the internal column names for the requested presentation names.
pretty_school_summary_df = pretty_school_summary_df.rename(columns=col_rename_dic)
pretty_school_summary_df

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,0.656839,0.813164,0.535139
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,0.659885,0.807392,0.532045
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,0.938671,0.958546,0.898921
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,0.66753,0.80863,0.535275
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,0.933924,0.97139,0.905995
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,0.938677,0.965396,0.905826
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,0.941335,0.970398,0.913348
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,0.666801,0.819333,0.546423
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,0.925059,0.962529,0.892272
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,0.945946,0.959459,0.905405


In [19]:
# Only the two budget columns need formatting, and both use the same currency format.
# After this step those columns hold strings, not numbers.
as_dollars = "${:,.2f}".format
for colnam in ("Total School Budget", "Per Student Budget"):
    pretty_school_summary_df[colnam] = pretty_school_summary_df[colnam].map(as_dollars)
pretty_school_summary_df

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,0.656839,0.813164,0.535139
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,0.659885,0.807392,0.532045
Shelton High School,Charter,1761,"$1,056,600.00",$600.00,83.359455,83.725724,0.938671,0.958546,0.898921
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,0.66753,0.80863,0.535275
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,0.933924,0.97139,0.905995
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,0.938677,0.965396,0.905826
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,0.941335,0.970398,0.913348
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,0.666801,0.819333,0.546423
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,0.925059,0.962529,0.892272
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,0.945946,0.959459,0.905405


In [20]:
# Top performers: "% Overall Passing" is the key metric, so rank by it, best first.
pretty_school_summary_df.sort_values(by="% Overall Passing", ascending=False).head(5)

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,0.941335,0.970398,0.913348
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.418349,83.84893,0.932722,0.973089,0.90948
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,0.933924,0.97139,0.905995
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,0.938677,0.965396,0.905826
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,0.945946,0.959459,0.905405


In [21]:
# Bottom performers: same metric in ascending order, so the weakest school comes first.
pretty_school_summary_df.sort_values(by="% Overall Passing", ascending=True).head(5)

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.842711,80.744686,0.663666,0.802201,0.529882
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,0.659885,0.807392,0.532045
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,0.656839,0.813164,0.535139
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,0.66753,0.80863,0.535275
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,0.660576,0.812224,0.535392


## School-by-grade tables for math and reading average scores. 

In [22]:
# Per-(school, grade) values of the 5 performance stats: average reading/math scores and
# the math/reading/both passing rates (the mean of a boolean column is the fraction of
# Trues). The columns are selected BEFORE aggregating because pandas >= 2.0 raises a
# TypeError when groupby().mean() meets string columns such as student_name or gender.
# reset_index() (not inplace) turns the group keys back into ordinary columns without
# mutating an intermediate slice.
grade_metric_cols = ["reading_score", "math_score", "pass_math", "pass_reading", "pass_both"]
school_grade_avgs_df = (
    school_data_complete_df
    .groupby(["school_name", "grade"])[grade_metric_cols]
    .mean()
    .reset_index()
)
school_grade_avgs_df.head()

Unnamed: 0,school_name,grade,reading_score,math_score,pass_math,pass_reading,pass_both
0,Bailey High School,10th,80.907183,76.996772,0.663438,0.835351,0.556901
1,Bailey High School,11th,80.945643,77.515588,0.684253,0.805755,0.552358
2,Bailey High School,12th,80.912451,76.492218,0.642996,0.81323,0.520428
3,Bailey High School,9th,81.303155,77.083676,0.671468,0.821674,0.550754
4,Cabrera High School,10th,84.253219,83.154506,0.939914,0.974249,0.914163


In [23]:
# School x grade table of average reading scores; both axis names are cleared so the
# displayed table has no "school_name" / "grade" header labels.
reading_scores_by_grade = (
    school_grade_avgs_df
    .pivot(index="school_name", columns="grade", values="reading_score")
    .rename_axis(index=None, columns=None)
)
reading_scores_by_grade

Unnamed: 0,10th,11th,12th,9th
Bailey High School,80.907183,80.945643,80.912451,81.303155
Cabrera High School,84.253219,83.788382,84.287958,83.676136
Figueroa High School,81.408912,80.640339,81.384863,81.198598
Ford High School,81.262712,80.403642,80.662338,80.632653
Griffin High School,83.706897,84.288089,84.013699,83.369193
Hernandez High School,80.660147,81.39614,80.857143,80.86686
Holden High School,83.324561,83.815534,84.698795,83.677165
Huang High School,81.512386,81.417476,80.305983,81.290284
Johnson High School,80.773431,80.616027,81.227564,81.260714
Pena High School,83.612,84.335938,84.59116,83.807273


In [24]:
# School x grade table of average math scores; both axis names are cleared so the
# displayed table has no "school_name" / "grade" header labels.
math_scores_by_grade = (
    school_grade_avgs_df
    .pivot(index="school_name", columns="grade", values="math_score")
    .rename_axis(index=None, columns=None)
)
math_scores_by_grade

Unnamed: 0,10th,11th,12th,9th
Bailey High School,76.996772,77.515588,76.492218,77.083676
Cabrera High School,83.154506,82.76556,83.277487,83.094697
Figueroa High School,76.539974,76.884344,77.151369,76.403037
Ford High School,77.672316,76.918058,76.179963,77.361345
Griffin High School,84.229064,83.842105,83.356164,82.04401
Hernandez High School,77.337408,77.136029,77.186567,77.438495
Holden High School,83.429825,85.0,82.855422,83.787402
Huang High School,75.908735,76.446602,77.225641,77.027251
Johnson High School,76.691117,77.491653,76.863248,77.187857
Pena High School,83.372,84.328125,84.121547,83.625455


In [25]:
# The requested output wants one decimal place and the grades in natural order
# (the pivot sorts the labels as strings, which puts "9th" last).
# i'd totally use column.name or index.name for table title...

# column names in order:
ordcolnam = ['9th', '10th', '11th', '12th']
one_decimal = "{:,.1f}".format

# Reorder and format in one expression that builds a NEW frame, instead of assigning
# columns onto the result of df[ordcolnam]; assigning into such a selection is what
# triggers pandas' SettingWithCopyWarning. The values become strings.
reading_scores_by_grade = reading_scores_by_grade[ordcolnam].apply(
    lambda scores: scores.map(one_decimal)
)
math_scores_by_grade = math_scores_by_grade[ordcolnam].apply(
    lambda scores: scores.map(one_decimal)
)

In [26]:
# Display the formatted school-by-grade table of average reading scores.
reading_scores_by_grade

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.3,80.9,80.9,80.9
Cabrera High School,83.7,84.3,83.8,84.3
Figueroa High School,81.2,81.4,80.6,81.4
Ford High School,80.6,81.3,80.4,80.7
Griffin High School,83.4,83.7,84.3,84.0
Hernandez High School,80.9,80.7,81.4,80.9
Holden High School,83.7,83.3,83.8,84.7
Huang High School,81.3,81.5,81.4,80.3
Johnson High School,81.3,80.8,80.6,81.2
Pena High School,83.8,83.6,84.3,84.6


In [27]:
# Display the formatted school-by-grade table of average math scores.
math_scores_by_grade

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.1,77.0,77.5,76.5
Cabrera High School,83.1,83.2,82.8,83.3
Figueroa High School,76.4,76.5,76.9,77.2
Ford High School,77.4,77.7,76.9,76.2
Griffin High School,82.0,84.2,83.8,83.4
Hernandez High School,77.4,77.3,77.1,77.2
Holden High School,83.8,83.4,85.0,82.9
Huang High School,77.0,75.9,76.4,77.2
Johnson High School,77.2,76.7,77.5,76.9
Pena High School,83.6,83.4,84.3,84.1


## Back to the back-end schools dataframe: spending and size bins

In [38]:
# just to see what we're working with
# NOTE(review): the saved output of this cell already shows a per_cap_bin column, which
# is only created by a later cell - the output comes from an earlier kernel state and
# will not include that column on a fresh Restart & Run All.
school_summary_df

Unnamed: 0_level_0,type,size,budget,per_cap,math_score,reading_score,pass_math,pass_reading,pass_both,per_cap_bin
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,0.656839,0.813164,0.535139,"(645, 675]"
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,0.659885,0.807392,0.532045,"(630, 645]"
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,0.938671,0.958546,0.898921,"(585, 630]"
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,0.66753,0.80863,0.535275,"(645, 675]"
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,0.933924,0.97139,0.905995,"(585, 630]"
Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,0.938677,0.965396,0.905826,"(0, 585]"
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,0.941335,0.970398,0.913348,"(0, 585]"
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,0.666801,0.819333,0.546423,"(585, 630]"
Holden High School,Charter,427,248087,581.0,83.803279,83.814988,0.925059,0.962529,0.892272,"(0, 585]"
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,0.945946,0.959459,0.905405,"(585, 630]"


### Spending bins

In [39]:
# Summary statistics of per-student spending, used to choose the spending bin edges.
school_summary_df["per_cap"].describe()

count     15.000000
mean     620.066667
std       28.544368
min      578.000000
25%      591.500000
50%      628.000000
75%      641.500000
max      655.000000
Name: per_cap, dtype: float64

In [40]:
# Bin edges for per-student spending (per_cap), chosen from the describe() output above.
# pd.cut treats them as right-inclusive intervals: (0, 585], (585, 630], ...
# Alternative that was considered first (equal-width bins):
#spending_bins = [0, 585, 615, 645, 675]   #equal width
spending_bins = [0, 585, 630, 645, 675]   #equal counts, later choice

In [62]:
# Bucket every school by per-student spending and average the school-level numbers per
# bucket. These are UNWEIGHTED means of per-school values: each school counts equally,
# so students in small schools contribute more than students in large ones.
performetrics = ['math_score', 'reading_score', 'pass_math', 'pass_reading', 'pass_both']
school_summary_df["per_cap_bin"] = pd.cut(school_summary_df["per_cap"], spending_bins)
# numeric_only=True: school_summary_df also holds the string column "type" (and, on a
# re-run, other categorical bin columns); pandas >= 2.0 raises on those instead of
# dropping them. observed=False keeps every bin in the result and makes the categorical
# groupby behaviour explicit rather than relying on a default that pandas is changing.
spending_summary_df = (
    school_summary_df
    .groupby("per_cap_bin", observed=False)
    .mean(numeric_only=True)
)
spending_summary_df

Unnamed: 0_level_0,size,budget,per_cap,math_score,reading_score,pass_math,pass_reading,pass_both
per_cap_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(0, 585]",1592.0,924604.2,581.0,83.455399,83.933814,0.934601,0.966109,0.903695
"(585, 630]",2291.75,1421222.0,615.5,81.899826,83.155286,0.871335,0.927182,0.814186
"(630, 645]",2830.5,1809705.0,639.5,78.518855,81.624473,0.734842,0.843918,0.628577
"(645, 675]",4104.333333,2675768.0,652.333333,76.99721,81.027843,0.661648,0.81134,0.535269


In [63]:
# Keep only the five performance metrics; .copy() detaches the result from the groupby
# output so later column assignments act on an independent frame.
spending_summary_df = spending_summary_df.loc[:, performetrics].copy()

In [64]:
# Presentation names for the five performance metrics (reused for the size summary).
col_ren_dict = {
    'math_score': 'Average Math Score',
    'reading_score': 'Average Reading Score',
    'pass_math': '% Passing Math',
    'pass_reading': '% Passing Reading',
    'pass_both': '% Overall Passing',
}
spending_summary_df = spending_summary_df.rename(columns=col_ren_dict)
spending_summary_df

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
per_cap_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(0, 585]",83.455399,83.933814,0.934601,0.966109,0.903695
"(585, 630]",81.899826,83.155286,0.871335,0.927182,0.814186
"(630, 645]",78.518855,81.624473,0.734842,0.843918,0.628577
"(645, 675]",76.99721,81.027843,0.661648,0.81134,0.535269


In [65]:
# The passing rates are stored as fractions (0-1); scale them to percentages (0-100).
# Note: running this cell twice scales the columns twice.
off_by_100 = ["% Passing Math", "% Passing Reading", "% Overall Passing"]
spending_summary_df[off_by_100] = spending_summary_df[off_by_100] * 100

In [66]:
# Human-readable labels for the four spending bins, in bin order. The third label now
# carries the "$" prefix like the others.
bin_labels = ["<$586", "$586-630", "$631-645", "$646-675"]
# Assign the labelled index directly: set_axis(..., inplace=True) and
# rename_axis(..., inplace=True) depend on an `inplace` option that pandas 2.0 removed
# from set_axis, and wrapping the labels in another list is unnecessary. Direct
# assignment works across pandas versions and is safe to re-run.
spending_summary_df.index = pd.Index(bin_labels, name="Spending Ranges (Per Student)")

In [67]:
# Display the finished spending summary (metrics per per-student spending range).
spending_summary_df

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$586,83.455399,83.933814,93.460096,96.610877,90.369459
$586-630,81.899826,83.155286,87.133538,92.718205,81.418596
631-645,78.518855,81.624473,73.484209,84.391793,62.857656
$646-675,76.99721,81.027843,66.164813,81.133951,53.526855


### Size bins

In [68]:
# School-size bins suggested by the module, written as (upper edge, label) pairs so
# each label sits next to the edge it belongs to.
size_bin_spec = [
    (999, "Small (<1000)"),
    (1999, "Medium (1000-1999)"),
    (5000, "Large (2000-5000)"),
]
size_bins = [0] + [upper_edge for upper_edge, _ in size_bin_spec]
group_names = [label for _, label in size_bin_spec]

In [70]:
# Categorize each school by enrollment ("size") using the size bins and their labels.
school_summary_df["School Size"] = pd.cut(
    x=school_summary_df["size"], bins=size_bins, labels=group_names
)


Unnamed: 0_level_0,type,size,budget,per_cap,math_score,reading_score,pass_math,pass_reading,pass_both,per_cap_bin,School Size
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Huang High School,District,2917,1910635,655.0,76.629414,81.182722,0.656839,0.813164,0.535139,"(645, 675]",Large (2000-5000)
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,0.659885,0.807392,0.532045,"(630, 645]",Large (2000-5000)
Shelton High School,Charter,1761,1056600,600.0,83.359455,83.725724,0.938671,0.958546,0.898921,"(585, 630]",Medium (1000-1999)
Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,0.66753,0.80863,0.535275,"(645, 675]",Large (2000-5000)
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,0.933924,0.97139,0.905995,"(585, 630]",Medium (1000-1999)


In [76]:
# Unweighted per-size-category means of the school-level numbers.
# numeric_only=True skips the string column "type" and the categorical "per_cap_bin"
# column, which pandas >= 2.0 would otherwise raise a TypeError on; observed=False
# keeps all size categories and makes the categorical groupby behaviour explicit.
sizes_summary_df = (
    school_summary_df
    .groupby("School Size", observed=False)
    .mean(numeric_only=True)
)


Unnamed: 0_level_0,size,budget,per_cap,math_score,reading_score,pass_math,pass_reading,pass_both
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Small (<1000),694.5,416972.5,595.0,83.821598,83.929843,0.935502,0.960994,0.898839
Medium (1000-1999),1704.4,1029597.2,605.6,83.374684,83.864438,0.935997,0.967907,0.906215
Large (2000-5000),3657.375,2333437.125,635.375,77.746417,81.344493,0.699634,0.827666,0.58286


In [81]:
# Keep only the five performance metrics, as an independent copy of the groupby result.
size_summary_df = sizes_summary_df.loc[:, performetrics].copy()
size_summary_df

Unnamed: 0_level_0,math_score,reading_score,pass_math,pass_reading,pass_both
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,0.935502,0.960994,0.898839
Medium (1000-1999),83.374684,83.864438,0.935997,0.967907,0.906215
Large (2000-5000),77.746417,81.344493,0.699634,0.827666,0.58286


In [82]:
# Apply the same presentation column names that the spending summary uses.
size_summary_df = size_summary_df.rename(columns=col_ren_dict)
size_summary_df

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,0.935502,0.960994,0.898839
Medium (1000-1999),83.374684,83.864438,0.935997,0.967907,0.906215
Large (2000-5000),77.746417,81.344493,0.699634,0.827666,0.58286


In [83]:
# Scale the passing rates from fractions (0-1) to percentages (0-100).
# Note: running this cell twice scales the columns twice.
size_summary_df[off_by_100] = size_summary_df[off_by_100] * 100
size_summary_df

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,93.550225,96.099437,89.883853
Medium (1000-1999),83.374684,83.864438,93.599695,96.79068,90.621535
Large (2000-5000),77.746417,81.344493,69.963361,82.766634,58.286003


In [None]:
# they want formatting, too

### district vs charter breakdown 

In [86]:
# Unweighted per-school-type (Charter vs District) means of the five performance metrics.
# The metric columns are selected BEFORE aggregating: by this point school_summary_df
# also holds categorical bin columns ("per_cap_bin", "School Size"), and pandas >= 2.0
# raises a TypeError when asked for their mean instead of silently dropping them.
type_summary_df = school_summary_df.groupby("type")[performetrics].mean()
type_summary_df

Unnamed: 0_level_0,math_score,reading_score,pass_math,pass_reading,pass_both
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,0.936208,0.965865,0.904322
District,76.956733,80.966636,0.665485,0.807991,0.536722


In [None]:
# rename columns, rescale rates, format numbers into strings