In [1]:
#Dependencies and Setup
import pandas as pd
import numpy as np

In [2]:
#Load to files
schools_csv_path = "Resources/schools_complete.csv"
students_csv_path = "Resources/students_complete.csv"

In [3]:
#Read the files and store into dataframes
schools_df = pd.read_csv(schools_csv_path)
students_df = pd.read_csv(students_csv_path)

In [4]:
#combine the data into a single dataset 
combined_df = pd.merge(students_df, schools_df, on="school_name")

In [5]:
# Calculate the total number of backers for all US projects (from kickstarter activity)
total_students = students_df['student_name'].count()

In [6]:
average_math_score = students_df["math_score"].mean()

In [7]:
average_read_score = students_df["reading_score"].mean()

In [8]:
total_budget = schools_df["budget"].sum()

In [9]:
total_schools = schools_df["school_name"].count()

In [10]:
pass_read = combined_df["student_name"].loc[combined_df['reading_score'] >= 70].count()
pass_read_pct = pass_read / total_students * 100

In [11]:
pass_math = combined_df["student_name"].loc[combined_df["math_score"] >= 70].count()
pass_math_pct = pass_math / total_students * 100

In [12]:
pass_all = combined_df["student_name"].loc[(combined_df["math_score"] >= 70) & (combined_df["reading_score"] >= 70)].count()
overall_pass = pass_all / total_students * 100

In [13]:
#create a (data frame) summary with Total Schools, Total Students, Total Budget, Average Math Score, Average Reading Score, %Passing Math, %Passing Reading %Overall Passing
summary_schools_df = pd.DataFrame({"Total Schools": [total_schools], 
                                   "Total Students": total_students, 
                                   "Total Budget": total_budget, 
                                   "Average Math Score": average_math_score,
                                   "Average Reading Score": average_read_score,
                                   "% Passing Math": pass_math_pct,"% Passing Reading": pass_read_pct, 
                                   "% Overall Passing": overall_pass})

In [14]:
summary_schools_df["Total Budget"] = summary_schools_df["Total Budget"].astype(float).map("${:,.2f}".format)
summary_schools_df["Average Math Score"] = summary_schools_df["Average Math Score"].map("{:.0f}".format)
summary_schools_df["Average Reading Score"] = summary_schools_df["Average Reading Score"].map("{:.0f}".format)
summary_schools_df["% Passing Math"] = summary_schools_df["% Passing Math"].map("{:.0f}%".format)
summary_schools_df["% Passing Reading"] = summary_schools_df["% Passing Reading"].map("{:.0f}%".format)
summary_schools_df["% Overall Passing"] = summary_schools_df["% Overall Passing"].map("{:.0f}%".format)

In [15]:
print("")
print("")
print("District Summary")
summary_schools_df



District Summary


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79,82,75%,86%,65%


In [16]:
#add boolean series for passing reading and passing math 
combined_df["pass_math"] = combined_df["student_name"].loc[combined_df["math_score"] >= 70]
combined_df["pass_reading"] = combined_df["student_name"].loc[combined_df["math_score"]>= 70]
combined_df["pass_both"] = combined_df["student_name"].loc[(combined_df["math_score"] >= 70) & (combined_df["reading_score"] >= 70)]

In [17]:
#Create a dataframe with metrics about each school it should groupby school
school_group = combined_df.groupby(["school_name"])
school_group_df = school_group.sum()
school_group_df = school_group_df[["reading_score","math_score"]]

In [18]:
stats_group = combined_df.groupby(["school_name"])
stats_group_df = stats_group.count()

In [19]:
#drop student ID, gender, grade, reading score, math score, school ID, type
stats_group_df = stats_group_df[["pass_math","pass_reading","pass_both"]]

In [20]:
passing_df = pd.merge(stats_group_df, school_group_df, on="school_name")

In [21]:
best_worst_df = pd.merge(passing_df, schools_df, on="school_name")

In [22]:
#add column with per student budget
best_worst_df["per_student_budget"] = best_worst_df["budget"] / best_worst_df["size"]

In [23]:
#add column with average math score
best_worst_df["average_math"] = best_worst_df["math_score"] / best_worst_df["size"]

In [24]:
#add column with average reading scores
best_worst_df["average_reading"] = best_worst_df["reading_score"] / best_worst_df["size"]

In [25]:
#column with % passing reading
best_worst_df["pass_reading_pct"] = best_worst_df["pass_reading"] / best_worst_df["size"] *100

In [26]:
#add column with % passing math
best_worst_df["pass_math_pct"] = best_worst_df["pass_math"] / best_worst_df["size"] *100

In [27]:
#add column with % passing both
best_worst_df["pass_both_pct"] = best_worst_df["pass_both"] / best_worst_df["size"] *100

In [28]:
#rename headers School Name, School Type, Total Students, Total School Budget, Per Student Budget
#Average Math Score, Average Reading Score, % Passing Math (The percentage of students that passed math.)
#% Passing Reading (The percentage of students that passed reading.), % Overall Passing (The percentage of students that passed math **and** reading.)
rename_best_worst_df = best_worst_df.rename(columns={"school_name":"School Name",
                                              "size":"Total Students", 
                                              "budget":"Total School Budget", 
                                              "per_student_budget":"Per Student Budget",
                                              "average_math":"Average Math Score", 
                                              "average_reading":"Average Reading Score",
                                              "pass_math_pct":"% Passing Math",
                                              "pass_reading_pct":"% Passing Reading", 
                                              "pass_both_pct":"% Overall Passing",
                                             "type":"School Type"})

In [29]:
rename_best_worst_df["Total School Budget"] = rename_best_worst_df["Total School Budget"].astype(float).map("${:,.2f}".format)
rename_best_worst_df["Per Student Budget"] = rename_best_worst_df["Per Student Budget"].astype(float).map("${:,.2f}".format)
rename_best_worst_df["Total Students"] = rename_best_worst_df["Total Students"].map("{:,}".format)
rename_best_worst_df["Average Math Score"] = rename_best_worst_df["Average Math Score"].map("{:.0f}".format)
rename_best_worst_df["Average Reading Score"] = rename_best_worst_df["Average Reading Score"].map("{:.0f}".format)
rename_best_worst_df["% Passing Math"] = rename_best_worst_df["% Passing Math"].map("{:.0f}%".format)
rename_best_worst_df["% Passing Reading"] = rename_best_worst_df["% Passing Reading"].map("{:.0f}%".format)
rename_best_worst_df["% Overall Passing"] = rename_best_worst_df["% Overall Passing"].map("{:.0f}%".format)

In [30]:
#organize the columns to match the order
final_best_worst_df = rename_best_worst_df[["School Name", "School Type", 
                                             "Total Students", "Total School Budget", 
                                             "Per Student Budget", "Average Math Score", 
                                             "Average Reading Score", "% Passing Math",
                                             "% Passing Reading","% Overall Passing"]]

In [31]:
print("")
print("")
print("School Summary")
final_best_worst_df.set_index("School Name")



School Summary


Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,"$3,124,928.00",$628.00,77,81,67%,67%,55%
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83,84,94%,94%,91%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,77,81,66%,66%,53%
Ford High School,District,2739,"$1,763,916.00",$644.00,77,81,68%,68%,54%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83,84,93%,93%,91%
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77,81,67%,67%,54%
Holden High School,Charter,427,"$248,087.00",$581.00,84,84,93%,93%,89%
Huang High School,District,2917,"$1,910,635.00",$655.00,77,81,66%,66%,54%
Johnson High School,District,4761,"$3,094,650.00",$650.00,77,81,66%,66%,54%
Pena High School,Charter,962,"$585,858.00",$609.00,84,84,95%,95%,91%


In [32]:
#to sort from highest to lowest ascending=False must be passed in
best_df = final_best_worst_df.sort_values("% Overall Passing",ascending=False)

In [33]:
print("")
print("")
print("Top Performing Schools (by % Overall Passing)")
best_df.set_index("School Name")
best_df.head(5)



Top Performing Schools (by % Overall Passing)


Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
1,Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83,84,94%,94%,91%
4,Griffin High School,Charter,1468,"$917,500.00",$625.00,83,84,93%,93%,91%
9,Pena High School,Charter,962,"$585,858.00",$609.00,84,84,95%,95%,91%
12,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83,84,93%,93%,91%
13,Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83,84,94%,94%,91%


In [34]:
#sort the schools by ascending order and display the top five bottom performing schools by overall passing
worst_df = final_best_worst_df.sort_values("% Overall Passing",ascending=True)
print("")
print("")
print("Bottom Pefroming Schools (by % Overall Passing)")
worst_df.head(5)



Bottom Pefroming Schools (by % Overall Passing)


Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
2,Figueroa High School,District,2949,"$1,884,411.00",$639.00,77,81,66%,66%,53%
10,Rodriguez High School,District,3999,"$2,547,363.00",$637.00,77,81,66%,66%,53%
3,Ford High School,District,2739,"$1,763,916.00",$644.00,77,81,68%,68%,54%
5,Hernandez High School,District,4635,"$3,022,020.00",$652.00,77,81,67%,67%,54%
7,Huang High School,District,2917,"$1,910,635.00",$655.00,77,81,66%,66%,54%


In [35]:
r_ninth_grade_df = combined_df[["school_name","reading_score"]].loc[combined_df["grade"] == "9th"]
r_ninth_grade_df.set_index("school_name")
r_ninth_grade_df = r_ninth_grade_df.rename(columns = {"reading_score":"9th","school_name":"School Name"})

In [36]:
m_ninth_grade_df = combined_df[["school_name","math_score"]].loc[combined_df["grade"] == "9th"]
m_ninth_grade_df.set_index("school_name")
m_ninth_grade_df = m_ninth_grade_df.rename(columns = {"math_score":"9th","school_name":"School Name"})

In [37]:
r_tenth_grade_df = combined_df[["school_name","reading_score"]].loc[combined_df["grade"] == "10th"]
r_tenth_grade_df.set_index("school_name")
r_tenth_grade_df = r_tenth_grade_df.rename(columns = {"reading_score":"10th","school_name":"School Name"})

In [38]:
m_tenth_grade_df = combined_df[["school_name","math_score"]].loc[combined_df["grade"] == "10th"]
m_tenth_grade_df.set_index("school_name")
m_tenth_grade_df = m_tenth_grade_df.rename(columns = {"math_score":"10th","school_name":"School Name"})

In [39]:
r_eleventh_grade_df = combined_df[["school_name","reading_score"]].loc[combined_df["grade"] == "11th"]
r_eleventh_grade_df.set_index("school_name")
r_eleventh_grade_df = r_eleventh_grade_df.rename(columns = {"reading_score":"11th","school_name":"School Name"})

In [40]:
m_eleventh_grade_df = combined_df[["school_name","math_score"]].loc[combined_df["grade"] == "11th"]
m_eleventh_grade_df.set_index("school_name")
m_eleventh_grade_df = m_eleventh_grade_df.rename(columns = {"math_score":"11th","school_name":"School Name"})

In [41]:
r_twelfth_grade_df = combined_df[["school_name","reading_score"]].loc[combined_df["grade"] == "12th"]
r_twelfth_grade_df.set_index("school_name")
r_twelfth_grade_df = r_twelfth_grade_df.rename(columns = {"reading_score":"12th","school_name":"School Name"})

In [42]:
m_twelfth_grade_df = combined_df[["school_name","math_score"]].loc[combined_df["grade"] == "12th"]
m_twelfth_grade_df.set_index("school_name")
m_twelfth_grade_df = m_twelfth_grade_df.rename(columns = {"math_score":"12th","school_name":"School Name"})

In [43]:
r_ninth_group = r_ninth_grade_df.groupby(["School Name"])
r_ninth = r_ninth_group.mean()

In [44]:
m_ninth_group = m_ninth_grade_df.groupby(["School Name"])
m_ninth = m_ninth_group.mean()

In [45]:
r_tenth_group = r_tenth_grade_df.groupby(["School Name"])
r_tenth = r_tenth_group.mean()
reading_by_grade = pd.merge(r_ninth, r_tenth, on="School Name")

In [46]:
m_tenth_group = m_tenth_grade_df.groupby(["School Name"])
m_tenth = m_tenth_group.mean()
math_by_grade = pd.merge(m_ninth, m_tenth, on="School Name")

In [47]:
r_eleventh_group = r_eleventh_grade_df.groupby(["School Name"])
r_eleventh = r_eleventh_group.mean()
reading_by_grade = pd.merge(reading_by_grade, r_eleventh, on="School Name")

In [48]:
m_eleventh_group = m_eleventh_grade_df.groupby(["School Name"])
m_eleventh = m_eleventh_group.mean()
math_by_grade = pd.merge(math_by_grade, m_eleventh, on="School Name")

In [49]:
r_twelfth_group = r_twelfth_grade_df.groupby(["School Name"])
r_twelfth = r_twelfth_group.mean()
reading_by_grade = pd.merge(reading_by_grade, r_twelfth, on="School Name")

In [50]:
m_twelfth_group = m_twelfth_grade_df.groupby(["School Name"])
m_twelfth = m_twelfth_group.mean()
math_by_grade = pd.merge(math_by_grade, m_twelfth, on="School Name")

In [51]:
print("")
print("")
print("Math Scores by Grade")
math_by_grade



Math Scores by Grade


Unnamed: 0_level_0,9th,10th,11th,12th
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


In [52]:
print("")
print("")
print("Reading Scores by Grade")
reading_by_grade



Reading Scores by Grade


Unnamed: 0_level_0,9th,10th,11th,12th
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


In [53]:
#a table should include the following: Spending Ranges (Per Student), Average Math Score, Average Raading Score, % Passing Math, %Passing Reading, %Overall Passing
score_summary_df = best_worst_df[["school_name","per_student_budget","type","size",
                                  "average_math", "average_reading","pass_math_pct",
                                  "pass_reading_pct","pass_both_pct"]]

score_summary_df = score_summary_df.rename(columns={"average_math":"Average Math Score", 
                                              "average_reading":"Average Reading Score",
                                              "pass_math_pct":"% Passing Math",
                                              "pass_reading_pct":"% Passing Reading", 
                                              "pass_both_pct":"% Overall Passing"})

In [54]:
# Create bins for the scores by school spending
spending_bins = [0, 585, 630, 645, 680]

# Create labels for these bins
spending_labels = ["<$585", "$585-630", "$630-645", "$645-680"]

In [55]:
# Slice the data and place it into bins
score_summary_df["Spending Ranges (per student)"] = pd.cut(score_summary_df["per_student_budget"], spending_bins, labels=spending_labels)

In [56]:
# Create bins for scores by school size
size_bins = [0, 1000, 2000, 5000]

# Create labels for these bins
size_labels = ["Small (<1000)", "Medium(1000-2000)", "Large(2000-5000)"]

In [57]:
score_summary_df["School Size"] = pd.cut(score_summary_df["size"],size_bins, labels = size_labels)

In [58]:
#group by Spending Ranges (per student)
scores_by_spending = score_summary_df.groupby(["Spending Ranges (per student)"])
print("")
print("")
print("Scores by School Spending")
scores_by_spending.mean()



Scores by School Spending


Unnamed: 0_level_0,per_student_budget,size,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Spending Ranges (per student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
<$585,581.0,1592.0,83.455399,83.933814,93.460096,93.460096,90.369459
$585-630,615.5,2291.75,81.899826,83.155286,87.133538,87.133538,81.418596
$630-645,639.5,2830.5,78.518855,81.624473,73.484209,73.484209,62.857656
$645-680,652.333333,4104.333333,76.99721,81.027843,66.164813,66.164813,53.526855


In [59]:
#group by School Size
scores_by_size = score_summary_df.groupby(["School Size"])
print("")
print("")
print("Scores by School Size")
scores_by_size.mean()



Scores by School Size


Unnamed: 0_level_0,per_student_budget,size,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Small (<1000),595.0,694.5,83.821598,83.929843,93.550225,93.550225,89.883853
Medium(1000-2000),605.6,1704.4,83.374684,83.864438,93.599695,93.599695,90.621535
Large(2000-5000),635.375,3657.375,77.746417,81.344493,69.963361,69.963361,58.286003


In [60]:
#call up the best worst data frame and pull only the columns I need for the group by school type
scores_by_type_df = best_worst_df[["school_name","type","average_math", "average_reading","pass_math_pct","pass_reading_pct","pass_both_pct"]]

In [61]:
#Group by school type and display only scores for the scores by school type
scores_by_type_df = scores_by_type_df.rename(columns={"average_math":"Average Math Score", 
                                              "average_reading":"Average Reading Score",
                                              "pass_math_pct":"% Passing Math",
                                              "pass_reading_pct":"% Passing Reading", 
                                              "pass_both_pct":"% Overall Passing",
                                             "type":"School Type"})

scores_type_group = scores_by_type_df.groupby(["School Type"])
print("")
print("")
print("Scores by School Type")
scores_type_group.mean()



Scores by School Type


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,93.62083,90.432244
District,76.956733,80.966636,66.548453,66.548453,53.672208
