In [1]:
#import dependencies
import pandas as pd
import numpy as np

In [2]:
#import school data and get headers
schools_df = pd.read_csv("schools_complete.csv")
schools_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
#check for missing data - school districts
schools_df.count()

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [4]:
#import student data and get headers
students_df = pd.read_csv("students_complete.csv")
students_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [5]:
#check for missing data - students
students_df.count()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [6]:
#merge data frames together
merged = pd.merge(schools_df, students_df, on="school_name")
merged.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


In [7]:
#Get district summary 
#start with totals from merged dataframe
total_schools = merged["school_name"].count()
total_students = merged["student_name"].count()
total_budget = merged["budget"].sum()

#recast math and reading scores as int so can take average
merged["math_score"] = merged["math_score"].astype(int)
avg_math = round(merged["math_score"].mean(), 2)
merged["reading_score"] = merged["reading_score"].astype(int)
avg_read = round(merged["reading_score"].mean(), 2)

print(f"Total schools: {total_schools}")
print(f"Total students: {total_students}")
print(f"Total budget: {total_budget}")
print(f"Average Math Score: {avg_math}")
print(f"Average Reading Score: {avg_read}")

Total schools: 39170
Total students: 39170
Total budget: 82932329558
Average Math Score: 78.99
Average Reading Score: 81.88


In [8]:
#Figure out % passing by getting number of students with passing grade and dividing by total #

#math
math_passing = len(merged[merged["math_score"]>64])
percent_pass_math = round(math_passing/total_students*100, 2)

#reading
read_passing = len(merged[merged["reading_score"]>64])
percent_pass_read = round(read_passing/total_students*100, 2)

#overall
overall_pass = round((percent_pass_read + percent_pass_math)/2, 2)

#print values out
print(f"Percent that passed Math: {percent_pass_math}")
print(f"Percent that passed Reading: {percent_pass_read}")
print(f"Overall Passing Rate: {overall_pass}")


Percent that passed Math: 84.73
Percent that passed Reading: 96.2
Overall Passing Rate: 90.46


In [9]:
#put values into summary table (dataframe) for District
Summary_District = {"Total schools": total_schools, "Total students": total_students, 
                    "Total budget": total_budget, "Average Math Score": avg_math, 
                    "Average Reading Score": avg_read, "% Passed Math": percent_pass_math,
                   "% Passed Reading": percent_pass_read, "Overall Passing Rate": overall_pass}

Summary_District = pd.DataFrame(Summary_District, index=[0])
Summary_District

Unnamed: 0,Total schools,Total students,Total budget,Average Math Score,Average Reading Score,% Passed Math,% Passed Reading,Overall Passing Rate
0,39170,39170,82932329558,78.99,81.88,84.73,96.2,90.46


In [10]:
#Start building School Summary by doing group by school name on the merged dataframe
school_grouped = merged.groupby("school_name")

#check first entries in group formed
school_grouped.first()

Unnamed: 0_level_0,School ID,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bailey High School,7,District,4976,3124928,17871,Blake Martin,M,9th,75,59
Cabrera High School,6,Charter,1858,1081356,16013,Olivia Short,F,11th,94,94
Figueroa High School,1,District,2949,1884411,2917,Amy Jacobs,F,10th,85,87
Ford High School,13,District,2739,1763916,34796,Michael Mercado,M,9th,66,94
Griffin High School,4,Charter,1468,917500,12262,Heather Wright,F,11th,79,68
Hernandez High School,3,District,4635,3022020,7627,Russell Davis,M,10th,70,88
Holden High School,8,Charter,427,248087,22847,Daniel Rodriguez,M,11th,86,92
Huang High School,0,District,2917,1910635,0,Paul Bradley,M,9th,66,79
Johnson High School,12,District,4761,3094650,30035,Lisa Casey,F,12th,87,87
Pena High School,9,Charter,962,585858,23274,Alec Davis,M,9th,91,75


In [11]:
#already have school name, type, total students, total budget

#add column that calculates per student budget by school to original school df
per_student_budget = schools_df["budget"]/schools_df["size"]
schools_df["Per Student Budget"] = per_student_budget
schools_df.head()


Unnamed: 0,School ID,school_name,type,size,budget,Per Student Budget
0,0,Huang High School,District,2917,1910635,655.0
1,1,Figueroa High School,District,2949,1884411,639.0
2,2,Shelton High School,Charter,1761,1056600,600.0
3,3,Hernandez High School,District,4635,3022020,652.0
4,4,Griffin High School,Charter,1468,917500,625.0


In [14]:
#create df to store data grouped by school
grouped_school_df = merged.groupby("school_name")

#find mean of math scores by school
grouped_math = grouped_school_df["math_score"].mean()
grouped_math = pd.DataFrame(grouped_math)



#reset index so can merge with schools df
grouped_math = grouped_math.reset_index()
grouped_math = grouped_math.rename(columns={"math_score": "mean_math_score"})
grouped_math.head()

#merge mean math scores by school with rest of schools_df 
schools_df = pd.merge(schools_df, grouped_math, on="school_name")
schools_df
#find means of reading scores by school and add to schools df
#schools_df["mean read score"] = merged.groupby("school_name")["reading_score"].transform('mean')
#schools_df

Unnamed: 0,School ID,school_name,type,size,budget,Per Student Budget,mean_math_score_x,mean_math_score_y
0,0,Huang High School,District,2917,1910635,655.0,76.629414,76.629414
1,1,Figueroa High School,District,2949,1884411,639.0,76.711767,76.711767
2,2,Shelton High School,Charter,1761,1056600,600.0,83.359455,83.359455
3,3,Hernandez High School,District,4635,3022020,652.0,77.289752,77.289752
4,4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.351499
5,5,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.274201
6,6,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.061895
7,7,Bailey High School,District,4976,3124928,628.0,77.048432,77.048432
8,8,Holden High School,Charter,427,248087,581.0,83.803279,83.803279
9,9,Pena High School,Charter,962,585858,609.0,83.839917,83.839917


In [13]:
#filtered out failing scores - only passing scores left.
group = merged.loc[(merged["math_score"]>64)]
group


Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
5,0,Huang High School,District,2917,1910635,5,Bryan Miranda,M,9th,94,94
6,0,Huang High School,District,2917,1910635,6,Sheena Carter,F,11th,82,80
7,0,Huang High School,District,2917,1910635,7,Nicole Baker,F,12th,96,69
8,0,Huang High School,District,2917,1910635,8,Michael Roth,M,10th,95,87
9,0,Huang High School,District,2917,1910635,9,Matthew Greene,M,10th,96,84
10,0,Huang High School,District,2917,1910635,10,Andrew Alexander,M,10th,90,70
11,0,Huang High School,District,2917,1910635,11,Daniel Cooper,M,10th,78,77
12,0,Huang High School,District,2917,1910635,12,Brittney Walker,F,9th,64,79
