In [1]:
import pandas as pd

In [2]:
# The path to our CSV file
students_file = "raw_data/students_sample.csv"
schools_file = "raw_data/schools_sample.csv"

In [3]:
# Read our student data into pandas
students_df = pd.read_csv(students_file)

students_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [4]:
# Read our school data into pandas
schools_df = pd.read_csv(schools_file)

schools_df.head()

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
#  clean school table; prep for merging
#  name column needs to be school, same as student table
renamed_schools_df = schools_df.rename(columns={"name":"school"})
renamed_schools_df.head()

Unnamed: 0,School ID,school,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [6]:
# Merge two dataframes using an outer join
students_schools_df = pd.merge(students_df, renamed_schools_df, on="school", how="outer")
students_schools_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,School ID,type,size,budget
0,0.0,Paul Bradley,M,9th,Huang High School,66.0,79.0,0,District,2917,1910635
1,1.0,Victor Smith,M,12th,Huang High School,94.0,61.0,0,District,2917,1910635
2,2.0,Kevin Rodriguez,M,12th,Huang High School,90.0,60.0,0,District,2917,1910635
3,3.0,Dr. Richard Scott,M,12th,Huang High School,67.0,58.0,0,District,2917,1910635
4,4.0,Bonnie Ray,F,9th,Huang High School,97.0,84.0,0,District,2917,1910635


In [7]:
# the school ID is 0; clean it up
reduced_combined_df = students_schools_df[["Student ID", "name", "gender", "grade",
                       "school", "reading_score", "math_score", "type", "size", "budget"]]
reduced_combined_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,type,size,budget
0,0.0,Paul Bradley,M,9th,Huang High School,66.0,79.0,District,2917,1910635
1,1.0,Victor Smith,M,12th,Huang High School,94.0,61.0,District,2917,1910635
2,2.0,Kevin Rodriguez,M,12th,Huang High School,90.0,60.0,District,2917,1910635
3,3.0,Dr. Richard Scott,M,12th,Huang High School,67.0,58.0,District,2917,1910635
4,4.0,Bonnie Ray,F,9th,Huang High School,97.0,84.0,District,2917,1910635


In [10]:
reduced_combined_df.columns


Index(['Student ID', 'name', 'gender', 'grade', 'school', 'reading_score',
       'math_score', 'type', 'size', 'budget'],
      dtype='object')

In [11]:
# school summary
school_summary = reduced_combined_df.groupby('school').sum()
school_summary

Unnamed: 0_level_0,Student ID,reading_score,math_score,size,budget
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bailey High School,0.0,0.0,0.0,4976,3124928
Cabrera High School,0.0,0.0,0.0,1858,1081356
Figueroa High School,0.0,0.0,0.0,2949,1884411
Ford High School,0.0,0.0,0.0,2739,1763916
Griffin High School,0.0,0.0,0.0,1468,917500
Hernandez High School,0.0,0.0,0.0,4635,3022020
Holden High School,0.0,0.0,0.0,427,248087
Huang High School,55.0,967.0,826.0,32087,21016985
Johnson High School,0.0,0.0,0.0,4761,3094650
Pena High School,0.0,0.0,0.0,962,585858
