In [1]:
# This pandas script reads two input files, one containing district wide information, the other detailed student info.
# Data in the input file are assumed to be csv format.
# The input file is assumed to be in the a sub-directory called Resoures.
#
# The code for each report is contained in a single cell and includes comments that detail its logic and operations.
#
# At a high level, each of these cells defines a data frame corresponding to the report it produces. The elements of these
#   data frames are calculated/filled-in as expalined in the comments throughout this script.
#

In [2]:
# Import the Pandas library
import pandas as pd

In [3]:
# Create a reference the CSV file desired
csv_path1 = "Resources/schools_complete.csv"
csv_path2 = "Resources/students_complete.csv"

# Read the CSV into a Pandas DataFrame (df)
district_df = pd.read_csv(csv_path1)
students_df = pd.read_csv(csv_path2)

# Rename the common column for the data frames (df) for clarity and consistency
district_df=district_df.rename(columns={"name":"school_name"})
students_df=students_df.rename(columns={"school":"school_name"})
# Print the first five rows of data to the screen
district_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
students_df.head()

Unnamed: 0,Student ID,name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [5]:
# This first report produces a distric-level report which includes:
#  Total schools, total students, total budget, average math and reading scores, % passing math and reading, % overall passing
#
# Start with collecting district-wide stats using count() and sum() methods for the two main data frames:
num_schools = district_df["School ID"].count()
num_students = district_df["size"].sum()
total_budget = district_df["budget"].sum()
avg_reading_score = students_df["reading_score"].mean()
avg_math_score = students_df["math_score"].mean()

# Establish pass/fail bins for math and reading scores to gather and calculate pass/fail rate info.
#
# Start with binning math related data, collect total count of path/fail scores and determine % passing rate:
#
bins = [0, 69, 100]
bins_lables = ["Fail", "Pass"]
students_df['math_status'] = pd.cut(students_df['math_score'], bins, labels=bins_lables)
pf_counts_math = students_df['math_status'].value_counts()
# fail_count_math = pf_counts_math.iloc[1]
pass_count_math = pf_counts_math.iloc[0]             # passing rate is the first element (idx 0) returned by value_counts
pass_rate_math = pass_count_math/num_students * 100

# Do the same for reading scores
students_df['reading_status'] = pd.cut(students_df['reading_score'], bins, labels=bins_lables)
pf_counts_reading = students_df['reading_status'].value_counts()
#pf_counts_math, pf_counts_reading
#fail_count_reading=pf_counts_reading.iloc[1]
pass_count_reading = pf_counts_reading.iloc[0]             # passing rate is the first element (idx 0) returned by value_counts
pass_rate_reading = pass_count_reading/num_students * 100

# Now build the report data frame using the info compiled above:
district_summary = pd.DataFrame({"Total Schools" : [num_schools],
                               "Total Students" : [num_students],
                               "Total Budget" : [total_budget],
                               "Average Math Score" : [avg_math_score],
                               "Average Reading Score" : [avg_reading_score],
                               "% Passing Math" : [pass_rate_math],
                               "% Passing Reading" : [pass_rate_reading],
                               "% Overall Passing Rate" : [(pass_rate_math+pass_rate_reading)/2]
                               })
district_summary = district_summary[['Total Schools','Total Students','Total Budget','Average Math Score','Average Reading Score','% Passing Math','% Passing Reading','% Overall Passing Rate']]
district_summary.head()

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,80.393158


In [6]:
# This code cell produces a school-level report, including:
#   School name, type, budget, total students, per student budget, average math & reading scores, % passing math, reading & overall passing

# Create the school summary report df by copying a subset of the district level df and adding additional columns which
#   contain info calculated and gathered from various other data frames.
#
school_summary_df = district_df[['school_name','type','size','budget']]

# 
school_summary_df['Budget per Stdnt'] = district_df['budget'] / district_df['size']

# Start w/ grouping student data by school so sum() and count() methods can be used to calculate various school level stats
school_group = students_df.groupby("school_name")
math_avg = school_group['math_score'].sum() / school_group['math_score'].count()
reading_avg = school_group['reading_score'].sum() / school_group['reading_score'].count()
school_summary_df['Average math score'] = math_avg.values
school_summary_df['Average reading score'] = reading_avg.values
# The two lists below will contain total fail/pass counts for math and reading, respectively
school_pf_counts_math = school_group['math_status'].value_counts(normalize=True)      # normalize effectively returns percent value
school_pf_counts_reading=school_group['reading_status'].value_counts(normalize=True)

# Establish three lists to be used for calculating math, reading and overall passing rates
school_prate_math = []
school_prate_reading = []
school_prate_overall = []
# Fill the arrays by noting that the total passing count for each school is every other element of the pf lists built above
#   since there are 15 schools, start with idx 0 and increment by 2 (ever other element)
for i in range(0,30,2):
    school_prate_math.append(school_pf_counts_math.iloc[i]*100)
    school_prate_reading.append(school_pf_counts_reading.iloc[i]*100)
    school_prate_overall.append((school_pf_counts_reading.iloc[i]*100 + school_pf_counts_math.iloc[i]*100) / 2)

# Finally, populate the school summary report df using lists built above
school_summary_df['% Passing Math'] = school_prate_math
school_summary_df['% Passing Reading'] = school_prate_reading
school_summary_df['% Overall Passing'] = school_prate_overall

# Set the index for the report df to schools
schools = district_df['school_name'].unique()
school_summary_df.set_index(schools, drop=True, inplace=True)
# Select and set the order of columns, to match the sample file:
school_summary_df = school_summary_df [['type', 'size', 'budget', 'Budget per Stdnt', 'Average math score', 'Average reading score',
                                  '% Passing Math', '% Passing Reading', '% Overall Passing']]
school_summary_df.head(15)

Unnamed: 0,type,size,budget,Budget per Stdnt,Average math score,Average reading score,% Passing Math,% Passing Reading,% Overall Passing
Huang High School,District,2917,1910635,655.0,77.048432,81.033963,66.680064,81.93328,74.306672
Figueroa High School,District,2949,1884411,639.0,83.061895,83.97578,94.133477,97.039828,95.586652
Shelton High School,Charter,1761,1056600,600.0,76.711767,81.15802,65.988471,80.739234,73.363852
Hernandez High School,District,4635,3022020,652.0,77.102592,80.746258,68.309602,79.299014,73.804308
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
Wilson High School,Charter,2283,1319574,578.0,77.289752,80.934412,66.752967,80.862999,73.807983
Cabrera High School,Charter,1858,1081356,582.0,83.803279,83.814988,92.505855,96.252927,94.379391
Bailey High School,District,4976,3124928,628.0,76.629414,81.182722,65.683922,81.316421,73.500171
Holden High School,Charter,427,248087,581.0,77.072464,80.966394,66.057551,81.222432,73.639992
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027


In [7]:
# Generate report of the top 5 schools based on Overall Passing Rate; the output contains same elements as the report above.
#
# Simply sort the school summary df in descending order
school_summary_df.sort_values(by="% Overall Passing", ascending=False).head()

Unnamed: 0,type,size,budget,Budget per Stdnt,Average math score,Average reading score,% Passing Math,% Passing Reading,% Overall Passing
Figueroa High School,District,2949,1884411,639.0,83.061895,83.97578,94.133477,97.039828,95.586652
Johnson High School,District,4761,3094650,650.0,83.418349,83.84893,93.272171,97.308869,95.29052
Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027
Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
Ford High School,District,2739,1763916,644.0,83.274201,83.989488,93.867718,96.539641,95.203679


In [8]:
# Generate report of the bottom 5 schools based on Overall Passing Rate; the output contains same elements as the report above.
#
# Simply sort the school summary df in ascending order
school_summary_df.sort_values(by="% Overall Passing").head()

Unnamed: 0,type,size,budget,Budget per Stdnt,Average math score,Average reading score,% Passing Math,% Passing Reading,% Overall Passing
Wright High School,Charter,1800,1049400,583.0,76.842711,80.744686,66.366592,80.220055,73.293323
Shelton High School,Charter,1761,1056600,600.0,76.711767,81.15802,65.988471,80.739234,73.363852
Bailey High School,District,4976,3124928,628.0,76.629414,81.182722,65.683922,81.316421,73.500171
Holden High School,Charter,427,248087,581.0,77.072464,80.966394,66.057551,81.222432,73.639992
Hernandez High School,District,4635,3022020,652.0,77.102592,80.746258,68.309602,79.299014,73.804308


In [9]:
# This code cell produces a report containing average math scores for students of each grade level at each school.
#
# Start with creating a list of all 9th grade students by looking them up in the main student df:
grade_grp_9th = students_df[students_df['grade'] == '9th']
# Group this list by school, and store the average math score in the list using mean() method 
school_grp_9th = grade_grp_9th.groupby(['school_name']).mean()['math_score']
# Now creat the report df using the list above, as a column:
grade_df = pd.DataFrame({'9th' : school_grp_9th})

# To fill the rest, repeat the steps above while looping on 10th, 11th and 12th literals
for i in '10th', '11th', '12th':
    i = pd.DataFrame({i : students_df[students_df['grade'] == i].groupby(['school_name']).mean()['math_score']})
    grade_df = grade_df.join(i)
grade_df.head(15)

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


In [10]:
# This code cell produces a report containing average reading scores for students of each grade level at each school.
#
# Start with creating a list of all 9th grade students by looking them up in the main student df:
grade_grp_9th = students_df[students_df['grade'] == '9th']
# Group this list by school, and store the average reading score in the list using mean() method 
school_grp_9th = grade_grp_9th.groupby(['school_name']).mean()['reading_score']
# Now creat the report df using the list above, as a column:
grade_df = pd.DataFrame({'9th' : school_grp_9th})

# To fill the rest, repeat the steps above while looping on 10th, 11th and 12th literals.
# In each iteration, a df is created and then appended to the report df using joi() method:
for i in '10th', '11th', '12th':
    i = pd.DataFrame({i : students_df[students_df['grade'] == i].groupby(['school_name']).mean()['reading_score']})
    grade_df = grade_df.join(i)
grade_df.head(15)

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


In [11]:
# This code cell produces a break down of school performance based on average per-student spending.
#
# Start with establishing spending bins, and adding a new column to the scholl summary df, reflecting spending bin for each school:
spending_cats = [0, 595, 615, 635, 660]
spending_labels = ['<$595', '\$595-$614', '\$615-$634', '\$635-$660']
school_summary_df['spending_cat'] = pd.cut(school_summary_df['Budget per Stdnt'], spending_cats, labels=spending_labels)

# Select and set the order of columns, to match the sample file:
reduced_school_summary = school_summary_df [['spending_cat','Average math score', 'Average reading score', '% Passing Math',
                                          '% Passing Reading', '% Overall Passing']]
scores_by_spending = reduced_school_summary.groupby('spending_cat')
scores_by_spending.mean().head()

Unnamed: 0_level_0,Average math score,Average reading score,% Passing Math,% Passing Reading,% Overall Passing
spending_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$595,78.752051,81.61512,72.920741,84.639603,78.780172
\$595-$614,80.275842,82.601359,80.291533,88.34259,84.317061
\$615-$634,79.990456,82.49974,79.538146,89.227693,84.382919
\$635-$660,81.563878,83.039306,86.20907,92.083767,89.146418


In [12]:
# This code cell produces a break down of school performance based on school size.
#
# Start with establishing size bins, and adding a new column to the scholl summary df, reflecting size bin for each school:
size_cats = [0, 1500, 3000, 5000]
size_labels = ['Small (<1500)', 'Medium (1500-3000)', 'Large (3000-5000)']
school_summary_df['size_cat'] = pd.cut(school_summary_df['size'], size_cats, labels=size_labels)
reduced_school_summary = school_summary_df [['size_cat','Average math score', 'Average reading score', '% Passing Math',
                                          '% Passing Reading', '% Overall Passing']]
scores_by_size = reduced_school_summary.groupby('size_cat')
scores_by_size.mean().head()

Unnamed: 0_level_0,Average math score,Average reading score,% Passing Math,% Passing Reading,% Overall Passing
size_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1500),81.421293,82.942617,84.681505,91.435781,88.058643
Medium (1500-3000),80.214282,82.450792,79.95356,88.774884,84.364222
Large (3000-5000),80.127452,82.375908,80.283204,88.444733,84.363968


In [13]:
# This code cell produces a break down of school performance based on school type.
#
# Start with a copy of the school summary df containing selct columns to match the format of sample file:
reduced_school_summary2 = school_summary_df [['type','Average math score', 'Average reading score', '% Passing Math',
                                          '% Passing Reading', '% Overall Passing']]
# Now simply group this df and use the mean() method to calculate average overall passing rate for each type:
scores_by_type = reduced_school_summary2.groupby('type')
scores_by_type.mean().head()

Unnamed: 0_level_0,Average math score,Average reading score,% Passing Math,% Passing Reading,% Overall Passing
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,80.324201,82.429369,79.873967,88.624209,84.249088
District,80.556334,82.643266,82.259154,89.898811,86.078983
