In [90]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from io import StringIO

In [91]:
# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_df = pd.read_csv(school_data_to_load)
student_df = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_student_df = pd.merge(student_df, school_df, how="left", on=["school_name", "school_name"])

In [92]:
#Dist Summary
school_student_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [93]:
#Calculate the total number of schools
total_schools=school_student_df['school_name'].value_counts()
num_total_schools= len(school_student_df['school_name'].value_counts())
num_total_schools


15

In [94]:
#Calculate the total number of students
total_students=sum(school_student_df['student_name'].value_counts())
total_students

39170

In [95]:
#Calculate the total budget per school
total_budget= school_df["budget"].sum()
total_budget


24649428

In [96]:
#Calculate the average math score 
average_math_score= round(school_student_df["math_score"].mean(),6)
average_math_score

78.985371

In [97]:
#Calculate the average reading score
average_reading_score=round(school_student_df["reading_score"].mean(), 6)
average_reading_score

81.87784

In [98]:
#Calculate the percentage of students with a passing math score
students_pass_math_score= school_student_df.loc[school_student_df["math_score"]>=70].count()["student_name"]
students_pass_math_score

29370

In [99]:
#(70 or greater)Calculate the percentage of students with a passing reading score
perc_stu_pass_math= round((students_pass_math_score/total_students) *100, 6)
perc_stu_pass_math

74.980853

In [100]:
#Calculate the percentage of students with a passing reading score
students_pass_reading_score=school_student_df.loc[school_student_df["reading_score"]>=70].count()["student_name"]
students_pass_reading_score

33610

In [101]:
# Passing Reading (The percentage of students that passed reading.)
perc_stu_pass_reading= round(students_pass_reading_score/total_students *100,6)
perc_stu_pass_reading

85.805463

In [102]:
# Calculate the percentage of students who passed math and reading (% Overall Passing)
perc_pass_read_math=perc_stu_pass_reading + perc_stu_pass_math/total_students*100
perc_pass_read_math

85.99688718432475

In [106]:
# District Summary

# Create a dataframe to hold the above results
dist_summary=pd.DataFrame({
    "Total Schools":[num_total_schools],"Total Students":[total_students],
    "Total Budget":[total_budget],"Average Math Score": [average_math_score],"Average Reading Score": [average_reading_score],
    "Passing Math %": [perc_stu_pass_math],"Passing Reading %": [perc_stu_pass_reading], "Overall Passing %":[perc_pass_read_math]})
dist_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Passing Math %,Passing Reading %,Overall Passing %
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,85.996887


In [107]:
# Improve formatting before outputting
dist_summary["Total Budget"] = dist_summary["Total Budget"].map("${0:,.2f}".format)
dist_summary["Total Students"] = dist_summary["Total Students"].map("{0:,.0f}".format)
dist_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Passing Math %,Passing Reading %,Overall Passing %
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,85.996887


In [None]:
# SCHOOL SUMMARY

In [10]:
school_df.count()

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [35]:
student_df.count()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [36]:
school_student_df.columns

Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score', 'School ID', 'type', 'size', 'budget'],
      dtype='object')

In [38]:
# School Summary needs: 'school_name', 'student_name', , 'grade', 'school_name',
#'reading_score', 'math_score', ', 'type', 'size', 'budget']
school_sum= school_student_df[[
    'school_name','reading_score','Student ID',
    'math_score','type', 'size', 'budget']]
school_sum.head()

Unnamed: 0,school_name,student_name,reading_score,Student ID,math_score,type,size,budget
0,Huang High School,Paul Bradley,66,0,79,District,2917,1910635
1,Huang High School,Victor Smith,94,1,61,District,2917,1910635
2,Huang High School,Kevin Rodriguez,90,2,60,District,2917,1910635
3,Huang High School,Dr. Richard Scott,67,3,58,District,2917,1910635
4,Huang High School,Bonnie Ray,97,4,84,District,2917,1910635


In [71]:
# Groupby function on schools
school_sum_grouped= school_sum.groupby(["school_name"])

# School Type (District per school)
school_index=school_df.set_index(["school_name"])
school_type=school_index["type"] 

# Total Budget per School
school_budget=school_sum_grouped["budget"] 

# Per student Budget
student_budget_group= school_sum.groupby(["Student ID"])
student_budget=student_budget_group["budget"]

# Total_Students per school
total_students= school_sum_grouped["Student ID"].count()

# Averages using Groupby on the reading score per school
average_reading_score = school_sum_grouped["reading_score"].mean()

# Averages using Groupby on the math score per school
average_math_score = school_sum_grouped["math_score"].mean()

# Total of students per school
total_budget= school_sum_grouped["budget"].sum()

# % Passing Math
passing_math= school_sum_grouped["math_score"] =>70 /total_students


% Passing Reading
% Overall Passing (The percentage of students that passed math and reading.)


0        1910635
1        1910635
2        1910635
3        1910635
4        1910635
          ...   
37535    1043130
37536    1043130
37537    1043130
37538    1043130
37539    1043130
Name: budget, Length: 75, dtype: int64

In [89]:

passing_math= school_sum_grouped["math_score"] >=70 /total_students

passing_math
#school_student_df.loc[school_student_df["math_score"]>=70].count()["student_name"]


TypeError: '>=' not supported between instances of 'SeriesGroupBy' and 'float'

In [84]:
school_summary_table= pd.DataFrame({"School Type": school_type,"Total Students": total_students,
                                    "Total School Budget": school_budget,"Per Student Budget": student_budget,
                                    "Average Math Score":average_math_score ,"Average Reading Score": average_reading_score,
                                    })
school_summary_table.head()

ValueError: arrays must all be same length

In [None]:
School Name




Average Math Score
Average Reading Score
% Passing Math
% Passing Reading
% Overall Passing (The percentage of students that passed math and reading.)
