In [104]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from io import StringIO

In [105]:
# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_df = pd.read_csv(school_data_to_load)
student_df = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_student_df = pd.merge(student_df, school_df, how="left", on=["school_name", "school_name"])

In [106]:
school_student_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [7]:

# Combination of two files in one DataFrame
#school_student_df.head(10)
school_student_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
School ID         int64
type             object
size              int64
budget            int64
dtype: object

In [109]:
#Calculate the total number of schools
total_schools=school_student_df['school_name'].value_counts()
num_total_schools= len(school_student_df['school_name'].value_counts())
num_total_schools

39170

In [9]:
#Calculate the total number of students
total_students=sum(school_student_df['student_name'].value_counts())
total_students


39170

In [10]:
#Calculate the total budget per school
total_budget= school_df["budget"].sum()
total_budget


24649428

In [11]:
#Calculate the average math score 
average_math_score= round(school_student_df["math_score"].mean(),6)
average_math_score

78.985371

In [12]:
#Calculate the average reading score
average_reading_score=round(school_student_df["reading_score"].mean(), 6)
average_reading_score

81.87784

In [13]:
#Calculate the percentage of students with a passing math score
students_pass_math_score= school_student_df.loc[school_student_df["math_score"]>=70].count()["student_name"]
students_pass_math_score

29370

In [14]:
#(70 or greater)Calculate the percentage of students with a passing reading score
perc_stu_pass_math= round((students_pass_math_score/total_students) *100, 6)
perc_stu_pass_math

74.980853

In [15]:
#Calculate the percentage of students with a passing reading score
students_pass_reading_score=school_student_df.loc[school_student_df["reading_score"]>=70].count()["student_name"]
students_pass_reading_score

33610

In [16]:
# Passing Reading (The percentage of students that passed reading.)
perc_stu_pass_reading= round(students_pass_reading_score/total_students *100,6)
perc_stu_pass_reading

85.805463

In [17]:
# Calculate the percentage of students who passed math and reading (% Overall Passing)
perc_pass_read_math=perc_stu_pass_reading + perc_stu_pass_math/total_students*100

perc_pass_read_math

85.99688718432475

In [18]:
# District Summary

# Create a dataframe to hold the above results
dist_summary=pd.DataFrame({
    "Total Schools":[num_total_schools],"Total Students":[total_students],
    "Total Budget":[total_budget],"Average Math Score": [average_math_score],"Average Reading Score": [average_reading_score],
    "Passing Math %": [perc_stu_pass_math],"Passing Reading %": [perc_stu_pass_reading], "Overall Passing %":[perc_pass_read_math]})
dist_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Passing Math %,Passing Reading %,Overall Passing %
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,85.996887


In [19]:
# Improve formatting before outputting
dist_summary["Total Budget"] = dist_summary["Total Budget"].map("${0:,.2f}".format)
dist_summary["Total Students"] = dist_summary["Total Students"].map("{0:,.0f}".format)

In [20]:
dist_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Passing Math %,Passing Reading %,Overall Passing %
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,85.996887


In [40]:
# SCHOOL SUMMARY
# Inspect all columns
school_student_df.columns
school_student_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [85]:
# DataFrame for School
school_in= school_df.set_index("school_name")
school_index=school_in.drop(columns="School ID")
school_index=school_index.rename(columns={"type":"School Type","size":"Total Students","budget":"Total School Budget"})
school_index

Unnamed: 0_level_0,School Type,Total Students,Total School Budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Huang High School,District,2917,1910635
Figueroa High School,District,2949,1884411
Shelton High School,Charter,1761,1056600
Hernandez High School,District,4635,3022020
Griffin High School,Charter,1468,917500
Wilson High School,Charter,2283,1319574
Cabrera High School,Charter,1858,1081356
Bailey High School,District,4976,3124928
Holden High School,Charter,427,248087
Pena High School,Charter,962,585858


In [112]:
#  DataFrame for Students

student_index=student_df.set_index("school_name") 
avrg_reading_score=student_index.groupby(["reading_score"]).mean()
avrg_reading_score.head(15)

stud_index_mean= stud_index["reading_score"].mean()
student_index.drop(columns=["Student ID","gender","grade"])
student_index=stud_index["reading_score"].mean()
student_index.head()



school_name
Bailey High School      81.033963
Cabrera High School     83.975780
Figueroa High School    81.158020
Ford High School        80.746258
Griffin High School     83.816757
Name: reading_score, dtype: float64

In [83]:

# Per student budget
per_stu_budget= school_student_df.groupby(["student_name","budget"]).sum()
per_stu_budget.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Student ID,reading_score,math_score,School ID,size
student_name,budget,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aaron Acosta,1763916,36194,99,95,13,2739
Aaron Aguilar,3022020,9133,90,60,3,4635
Aaron Anderson,1763916,36126,69,84,13,2739
Aaron Atkinson,3124928,18861,88,77,7,4976
Aaron Bailey,3124928,19812,98,70,7,4976
Aaron Bonilla,1081356,17610,80,77,6,1858
Aaron Booker,1910635,2737,81,64,0,2917
Aaron Boyer,3022020,8336,99,83,3,4635
Aaron Brandt,1884411,5816,89,66,1,2949
Aaron Brown,1763916,35726,85,82,13,2739


In [110]:
school_budget= school_student_df.groupby(["school_name"]).sum()
school_budget.head()


Unnamed: 0_level_0,Student ID,reading_score,math_score,School ID,size,budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,101303896,403225,383393,34832,24760576,15549641728
Cabrera High School,31477307,156027,154329,11148,3452164,2009159448
Figueroa High School,12949059,239335,226223,2949,8696601,5557128039
Ford High School,99055935,221164,211184,35607,7502121,4831365924
Griffin High School,19077394,123043,122360,5872,2155024,1346890000


In [30]:
student_budget= school_student_df.loc[school_student_df[:,"school_name","type"]].value_counts()
student_budget


TypeError: '(slice(None, None, None), 'school_name', 'type')' is an invalid key

In [31]:
school_group= school_df.groupby(["school_name"])
district_df= pd.DataFrame({"School Type":school_df["type"],"Total Students":tol_stud})
district_df.head()


Unnamed: 0,School Type,Total Students
0,District,Paul Bradley
1,District,Victor Smith
2,Charter,Kevin Rodriguez
3,District,Dr. Richard Scott
4,Charter,Bonnie Ray


In [32]:
school_summary=pd.DataFrame({"Total Students":total_students,"Total School Budget":school_budget})#"Per Student Budget":student_budget, "Average Math Score": average_math_score,
   # "Average Reading Score": average_reading_score,
    #" % Passing Math": perc_stu_pass_math,"% Passing Reading": perc_stu_pass_reading, "% Overall Passing":perc_pass_read_math

school_summary.head()

ValueError: If using all scalar values, you must pass an index

In [124]:
index_school=school_student_df.set_index("school_name")
index_school=index_school.drop(["student_name","gender","grade","Student ID"], axis=1)
grouped= index_school.groupby("school_name")
index_school
#index_school_grpuped=index_school.groupby()

AttributeError: 'DataFrameGroupBy' object has no attribute 'value_counts'