In [3]:
# Dependencies and Setup
import pandas as pd
import os


In [4]:
# Create variables with the paths
school_data_path = os.path.join("Resources", "schools.csv")
student_data_path = os.path.join("Resources", "students.csv")

In [5]:
# Read School and Student Data File and store into Pandas DataFrames
school_data_df = pd.read_csv(school_data_path)
student_data_df = pd.read_csv(student_data_path)

In [6]:
# Combine the data into a single dataset.  
school_data_complete_df = pd.merge(student_data_df, school_data_df, how="left", on=["school_name", "school_name"])
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [7]:
#Clean Data

#Identify incomplete rows
school_data_complete_df.count()
#There do not appear to be any incomplete rows

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
School ID        39170
type             39170
size             39170
budget           39170
dtype: int64

In [8]:
#All rows appear to be relevant
school_data_complete_df["type"].unique

<bound method Series.unique of 0        District
1        District
2        District
3        District
4        District
           ...   
39165     Charter
39166     Charter
39167     Charter
39168     Charter
39169     Charter
Name: type, Length: 39170, dtype: object>

In [9]:
#Check data types
school_data_complete_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
School ID         int64
type             object
size              int64
budget            int64
dtype: object

In [10]:
#Calculate the total number of schools in the district
school_data_complete_df["school_name"].value_counts()

Bailey High School       4976
Johnson High School      4761
Hernandez High School    4635
Rodriguez High School    3999
Figueroa High School     2949
Huang High School        2917
Ford High School         2739
Wilson High School       2283
Cabrera High School      1858
Wright High School       1800
Shelton High School      1761
Thomas High School       1635
Griffin High School      1468
Pena High School          962
Holden High School        427
Name: school_name, dtype: int64

In [11]:
#Calculate the total number of schools in the district
total_number_of_schools=len(school_data_complete_df["school_name"].unique())
total_number_of_schools

15

In [13]:
#Calculate the total number of students in the district
total_number_of_students=school_data_complete_df["student_name"].count()
total_number_of_students


39170

In [14]:
#Calculate the total budget of the district
unique_budgets=school_data_complete_df["budget"].unique()
total_budget=unique_budgets.sum()
total_budget

24649428

In [15]:
#Calculate the average math score in the district
average_math_score=school_data_complete_df["math_score"].mean()
average_math_score

78.98537145774827

In [16]:
#Calculate the average reading score in the district
average_reading_score=school_data_complete_df["reading_score"].mean()
average_reading_score

81.87784018381414

In [32]:
#Calculate the percentage of students in the district passing math (score of 70 or higher)
number_passing_math=school_data_complete_df["math_score"]>=70
percent_passing_math=number_passing_math.sum()/total_number_of_students
percent_passing_math

0.749808526933878

In [33]:
#Calculate the percentage of students in the district passing reading (score of 70 or higher)
number_passing_reading=school_data_complete_df["reading_score"]>=70
percent_passing_reading=number_passing_reading.sum()/total_number_of_students
percent_passing_reading

0.8580546336482001

In [309]:
#Calculate the percentage of students passing both math and reading
reading_and_math_df = school_data_complete_df[["reading_score", "math_score"]]
criteria_1=reading_and_math_df["reading_score"]>=70
criteria_2=reading_and_math_df["math_score"]>=70
all_criteria=criteria_1 & criteria_2
number_passing_math_and_reading=all_criteria.sum()/total_number_of_students


25528

In [46]:
#Create a dataframe to hold the above results
district_summary_df=pd.DataFrame({"Total Schools":[total_number_of_schools], "Total Students":[total_number_of_students],
                                  "Total Budget":[total_budget], "Average Math Score":[average_math_score], "Average Reading Score":[average_reading_score],
                                 "Percent Passing Math":[percent_passing_math], "Overall Passing Percentage":[number_passing_math_and_reading]})
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Percent Passing Math,Overall Passing Percentage
0,15,39170,24649428,78.985371,81.87784,0.749809,0.651723


In [131]:
#create an overview table that summarizes key metrics about each school

#edit the school_data_complete_df dataframe so it is grouped by "school name"
grouped_by_school_group=school_data_complete_df.groupby(["school_name"])


In [132]:
#determine the type of each school
grouped_by_school_group["type"].unique()

school_name
Bailey High School       [District]
Cabrera High School       [Charter]
Figueroa High School     [District]
Ford High School         [District]
Griffin High School       [Charter]
Hernandez High School    [District]
Holden High School        [Charter]
Huang High School        [District]
Johnson High School      [District]
Pena High School          [Charter]
Rodriguez High School    [District]
Shelton High School       [Charter]
Thomas High School        [Charter]
Wilson High School        [Charter]
Wright High School        [Charter]
Name: type, dtype: object

In [290]:
#determine the total students at each school
students_per_school=grouped_by_school_group["student_name"].count()

In [134]:
budget_by_school=pd.DataFrame(grouped_by_school_group["budget"].unique())
budget_by_school_int=budget_by_school["budget"].astype("int64")
budget_by_school_int

school_name
Bailey High School       3124928
Cabrera High School      1081356
Figueroa High School     1884411
Ford High School         1763916
Griffin High School       917500
Hernandez High School    3022020
Holden High School        248087
Huang High School        1910635
Johnson High School      3094650
Pena High School          585858
Rodriguez High School    2547363
Shelton High School      1056600
Thomas High School       1043130
Wilson High School       1319574
Wright High School       1049400
Name: budget, dtype: int64

In [111]:
per_student_budget=budget_by_school_int/students_per_school
per_student_budget

school_name
Bailey High School       628.0
Cabrera High School      582.0
Figueroa High School     639.0
Ford High School         644.0
Griffin High School      625.0
Hernandez High School    652.0
Holden High School       581.0
Huang High School        655.0
Johnson High School      650.0
Pena High School         609.0
Rodriguez High School    637.0
Shelton High School      600.0
Thomas High School       638.0
Wilson High School       578.0
Wright High School       583.0
dtype: float64

In [135]:
#Calculate the average math score by school
average_math_score_by_school=grouped_by_school_group["math_score"].mean()


In [136]:
average_reading_score_grouped_by_school=grouped_by_school_group["reading_score"].mean()

In [211]:
passing_math_scores_by_district=school_data_complete_df.loc[school_data_complete_df["math_score"]>=70, :]
#passing_reading_scores_by_district=school_data_complete_df.loc[school_data_complete_df["reading_score"]>=70, :]
passing_math_scores_by_school_groupby=passing_math_scores_by_district.groupby(["school_name"])
number_of_students_passing_math_per_school=passing_math_scores_by_school_groupby["math_score"].count()
#passing_reading_scores_by_school_groupby=passing_math_scores_by_district.groupby(["school_name"])
percent_of_students_passing_math_per_school=number_of_students_passing_math_per_school/students_per_school
percent_of_students_passing_math_per_school

school_name
Bailey High School       0.666801
Cabrera High School      0.941335
Figueroa High School     0.659885
Ford High School         0.683096
Griffin High School      0.933924
Hernandez High School    0.667530
Holden High School       0.925059
Huang High School        0.656839
Johnson High School      0.660576
Pena High School         0.945946
Rodriguez High School    0.663666
Shelton High School      0.938671
Thomas High School       0.932722
Wilson High School       0.938677
Wright High School       0.933333
dtype: float64

In [213]:
passing_reading_scores_by_district=school_data_complete_df.loc[school_data_complete_df["reading_score"]>=70, :]
passing_reading_scores_by_school_groupby=passing_reading_scores_by_district.groupby(["school_name"])
number_of_students_passing_reading_per_school=passing_reading_scores_by_school_groupby["reading_score"].count()
percent_of_students_passing_reading_per_school=number_of_students_passing_reading_per_school/students_per_school
percent_of_students_passing_reading_per_school

school_name
Bailey High School       0.819333
Cabrera High School      0.970398
Figueroa High School     0.807392
Ford High School         0.792990
Griffin High School      0.971390
Hernandez High School    0.808630
Holden High School       0.962529
Huang High School        0.813164
Johnson High School      0.812224
Pena High School         0.959459
Rodriguez High School    0.802201
Shelton High School      0.958546
Thomas High School       0.973089
Wilson High School       0.965396
Wright High School       0.966111
dtype: float64

In [229]:
school_data_complete_df.columns


Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score', 'School ID', 'type', 'size', 'budget'],
      dtype='object')

In [310]:
math_and_reading_scores_df= school_data_complete_df[["school_name","math_score","reading_score", "Student ID"]]
math_scores_df=school_data_complete_df[["school_name","math_score"]]
reading_scores_df=school_data_complete_df[["school_name", "math_score"]]
math_and_reading_scores_df

Unnamed: 0,school_name,math_score,reading_score,Student ID
0,Huang High School,79,66,0
1,Huang High School,61,94,1
2,Huang High School,60,90,2
3,Huang High School,58,67,3
4,Huang High School,84,97,4
...,...,...,...,...
39165,Thomas High School,90,99,39165
39166,Thomas High School,70,95,39166
39167,Thomas High School,84,73,39167
39168,Thomas High School,90,99,39168


In [326]:
number_of_students_passing_both_district_level=math_and_reading_scores_df.loc[(
math_and_reading_scores_df["math_score"]>70) & (math_and_reading_scores_df["reading_score"]>=70),
["school_name","Student ID"]]
number_of_students_passing_both_district_level.groupby(["school_name"]).count()


Unnamed: 0_level_0,Student ID
school_name,Unnamed: 1_level_1
Bailey High School,2629
Cabrera High School,1615
Figueroa High School,1517
Ford High School,1436
Griffin High School,1278
Hernandez High School,2403
Holden High School,373
Huang High School,1505
Johnson High School,2460
Pena High School,845


Unnamed: 0_level_0,math_score,reading_score,Student ID
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,2629,2629,2629
Cabrera High School,1615,1615,1615
Figueroa High School,1517,1517,1517
Ford High School,1436,1436,1436
Griffin High School,1278,1278,1278
Hernandez High School,2403,2403,2403
Holden High School,373,373,373
Huang High School,1505,1505,1505
Johnson High School,2460,2460,2460
Pena High School,845,845,845
