In [1]:
#Dependencies and Setup
import pandas as pd
import numpy as np

In [2]:
#Load to files
schools_csv_path = "Resources/schools_complete.csv"
students_csv_path = "Resources/students_complete.csv"

In [3]:
#Read the files and store into dataframes
schools_df = pd.read_csv(schools_csv_path)
students_df = pd.read_csv(students_csv_path)

In [4]:
#combine the data into a single dataset 
combined_df = pd.merge(students_df, schools_df, on="school_name")

In [5]:
# Calculate the total number of backers for all US projects (from kickstarter activity)
total_students = students_df['student_name'].count()

In [6]:
average_math_score = students_df["math_score"].mean()

In [7]:
average_read_score = students_df["reading_score"].mean()

In [8]:
total_budget = schools_df["budget"].sum()

In [9]:
total_schools = schools_df["school_name"].count()

In [10]:
pass_read = combined_df["student_name"].loc[combined_df['reading_score'] >= 70].count()
pass_read_pct = pass_read / total_students * 100

In [11]:
pass_math = combined_df["student_name"].loc[combined_df["math_score"] >= 70].count()
pass_math_pct = pass_math / total_students * 100

In [12]:
pass_all = combined_df["student_name"].loc[(combined_df["math_score"] >= 70) & (combined_df["reading_score"] >= 70)].count()
overall_pass = pass_all / total_students * 100

In [13]:
#create a (data frame) summary with Total Schools, Total Students, Total Budget, Average Math Score, Average Reading Score, %Passing Math, %Passing Reading %Overall Passing
summary_schools_df = pd.DataFrame({"Total Schools": [total_schools], 
                                   "Total Students": total_students, 
                                   "Total Budget": total_budget, 
                                   "Average Math Score": average_math_score,
                                   "Average Reading Score": average_read_score,
                                   "% Passing Math": pass_math_pct,"% Passing Reading": pass_read_pct, 
                                   "% Overall Passing": overall_pass})
summary_schools_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [14]:
#add boolean series for passing reading and passing math 
combined_df["pass_math"] = combined_df["student_name"].loc[combined_df["math_score"] >= 70]
combined_df["pass_reading"] = combined_df["student_name"].loc[combined_df["math_score"]>= 70]
combined_df["pass_both"] = combined_df["student_name"].loc[(combined_df["math_score"] >= 70) & (combined_df["reading_score"] >= 70)]
combined_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass_math,pass_reading,pass_both
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,Paul Bradley,Paul Bradley,
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,,,
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,,,
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,,,
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,Bonnie Ray,Bonnie Ray,Bonnie Ray
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,Donna Howard,Donna Howard,Donna Howard
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,Dawn Bell,Dawn Bell,Dawn Bell
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,Rebecca Tanner,Rebecca Tanner,Rebecca Tanner
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,Desiree Kidd,Desiree Kidd,Desiree Kidd


In [15]:
#Create a dataframe with metrics about each school it should groupby school
school_group = combined_df.groupby(["school_name"])
school_group_df = school_group.sum()
school_group_df = school_group_df[["reading_score","math_score"]]

In [16]:
stats_group = combined_df.groupby(["school_name"])
stats_group_df = stats_group.count()

In [17]:
#drop student ID, gender, grade, reading score, math score, school ID, type
stats_group_df = stats_group_df[["pass_math","pass_reading","pass_both"]]

In [18]:
passing_df = pd.merge(stats_group_df, school_group_df, on="school_name")

In [19]:
best_worst_df = pd.merge(passing_df, schools_df, on="school_name")
best_worst_df

Unnamed: 0,school_name,pass_math,pass_reading,pass_both,reading_score,math_score,School ID,type,size,budget
0,Bailey High School,3318,3318,2719,403225,383393,7,District,4976,3124928
1,Cabrera High School,1749,1749,1697,156027,154329,6,Charter,1858,1081356
2,Figueroa High School,1946,1946,1569,239335,226223,1,District,2949,1884411
3,Ford High School,1871,1871,1487,221164,211184,13,District,2739,1763916
4,Griffin High School,1371,1371,1330,123043,122360,4,Charter,1468,917500
5,Hernandez High School,3094,3094,2481,375131,358238,3,District,4635,3022020
6,Holden High School,395,395,381,35789,35784,8,Charter,427,248087
7,Huang High School,1916,1916,1561,236810,223528,0,District,2917,1910635
8,Johnson High School,3145,3145,2549,385481,366942,12,District,4761,3094650
9,Pena High School,910,910,871,80851,80654,9,Charter,962,585858


In [20]:
#add column with per student budget
best_worst_df["per_student_budget"] = best_worst_df["budget"] / best_worst_df["size"]

In [21]:
#add column with average math score
best_worst_df["average_math"] = best_worst_df["math_score"] / best_worst_df["size"]

In [22]:
#add column with average reading scores
best_worst_df["average_reading"] = best_worst_df["reading_score"] / best_worst_df["size"]

In [23]:
#column with % passing reading
best_worst_df["pass_reading_pct"] = best_worst_df["pass_reading"] / best_worst_df["size"] *100

In [24]:
#add column with % passing math
best_worst_df["pass_math_pct"] = best_worst_df["pass_math"] / best_worst_df["size"] *100

In [25]:
#add column with % passing both
best_worst_df["pass_both_pct"] = best_worst_df["pass_both"] / best_worst_df["size"] *100

In [26]:
#rename headers School Name, School Type, Total Students, Total School Budget, Per Student Budget
#Average Math Score, Average Reading Score, % Passing Math (The percentage of students that passed math.)
#% Passing Reading (The percentage of students that passed reading.), % Overall Passing (The percentage of students that passed math **and** reading.)
rename_best_worst_df = best_worst_df.rename(columns={"school_name":"School Name",
                                              "size":"Total Students", 
                                              "budget":"Total School Budget", 
                                              "per_student_budget":"Per Student Budget",
                                              "average_math":"Average Math Score", 
                                              "average_reading":"Average Reading Score",
                                              "pass_math_pct":"% Passing Math",
                                              "pass_reading_pct":"% Passing Reading", 
                                              "pass_both_pct":"% Overall Passing",
                                             "type":"School Type"})

In [28]:
#organize the columns to match the order
final_best_worst_df = rename_best_worst_df[["School Name", "School Type", 
                                             "Total Students", "Total School Budget", 
                                             "Per Student Budget", "Average Math Score", 
                                             "Average Reading Score", "% Passing Math",
                                             "% Passing Reading","% Overall Passing"]]

In [34]:
#to sort from highest to lowest ascending=False must be passed in
best_df = final_best_worst_df.sort_values("% Overall Passing",ascending=False)
best_df.head(5)

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
1,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,94.133477,91.334769
12,Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,93.272171,93.272171,90.948012
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,93.392371,90.599455
13,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,93.867718,90.582567
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,94.594595,90.540541


In [None]:
#sort the schools by ascending order and display the top five bottom performing schools by overall passing
worst_df = final_best_worst_df.sort_values("% Overall Passing",ascending=True)
worst_df.head(5)

In [None]:
#create a data frame that lists the average reading score for students of each grade level (use bins) grade levels are 9th,10th, 11th, and 12th -- index by school

#create a pandas series for each grade (use a conditional statement)

#combine series into a dataframe


In [None]:
#create a data frame that lists the average math score for students of each grade level (use bins) grade levels are 9th,10th, 11th, and 12th -- index by school

#create a pandas series for each grade (use a conditional statement)

#combine series into a dataframe


In [None]:
#create a dataframe that breaks down school performance based on average spending ranges (per student) -- use four reasonable bins to group schools by spending.

#a table should include the following: Spending Ranges (Per Student), Average Math Score, Average Raading Score, % Passing Math, %Passing Reading, %Overall Passing


In [None]:
#create a dataframe that breaks down school performance based on school size -- use four reasonable bins to group schools.

#a table should include the following: Spending Ranges (Per Student), Average Math Score, Average Raading Score, % Passing Math, %Passing Reading, %Overall Passing


In [None]:
#create a dataframe that breaks down school performance based on school type (charter or district)

#a table should include the following: Spending Ranges (Per Student), Average Math Score, Average Raading Score, % Passing Math, %Passing Reading, %Overall Passing
