### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [160]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

#print out 'school_data' to see what the data looks like before merge
school_data

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500
5,5,Wilson High School,Charter,2283,1319574
6,6,Cabrera High School,Charter,1858,1081356
7,7,Bailey High School,District,4976,3124928
8,8,Holden High School,Charter,427,248087
9,9,Pena High School,Charter,962,585858


In [161]:
#print out student_data
student_data

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90


In [162]:
# Combine the data into a single dataset
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_complete.columns

Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score', 'School ID', 'type', 'size', 'budget'],
      dtype='object')

In [163]:
school_data_complete

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the overall passing rate (overall average score), i.e. (avg. math score + avg. reading score)/2

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [164]:
#calculate total # schools
school_count = len(school_data_complete['school_name'].unique())
school_count

15

In [209]:
#unique school names
unique_schools = (school_data_complete['school_name'].unique())
unique_schools


array(['Huang High School', 'Figueroa High School', 'Shelton High School',
       'Hernandez High School', 'Griffin High School',
       'Wilson High School', 'Cabrera High School', 'Bailey High School',
       'Holden High School', 'Pena High School', 'Wright High School',
       'Rodriguez High School', 'Johnson High School', 'Ford High School',
       'Thomas High School'], dtype=object)

In [166]:
#unique school types
school_types = (school_data_complete['type'].unique())
school_types

array(['District', 'Charter'], dtype=object)

In [167]:
#SCHOOL 1: calculate total # of students for "Huang High School"

#set index to  school_name
school_index = school_data.set_index("school_name")
school_index

Unnamed: 0_level_0,School ID,type,size,budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Huang High School,0,District,2917,1910635
Figueroa High School,1,District,2949,1884411
Shelton High School,2,Charter,1761,1056600
Hernandez High School,3,District,4635,3022020
Griffin High School,4,Charter,1468,917500
Wilson High School,5,Charter,2283,1319574
Cabrera High School,6,Charter,1858,1081356
Bailey High School,7,District,4976,3124928
Holden High School,8,Charter,427,248087
Pena High School,9,Charter,962,585858


In [168]:
#SCHOOL 0: calculate per student budget 
huang_school_budget = school_index.loc["Huang High School", "budget"]
huang_school_budget

1910635

In [169]:
#school 0: set variable for number of students
huang_students = school_index.loc["Huang High School", "size"]
huang_students

2917

In [170]:
# budget per student, for each school
per_student_budget = school_index["budget"]/school_index["size"]
per_student_budget

school_name
Huang High School        655.0
Figueroa High School     639.0
Shelton High School      600.0
Hernandez High School    652.0
Griffin High School      625.0
Wilson High School       578.0
Cabrera High School      582.0
Bailey High School       628.0
Holden High School       581.0
Pena High School         609.0
Wright High School       583.0
Rodriguez High School    637.0
Johnson High School      650.0
Ford High School         644.0
Thomas High School       638.0
dtype: float64

In [171]:
#set index for student_data to school_name
student_data_index = student_data.set_index("school_name")
student_data_index

Unnamed: 0_level_0,Student ID,student_name,gender,grade,reading_score,math_score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Huang High School,0,Paul Bradley,M,9th,66,79
Huang High School,1,Victor Smith,M,12th,94,61
Huang High School,2,Kevin Rodriguez,M,12th,90,60
Huang High School,3,Dr. Richard Scott,M,12th,67,58
Huang High School,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...
Thomas High School,39165,Donna Howard,F,12th,99,90
Thomas High School,39166,Dawn Bell,F,10th,95,70
Thomas High School,39167,Rebecca Tanner,F,9th,73,84
Thomas High School,39168,Desiree Kidd,F,10th,99,90


## School index(0): Huang High School

In [210]:
#name of school

huang_school_name = (school_data_complete['school_name'][0])
huang_school_name

'Huang High School'

In [212]:
#unique school types
huang_school_type = (school_data_complete['type'][0])
huang_school_type

'District'

In [218]:
# budget per student, for Huang High school
huang_per_student_budget = school_index["budget"][0]/school_index["size"][0]
huang_per_student_budget

655.0

In [172]:
#Huang high school MATH scores
huang_math_scores = student_data_index.loc["Huang High School", "math_score"]
huang_math_scores

school_name
Huang High School    79
Huang High School    61
Huang High School    60
Huang High School    58
Huang High School    84
                     ..
Huang High School    95
Huang High School    81
Huang High School    73
Huang High School    85
Huang High School    73
Name: math_score, Length: 2917, dtype: int64

In [173]:
#AVG MATH SCORES for Huang High School
huang_average_math = huang_math_scores.mean()
huang_average_math

76.62941378128214

In [174]:
#Huang high school READING SCORES
huang_reading_scores = student_data_index.loc["Huang High School", "reading_score"]
huang_reading_scores

school_name
Huang High School    66
Huang High School    94
Huang High School    90
Huang High School    67
Huang High School    97
                     ..
Huang High School    98
Huang High School    64
Huang High School    66
Huang High School    70
Huang High School    82
Name: reading_score, Length: 2917, dtype: int64

In [175]:
#AVG READING SCORES for Huang High School
huang_average_reading = huang_reading_scores.mean()
huang_average_reading                       

81.18272197463148

In [176]:
# STUDENTS PASSING MATH for Huang High School
# Student ID	student_name	gender	grade	reading_score	math_score
huang_passing_math_count = student_data.loc[(student_data["math_score"] > 70) & (student_data["school_name"] == "Huang High School"), :]
                                                
huang_passing_math_count

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
8,8,Michael Roth,M,10th,Huang High School,95,87
...,...,...,...,...,...,...,...
2912,2912,Michael Combs,M,9th,Huang High School,98,95
2913,2913,Monica Barajas,F,9th,Huang High School,64,81
2914,2914,Carlos Garner,M,12th,Huang High School,66,73
2915,2915,April Williams,F,12th,Huang High School,70,85


In [177]:
# number of passing math students
huang_num_passing_math_students = huang_passing_math_count.count()
print(huang_num_passing_math_students.unique())

[1847]


In [178]:
#total students at Huang High School with math scores
huang_total_students = school_index.loc["Huang High School","size"]
huang_total_students

2917

In [179]:
# %PASSING MATH (1847/2917)
huang_passing_math_percentage = huang_num_passing_math_students/huang_total_students * 100
print(f"The percentage of students passing math is : {huang_passing_math_percentage.unique()}.")

The percentage of students passing math is : [63.31847789].


In [180]:
## students PASSING READING for Huang High School
# Student ID	student_name	gender	grade	reading_score	math_score
huang_passing_reading_count = student_data.loc[(student_data["reading_score"] > 70) & (student_data["school_name"] == "Huang High School"), :]
                                                
huang_passing_reading_count

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
...,...,...,...,...,...,...,...
2909,2909,Richard Willis,M,10th,Huang High School,94,89
2910,2910,Tiffany Diaz,F,9th,Huang High School,80,77
2911,2911,Justin Evans,M,12th,Huang High School,94,95
2912,2912,Michael Combs,M,9th,Huang High School,98,95


In [181]:
# number of passing reading students
huang_num_passing_reading_students = huang_passing_reading_count.count()
print(huang_num_passing_reading_students.unique())

[2299]


In [182]:
#total students at Huang High School with math scores
huang_total_students = school_index.loc["Huang High School","size"]
huang_total_students

2917

In [183]:
# %PASSING READING (2299/2917)
huang_passing_reading_percentage = huang_num_passing_reading_students/huang_total_students * 100
print(f"The percentage of students passing math is : {huang_passing_reading_percentage.unique()}%")

The percentage of students passing math is : [78.81384985]%


In [223]:
huang_overall_passing_rate = (huang_passing_math_percentage + huang_passing_reading_percentage) / 2
print(f"The percentage of overall passing rate is: {huang_overall_passing_rate.unique()} %" )

The percentage of overall passing rate is: [71.06616387] %


In [226]:
#summary table for school profile
#Create DataFrame with above results with cleaner formatting
huang_summary_table = pd.DataFrame({"School Name":[huang_school_name],
                             "School Type": [huang_school_type],
                             "Total Students": [huang_total_students],
                             "Total School Budget": [huang_school_budget],
                             "Per Student Budget": [huang_per_student_budget],
                             "Average Math Score": [huang_average_math],
                             "Average Reading Score": [huang_average_reading],
                             "% Passing Math Rate": [huang_passing_math_percentage],
                             "% Passing Reading Rate":[huang_passing_reading_percentage],
                             "% Overall Passing Rate":[huang_overall_passing_rate]})
huang_summary_table.round(2)

NameError: name 'avg' is not defined

## School index(1):  Figueroa High School

In [185]:
#Figueroa high school MATH scores
figueroa_math_scores = student_data_index.loc["Figueroa High School", "math_score"]
figueroa_math_scores

school_name
Figueroa High School    87
Figueroa High School    84
Figueroa High School    77
Figueroa High School    64
Figueroa High School    64
                        ..
Figueroa High School    92
Figueroa High School    66
Figueroa High School    64
Figueroa High School    83
Figueroa High School    65
Name: math_score, Length: 2949, dtype: int64

In [186]:
#AVG MATH SCORES for Huang High School
figueroa_average_math = figueroa_math_scores.mean()
figueroa_average_math

76.71176670057646

In [187]:
#Huang high school READING SCORES
figueroa_reading_scores = student_data_index.loc["Figueroa High School", "reading_score"]
figueroa_reading_scores


school_name
Figueroa High School    85
Figueroa High School    97
Figueroa High School    67
Figueroa High School    97
Figueroa High School    79
                        ..
Figueroa High School    81
Figueroa High School    99
Figueroa High School    98
Figueroa High School    91
Figueroa High School    99
Name: reading_score, Length: 2949, dtype: int64

In [188]:
#AVG READING SCORES for Huang High School
figueroa_average_reading = figueroa_reading_scores.mean()
figueroa_average_reading 

81.15801966768396

In [189]:
# STUDENTS PASSING MATH for Huang High School
# Student ID	student_name	gender	grade	reading_score	math_score
figueroa_passing_math_count = student_data.loc[(student_data["math_score"] > 70) & (student_data["school_name"] == "Figueroa High School"), :]
figueroa_passing_math_count

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
2917,2917,Amy Jacobs,F,10th,Figueroa High School,85,87
2918,2918,Nathan Campbell,M,12th,Figueroa High School,97,84
2919,2919,Randall Stewart,M,12th,Figueroa High School,67,77
2922,2922,Amanda Hamilton DDS,F,9th,Figueroa High School,72,93
2923,2923,Anthony Pace,M,10th,Figueroa High School,66,90
...,...,...,...,...,...,...,...
5858,5858,Rachel Knight,F,11th,Figueroa High School,78,71
5859,5859,Courtney Gonzalez,F,12th,Figueroa High School,75,91
5860,5860,Susan Payne,F,10th,Figueroa High School,81,92
5861,5861,April Sullivan,F,9th,Figueroa High School,81,92


In [190]:

# number of passing math students
figueroa_num_passing_math_students = figueroa_passing_math_count.count()
print(figueroa_num_passing_math_students.unique())


[1880]


In [191]:
#total students at Huang High School with math scores
figueroa_total_students = school_index.loc["Figueroa High School","size"]
figueroa_total_students


2949

In [192]:
# %PASSING MATH (1880/2949)
figueroa_passing_math_percentage = figueroa_num_passing_math_students/figueroa_total_students * 100
print(f"The percentage of students passing math is : {figueroa_passing_math_percentage.unique()}.")

The percentage of students passing math is : [63.75042387].


In [193]:
## students PASSING READING for Figueroa High School
# Student ID	student_name	gender	grade	reading_score	math_score
figueroa_passing_reading_count = student_data.loc[(student_data["reading_score"] > 70) & (student_data["school_name"] == "Figueroa High School"), :]
figueroa_passing_reading_count



Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
2917,2917,Amy Jacobs,F,10th,Figueroa High School,85,87
2918,2918,Nathan Campbell,M,12th,Figueroa High School,97,84
2920,2920,Jennifer Brown,F,9th,Figueroa High School,97,64
2921,2921,Denise Lopez,F,10th,Figueroa High School,79,64
2922,2922,Amanda Hamilton DDS,F,9th,Figueroa High School,72,93
...,...,...,...,...,...,...,...
5861,5861,April Sullivan,F,9th,Figueroa High School,81,92
5862,5862,Jessica Mccann,F,11th,Figueroa High School,99,66
5863,5863,Charles Walker,M,11th,Figueroa High School,98,64
5864,5864,Tammy Burns,F,9th,Figueroa High School,91,83


In [194]:
# number of passing reading students
figueroa_num_passing_reading_students = figueroa_passing_reading_count.count()
print(figueroa_num_passing_reading_students.unique())



[2313]


In [195]:
#total students at High School with math scores
figueroa_total_students = school_index.loc["Figueroa High School","size"]
figueroa_total_students


2949

In [196]:
# %PASSING READIN (2299/2917)
figueroa_passing_reading_percentage = figueroa_num_passing_reading_students/figueroa_total_students * 100
print(f"The percentage of students passing math is : {figueroa_passing_reading_percentage.unique()}%")



The percentage of students passing math is : [78.43336724]%


In [197]:

figueroa_overall_passing_rate = (figueroa_passing_math_percentage + figueroa_passing_reading_percentage) / 2
print(f"The percentage of overall passing rate is: {figueroa_overall_passing_rate.unique()} %" )


The percentage of overall passing rate is: [71.09189556] %


In [198]:
#Calculate the overall_passing_score
overall_passing_rate = (average_math_score + average_reading_score) / 2
overall_passing_rate

80.43160582078121

In [199]:
# Calculate the percentage of students with a passing math score (70 or greater)
passing_math_count = school_data_complete[(school_data_complete["math_score"] >= 70)].count()["student_name"]
passing_math_percentage = passing_math_count / float(student_count) * 100
print(f"The number of students passing math is: {passing_math_count}")
print(f"The percentage of students passing math is: {passing_math_percentage}.")

The number of students passing math is: 29370
The percentage of students passing math is: 74.9808526933878.


In [200]:
#Calculate the percentage of students with a passing reading score (70 or greater)
passing_reading_count = school_data_complete[(school_data_complete["reading_score"] >= 70)].count()["student_name"]
passing_reading_percentage = passing_reading_count / float(student_count) * 100
print(f"The number of students passing reading is: {passing_reading_count}.")
print(f"The percentage of students passing reading is: {passing_reading_percentage}")

The number of students passing reading is: 33610.
The percentage of students passing reading is: 85.80546336482001


In [201]:
#Create DataFrame with above results with cleaner formatting
summary_table = pd.DataFrame({"Total Schools":[school_count],
                             "Total Students":[student_count],
                             "Total Budget":total_budget,
                             "Average Math Score":average_math_score,
                             "Average Reading Score":average_reading_score,
                             "% Passing Math":[passing_math_percentage],
                             "% Passing Reading":[passing_reading_percentage],
                             "% Overall Passing Rate":[overall_passing_rate]})
summary_table.round(2)

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,82932329558,78.99,81.88,74.98,85.81,80.43


## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)
  
* Create a dataframe to hold the above results

## Top Performing Schools (By Passing Rate)

* Sort and display the top five schools in overall passing rate

In [202]:
#create overview table that summarizes each school
# Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
#        'reading_score', 'math_score', 'School ID', 'type', 'size', 'budget'],
#       dtype='object')
school_data_complete[["school_name", "type", "student_name", "budget", "math_score", "reading_score"]]

Unnamed: 0,school_name,type,student_name,budget,math_score,reading_score
0,Huang High School,District,Paul Bradley,1910635,79,66
1,Huang High School,District,Victor Smith,1910635,61,94
2,Huang High School,District,Kevin Rodriguez,1910635,60,90
3,Huang High School,District,Dr. Richard Scott,1910635,58,67
4,Huang High School,District,Bonnie Ray,1910635,84,97
...,...,...,...,...,...,...
39165,Thomas High School,Charter,Donna Howard,1043130,90,99
39166,Thomas High School,Charter,Dawn Bell,1043130,70,95
39167,Thomas High School,Charter,Rebecca Tanner,1043130,84,73
39168,Thomas High School,Charter,Desiree Kidd,1043130,90,99


In [203]:
#Creating list of school types (charter vs. district)
school_type = school_data['type'].unique()
school_type

array(['District', 'Charter'], dtype=object)

In [204]:
# Create an overview table that summarizes key metrics about each school, including:
# School Name
# School Type
# Total Students
# Total School Budget
# Per Student Budget
# Average Math Score
# Average Reading Score
# % Passing Math
# % Passing Reading
# Overall Passing Rate (Average of the above two)
# Create a dataframe to hold the above results


#sorting and displaying top 5 schools with Overall Passing


In [205]:
# school_data_complete.columns()

## Bottom Performing Schools (By Passing Rate)

In [206]:

bottom = school_data_complete[""]
bottom_five_df.head()

KeyError: ''

* Sort and display the five worst-performing schools

In [None]:
overall_passing_df = summary_table.sort_values("% Overall Passing Rate")
overall_passing_df.tail(5)

## Math Scores by Grade

* Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [None]:
ninth_grade_scores = school_data_complete.loc[(school_data_complete['grades'] == "9th" & school_data_complete['math_score'] == "78.98537145774827")]
ninth_grade_scores

## Reading Score by Grade 

* Perform the same operations as above for reading scores

## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [None]:
# Sample bins. Feel free to create your own bins.
spending_bins = [0, 585, 615, 645, 675]
group_names = ["<$585", "$585-615", "$615-645", "$645-675"]

## Scores by School Size

* Perform the same operations as above, based on school size.

In [None]:
# Sample bins. Feel free to create your own bins.
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

## Scores by School Type

* Perform the same operations as above, based on school type.

In [None]:
# cleaned_school_df.to_csv("output/PyCitySchools_main.py", encoding="utf-8" index=False, header=True)