### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the school and student CSV files into pandas DataFrames.
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset: a left join keeps every student row
# and attaches the columns of the school with the matching "school_name".
# The join key is given once; listing it twice adds nothing.
school_data_complete = pd.merge(student_data, school_data, how="left", on="school_name")

# Work on a copy so the merged frame itself is left unmodified.
df = school_data_complete.copy()
df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [7]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the merged student/school data.
df.describe()

Unnamed: 0,Student ID,reading_score,math_score,School ID,size,budget
count,39170.0,39170.0,39170.0,39170.0,39170.0,39170.0
mean,19584.5,81.87784,78.985371,6.978172,3332.95711,2117241.0
std,11307.549359,10.23958,12.309968,4.444329,1323.914069,874998.7
min,0.0,63.0,55.0,0.0,427.0,248087.0
25%,9792.25,73.0,69.0,3.0,1858.0,1081356.0
50%,19584.5,82.0,79.0,7.0,2949.0,1910635.0
75%,29376.75,91.0,89.0,11.0,4635.0,3022020.0
max,39169.0,99.0,99.0,14.0,4976.0,3124928.0


In [8]:
## District Summary

# Calculate the total number of schools

def count_unique(obj):
    """Return how many distinct values ``obj`` holds.

    ``obj`` must provide a ``unique()`` method (for example a pandas Series);
    the result is the length of whatever that method returns.
    """
    distinct_values = obj.unique()
    return len(distinct_values)

# Number of distinct school names appearing in the merged data.
total_schools = len(df["school_name"].unique())
print(f"Total schools = {total_schools}")

Total schools = 15


In [9]:
## District Summary

# Calculate the total number of students

# Each row of df is one student record, so the row count is the student total.
# Counting rows (instead of non-null "student_name" values) keeps a student
# whose name is missing in the total, which is later used as the denominator
# of the overall passing percentage.
total_students = len(df)
print(f"Total students = {total_students}")

Total students = 39170


In [10]:
## District Summary

# Calculate the total budget

def sum_unique(obj):
    """Return the sum of the distinct values in ``obj``.

    ``obj`` must provide a ``unique()`` method (for example a pandas Series);
    repeated values contribute to the total only once.
    """
    distinct_values = obj.unique()
    return sum(distinct_values)

# A school's budget is repeated on each of its student rows, so reduce the
# data to one budget per school before summing. Summing the distinct budget
# *values* instead would silently drop a school whenever two schools happen
# to have the same budget figure.
total_budget = df.groupby("school_name")["budget"].first().sum()
print(f"Total budget = {total_budget}")

Total budget = 24649428


In [11]:
## District Summary

# Calculate the average math score

def calc_average(obj):
    """Return the mean of ``obj`` as computed by its own ``mean()`` method."""
    average = obj.mean()
    return average

# Mean math score over all student rows.
avg_mscore = df["math_score"].mean()
print(f"Average math score = {avg_mscore}")

Average math score = 78.98537145774827


In [12]:
## District Summary

# Calculate the average reading score

# Mean reading score over all student rows.
avg_rscore = df["reading_score"].mean()
print(f"Average reading score = {avg_rscore}")

Average reading score = 81.87784018381414


In [20]:
## District Summary

# Calculate the percentage of students with a passing math score (70 or greater)

# Count the student rows on each side of the passing mark (70) for math.
total_pass_math = df.loc[df["math_score"] >= 70].shape[0]
total_fail_math = df.loc[df["math_score"] < 70].shape[0]

# Rows that fell into either group; used as the denominator below.
total_take_math = total_pass_math + total_fail_math

# Sanity check: show the combined count and whether it equals total_students.
print(total_take_math)
print(total_students == total_take_math)

print(f"Total pass math = {total_pass_math}")
print(f"Total fail math = {total_fail_math}")

# Share of math takers who passed, as a percentage.
per_pmath = total_pass_math / total_take_math * 100

print(f"Total % pass math = {per_pmath}")

39170
True
Total pass math = 29370
Total fail math = 9800
Total % pass math = 74.9808526933878


In [21]:
## District Summary

# Calculate the percentage of students with a passing reading score (70 or greater)

# Count the student rows on each side of the passing mark (70) for reading.
total_pass_reading = df.loc[df["reading_score"] >= 70].shape[0]
total_fail_reading = df.loc[df["reading_score"] < 70].shape[0]

# Rows that fell into either group; used as the denominator below.
total_take_reading = total_pass_reading + total_fail_reading

# Sanity check: show the combined count and whether it equals total_students.
print(total_take_reading)
print(total_students == total_take_reading)

print(f"Total pass reading = {total_pass_reading}")
print(f"Total fail reading = {total_fail_reading}")

# Share of reading takers who passed, as a percentage.
per_preading = total_pass_reading / total_take_reading * 100

print(f"Total % pass reading = {per_preading}")


39170
True
Total pass reading = 33610
Total fail reading = 5560
Total % pass reading = 85.80546336482001


In [24]:
## District Summary

# Calculate the percentage of students who passed math and reading (% Overall Passing)

# Rows where both the reading and the math score reach the passing mark (70).
total_pass_both = df.loc[df["reading_score"].ge(70) & df["math_score"].ge(70)].shape[0]

# The overall rate is measured against the full student count.
total_take_both = total_students

print(f"Total pass both = {total_pass_both}")
print(f"Total take both = {total_take_both}")

# Share of students who passed both subjects, as a percentage.
per_pboth = total_pass_both / total_take_both * 100

print(f"Total % pass both = {per_pboth}")

Total pass both = 25528
Total take both = 39170
Total % pass both = 65.17232575950983


In [27]:
# Create a dataframe to hold the above results

# One-row summary table of the district-level metrics computed above.
# Each key is a column label and each value is a single-element list, so the
# resulting frame has exactly one row. Keys must be unique: a repeated key in
# a dict literal silently overwrites the earlier entry.
district_summary = pd.DataFrame(
    {
        "Total Schools": [total_schools],
        "Total Students": [total_students],
        "Total Budget": [total_budget],
        "Average Math Score": [avg_mscore],
        "Average Reading Score": [avg_rscore],
        "% Passing Math": [per_pmath],
        "% Passing Reading": [per_preading],
        "% Overall Passing": [per_pboth],
    }
)

district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [28]:
# Optional: give the displayed data cleaner formatting

# TODO: look up how to do this. One idea to try is applying a format string
# to every cell of the summary table:

# district_summary.map("{}".format)
    
    

## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [72]:
# School Summary
# Create an overview table that summarizes key metrics about each school, including:

# School Name (make index)
# School Type
# Total Students
# Total School Budget
# Per Student Budget
# Average Math Score
# Average Reading Score
# % Passing Math
# % Passing Reading
# % Overall Passing (The percentage of students that passed math and reading.)
# Create a dataframe to hold the above results

# Start the school-level work from a fresh copy of the merged data.
df = school_data_complete.copy()

# Per-school total of the math scores, indexed by school name.
df_totals = df.groupby("school_name")["math_score"].agg("sum")

# Show the first rows of the working frame.
df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [76]:
# One row per school with its type, size and budget. The per-group minimum is
# used to collapse the student rows; presumably these values are constant
# within a school, so min simply picks that value -- TODO confirm.
df_scalars = df.groupby("school_name")[["type", "size", "budget"]].agg("min")

# Budget divided by school size gives the spending per student.
df_scalars["Per Student Budget"] = df_scalars["budget"].div(df_scalars["size"])

print(df_scalars)

                           type  size   budget  Per Student Budget
school_name                                                       
Bailey High School     District  4976  3124928               628.0
Cabrera High School     Charter  1858  1081356               582.0
Figueroa High School   District  2949  1884411               639.0
Ford High School       District  2739  1763916               644.0
Griffin High School     Charter  1468   917500               625.0
Hernandez High School  District  4635  3022020               652.0
Holden High School      Charter   427   248087               581.0
Huang High School      District  2917  1910635               655.0
Johnson High School    District  4761  3094650               650.0
Pena High School        Charter   962   585858               609.0
Rodriguez High School  District  3999  2547363               637.0
Shelton High School     Charter  1761  1056600               600.0
Thomas High School      Charter  1635  1043130               6

In [75]:
# Per-school mean of size, math score and reading score.
df_means = df.groupby("school_name")[["size", "math_score", "reading_score"]].agg("mean")
print(df_means)

                         size  math_score  reading_score
school_name                                             
Bailey High School     4976.0   77.048432      81.033963
Cabrera High School    1858.0   83.061895      83.975780
Figueroa High School   2949.0   76.711767      81.158020
Ford High School       2739.0   77.102592      80.746258
Griffin High School    1468.0   83.351499      83.816757
Hernandez High School  4635.0   77.289752      80.934412
Holden High School      427.0   83.803279      83.814988
Huang High School      2917.0   76.629414      81.182722
Johnson High School    4761.0   77.072464      80.966394
Pena High School        962.0   83.839917      84.044699
Rodriguez High School  3999.0   76.842711      80.744686
Shelton High School    1761.0   83.359455      83.725724
Thomas High School     1635.0   83.418349      83.848930
Wilson High School     2283.0   83.274201      83.989488
Wright High School     1800.0   83.682222      83.955000


In [83]:
# Per-school totals of the math and reading scores.
df_totals = df.groupby("school_name")[["math_score", "reading_score"]].agg("sum")
print(df_totals)


                       math_score  reading_score
school_name                                     
Bailey High School         383393         403225
Cabrera High School        154329         156027
Figueroa High School       226223         239335
Ford High School           211184         221164
Griffin High School        122360         123043
Hernandez High School      358238         375131
Holden High School          35784          35789
Huang High School          223528         236810
Johnson High School        366942         385481
Pena High School            80654          80851
Rodriguez High School      307294         322898
Shelton High School        146796         147441
Thomas High School         136389         137093
Wilson High School         190115         191748
Wright High School         150628         151119


## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

## Math Scores by Grade

* Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

## Reading Score by Grade 

* Perform the same operations as above for reading scores

## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)

## Scores by School Size

* Perform the same operations as above, based on school size.

## Scores by School Type

* Perform the same operations as above, based on school type