PyCity Schools Analysis

>my analysis here<

In [3]:
# Dependencies and Setup
from pathlib import Path
import pandas as pd

# File to Load
school_data_to_load = Path("../Resources/schools_complete.csv")
student_data_to_load = Path("../Resources/students_complete.csv")

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

In [76]:
# Have a look at the merged data using various pandas functions
school_data_complete.head(100)


Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,School ID,type,size,budget
0,0,Paul Bradley,M,9,Huang High School,96,94,0,Government,2917,1910635
1,1,Victor Smith,M,12,Huang High School,90,43,0,Government,2917,1910635
2,2,Kevin Rodriguez,M,12,Huang High School,41,76,0,Government,2917,1910635
3,3,Richard Scott,M,12,Huang High School,89,86,0,Government,2917,1910635
4,4,Bonnie Ray,F,9,Huang High School,87,69,0,Government,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
95,95,Kevin Martinez,M,11,Huang High School,43,89,0,Government,2917,1910635
96,96,Jessica Smith,F,9,Huang High School,55,49,0,Government,2917,1910635
97,97,Dawn Trujillo,F,11,Huang High School,49,89,0,Government,2917,1910635
98,98,Virginia Ramirez MD,F,10,Huang High School,66,95,0,Government,2917,1910635


In [38]:
school_data_complete.describe

<bound method NDFrame.describe of        Student ID     student_name gender  year         school_name  \
0               0     Paul Bradley      M     9   Huang High School   
1               1     Victor Smith      M    12   Huang High School   
2               2  Kevin Rodriguez      M    12   Huang High School   
3               3    Richard Scott      M    12   Huang High School   
4               4       Bonnie Ray      F     9   Huang High School   
...           ...              ...    ...   ...                 ...   
39165       39165     Donna Howard      F    12  Thomas High School   
39166       39166        Dawn Bell      F    10  Thomas High School   
39167       39167   Rebecca Tanner      F     9  Thomas High School   
39168       39168     Desiree Kidd      F    10  Thomas High School   
39169       39169  Carolyn Jackson      F    11  Thomas High School   

       reading_score  maths_score  School ID         type  size   budget  
0                 96           94     

In [12]:
school_data_complete.shape

(39170, 11)

Local Government Area Summary

In [123]:
# Calculate the Totals (Schools and Students)
school_count = len(school_data_complete["school_name"].unique())
student_count = len(school_data_complete["Student ID"].unique())

# Calculate the Total Budget
total_budget = school_data["budget"].sum()


In [124]:
print(f"Number of schools: {school_count}")
print(f"Number of students: {student_count}")
print(f"Total budget: {total_budget}")

Number of schools: 15
Number of students: 39170
Total budget: 24649428


In [117]:
# Calculate the Average Scores
average_maths_score = school_data_complete["maths_score"].mean()
average_reading_score = school_data_complete["reading_score"].mean()

In [60]:
print(f"Average math score: {average_maths_score}")
print(f"Average reading score: {average_reading_score}")

Average math score: 70.33819249425581
Average reading score: 69.98013786060761


In [98]:
type(school_data_complete["reading_score"])

pandas.core.series.Series

In [197]:
# Calculate the Percentage Pass Rates
passing_maths_count = school_data_complete[(school_data_complete["maths_score"] >= 50)].count()["student_name"]
passing_maths_percentage = passing_maths_count / float(student_count) * 100

passing_reading_count = school_data_complete[(school_data_complete["reading_score"] >= 50)].count()["student_name"]
passing_reading_percentage = passing_reading_count /float(student_count) * 100

passing_maths_reading_count = school_data_complete[(school_data_complete["maths_score"] >= 50) & (school_data_complete["reading_score"] >= 50)].count()["student_name"]
overall_passing_rate = passing_maths_reading_count/float(student_count)* 100

In [198]:
print(passing_maths_percentage)
print(passing_reading_percentage)
print(overall_passing_rate)

86.07863160582077
84.42685728874139
72.80827163645647


In [199]:
print(passing_reading_count)
print(passing_maths_reading_count)

33070
28519


In [216]:
# Convert to DataFrame
area_summary = pd.DataFrame({"Total Schools": [school_count],
                             "Total Students": [student_count],
                            "Total Budget": [total_budget],
                            "Average Maths Score": [average_maths_score],
                           "Average Reading Score": [average_reading_score],
                             "% Passing Maths": [passing_maths_percentage],
                             "% Passing Reading": [passing_reading_percentage],
                            "% Overall Passing": [overall_passing_rate]})
                             
                             
# Formatting
area_summary["Total Students"] = area_summary["Total Students"].map("{:,}".format)
area_summary["Total Budget"] = area_summary["Total Budget"].map("${:,.2f}".format)

# Display the DataFrame
area_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",70.338192,69.980138,86.078632,84.426857,72.808272


School Summary

In [270]:
# Use the code provided to select the type per school from school_data
school_types = school_data.set_index(["school_name"])["type"]

# Calculate the total student count per school from school_data
per_school_counts = school_data.set_index(["school_name"])["size"]

# Calculate the total school budget and per capita spending per school from school_data
per_school_budget = school_data.set_index(["school_name"])["budget"]
per_school_capita = school_data["budget"]/school_data["size"]
per_school_capita_column = school_data["per_school_capital"] = per_school_capita

# Calculate the average test scores per school from school_data_complete
per_school_maths_grouped =  school_data_complete.groupby(["school_name"])
per_school_maths =  per_school_maths_grouped["maths_score"].mean()

per_school_reading_grouped = school_data_complete.groupby(["school_name"])
per_school_reading = per_school_reading_grouped["reading_score"].mean()


In [226]:
print(school_types)
print(per_school_counts)
print(per_school_maths)

school_name
Huang High School         Government
Figueroa High School      Government
Shelton High School      Independent
Hernandez High School     Government
Griffin High School      Independent
Wilson High School       Independent
Cabrera High School      Independent
Bailey High School        Government
Holden High School       Independent
Pena High School         Independent
Wright High School       Independent
Rodriguez High School     Government
Johnson High School       Government
Ford High School          Government
Thomas High School       Independent
Name: type, dtype: object
school_name
Huang High School        2917
Figueroa High School     2949
Shelton High School      1761
Hernandez High School    4635
Griffin High School      1468
Wilson High School       2283
Cabrera High School      1858
Bailey High School       4976
Holden High School        427
Pena High School          962
Wright High School       1800
Rodriguez High School    3999
Johnson High School      4761
Ford 

In [269]:
# Get the students who passed maths and passed reading by creating separate filtered DataFrames from school_data_complete.
schools_counts = school_data_complete.groupby(["school_name"])
maths  = schools_counts[["maths_score"] >= 50].sum()["school_name"]
print(maths)

#school_data_complete.groupby(["school_name"])
#print(school_passing_maths_grouped)
#school_passing_maths_grouped.count().head()

#maths = school_passing_maths_grouped["passing_maths_count"]
#school_data_complete.groupby(["school_name"])
#school_passing_maths = school_passing_maths1["maths_score"] >= 50
#school_passing_reading = school_data_complete.groupby(["student_name"])

# Get the students who passed both reading and maths in a separate DataFrame from school_data_complete.
#passing_maths_and_reading =


TypeError: '>=' not supported between instances of 'list' and 'int'

In [None]:
#  Calculate the Percentage Pass Rates
per_school_passing_maths =
per_school_passing_reading =
overall_passing_rate =

In [None]:
school_data_complete.groupby["school_name"]school_data_complete[(school_data_complete["maths_score"] >= 50)].count()["student_name"]

In [275]:
# Convert to DataFrame
per_school_summary = pd.DataFrame({"Total Type": [school_types],
                             "Total Students": [per_school_counts],
                            "Total Budget": [per_school_budget],
                            "Per Students Budget": [per_school_capita],
                           "Average Maths Score": [per_school_maths],
                             "Average Reading Score": [per_school_reading]})

# Formatting
#per_school_summary["Total School Budget"] = per_school_summary["Total School Budget"].map("${:,.2f}".format)
#per_school_summary["Per Student Budget"] = per_school_summary["Per Student Budget"].map("${:,.2f}".format)

# Display the DataFrame
per_school_summary


Unnamed: 0,Total Type,Total Students,Total Budget,Per Students Budget,Average Maths Score,Average Reading Score
0,school_name Huang High School Governme...,school_name Huang High School 2917 Figu...,school_name Huang High School 1910635 F...,0 655.0 1 639.0 2 600.0 3 652....,school_name Bailey High School 72.352894...,school_name Bailey High School 71.008842...
