In [1]:
# Add the Pandas dependency
import pandas as pd

In [2]:
# Relative paths of the two CSV inputs used throughout the notebook
schools_file, students_file = (
    "Resources/schools_complete.csv",
    "Resources/students_complete.csv",
)

In [4]:
# Load the school-level CSV into a DataFrame, then preview its first five rows
school_data_df = pd.read_csv(filepath_or_buffer=schools_file)
school_data_df.head(n=5)

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
# Load the student-level CSV into a DataFrame, then preview its first five rows
student_data_df = pd.read_csv(filepath_or_buffer=students_file)
student_data_df.head(n=5)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [6]:
# Non-null entries per column of the school data; a column whose count is
# lower than the others contains missing values
school_data_df.count(axis="index")

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [7]:
# Non-null entries per column of the student data; a column whose count is
# lower than the others contains missing values
student_data_df.count(axis="index")

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [8]:
# Element-wise missing-value mask for the school data: True marks a missing cell.
# This displays the whole boolean frame; chain .sum() to get per-column totals.
school_data_df.isna()

Unnamed: 0,School ID,school_name,type,size,budget
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [10]:
# Number of missing values in each column of the student data
# (summing the boolean mask counts its True entries)
student_data_df.isna().sum()

Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

In [11]:
# Element-wise "value present" mask for the school data: True marks a non-missing cell.
# This displays the whole boolean frame; chain .sum() to get per-column totals.
school_data_df.notna()

Unnamed: 0,School ID,school_name,type,size,budget
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True
5,True,True,True,True,True
6,True,True,True,True,True
7,True,True,True,True,True
8,True,True,True,True,True
9,True,True,True,True,True


In [12]:
# Number of non-missing values in each column of the student data
# (summing the boolean mask counts its True entries)
student_data_df.notna().sum()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [14]:
# Determine the data type of each column in school_data_df
# (dtypes is an attribute, so it is read without parentheses)
school_data_df.dtypes

School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [15]:
# Find the data type for the column budget only.
# A cell displays only its last expression, so a single lookup is enough;
# bracket access is used because it works for any column label.
school_data_df["budget"].dtype

dtype('int64')

In [16]:
# Determine the data type of each column in student_data_df
# (dtypes is an attribute, so it is read without parentheses)
student_data_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [17]:
# Use results from cleaning_student_names.ipynb to clean the data.
# Titles and credentials to strip from student names; each entry carries the
# adjoining space so that no stray blank is left behind in the name.
prefix_suffix = ['Dr. ', 'Miss ', 'Mr. ', 'Mrs. ', 'Ms. ', ' DDS', ' DVM', ' MD', ' PhD']
for ps in prefix_suffix:
    # regex=False treats each entry as a literal substring. Under regex matching
    # the "." in "Dr. " / "Mr. " would match ANY character, and the default for
    # this flag differs between pandas versions, so it is pinned explicitly.
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(ps, "", regex=False)

  student_data_df.student_name = student_data_df.student_name.str.replace(ps,'')


In [18]:
# Define a new DataFrame by merging the two datasets: each student row gains
# the columns of its school (default inner join on the shared school_name column).
school_data_complete_df = pd.merge(
    student_data_df,
    school_data_df,
    # A single join key is sufficient; listing school_name twice added nothing.
    on="school_name",
    # Fail loudly if a school name repeats in school_data_df: a duplicate would
    # silently duplicate student rows and inflate every count computed later.
    validate="many_to_one",
)
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [20]:
# Check the non-null count of each column of the merged data.
# The per-column Series gets its own name: student_count is used elsewhere in
# this notebook for the scalar number of students, and binding a Series to that
# name would turn the percentage calculations into Series if cells ran out of order.
column_counts = school_data_complete_df.count()
column_counts

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
School ID        39170
type             39170
size             39170
budget           39170
dtype: int64

In [21]:
# Number of students: non-null entries in the "Student ID" column of the merged data
student_count = school_data_complete_df.loc[:, "Student ID"].count()
student_count

39170

In [24]:
# Total number of schools: how many distinct school names occur in the merged data
school_count = len(school_data_complete_df["school_name"].unique())
school_count

15

In [25]:
# District-wide budget: sum of the budget column of the school table
# (not the merged table, where each school's budget repeats on every student row)
district_budget = school_data_df["budget"].sum()
district_budget

24649428

In [26]:
# Mean reading score over all rows of the merged data
reading_avg = school_data_complete_df["reading_score"].mean()
reading_avg

81.87784018381414

In [27]:
# Mean math score over all rows of the merged data
math_avg = school_data_complete_df["math_score"].mean()
math_avg

78.98537145774827

In [29]:
# Groundwork for the passing percentages in reading, math, and overall:
# a score of 70 or above counts as passing
passing_score = 70

# Boolean Series, one entry per row of the merged data:
# True where the score reaches the passing threshold
passing_math = school_data_complete_df["math_score"].ge(passing_score)
passing_reading = school_data_complete_df["reading_score"].ge(passing_score)

In [39]:
# Get the number of students who passed math.
# passing_math is a boolean mask, so summing it counts the True entries.
passing_math_total = passing_math.sum()
passing_math_total

29370

In [38]:
# Get the number of students who passed reading.
# passing_reading is a boolean mask, so summing it counts the True entries.
passing_reading_total = passing_reading.sum()
passing_reading_total

33610

In [37]:
# Keep only the rows whose math score reaches the passing threshold, then
# show the per-column non-null counts of that subset
passing_math_df = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(passing_score)
]
passing_math_df.count()

Student ID       29370
student_name     29370
gender           29370
grade            29370
school_name      29370
reading_score    29370
math_score       29370
School ID        29370
type             29370
size             29370
budget           29370
dtype: int64

In [40]:
# Keep only the rows whose reading score reaches the passing threshold, then
# show the per-column non-null counts of that subset
passing_reading_df = school_data_complete_df.loc[
    school_data_complete_df["reading_score"].ge(passing_score)
]
passing_reading_df.count()

Student ID       33610
student_name     33610
gender           33610
grade            33610
school_name      33610
reading_score    33610
math_score       33610
School ID        33610
type             33610
size             33610
budget           33610
dtype: int64

In [44]:
# Share of all students who passed each subject, expressed as a percentage
math_pct = (passing_math_total / student_count) * 100
reading_pct = (passing_reading_total / student_count) * 100
print(f"The pct of students who passed math is {math_pct:.2f}%")
print(f"The pct of students who passed reading is {reading_pct:.2f}%")

The pct of students who passed math is 74.98%
The pct of students who passed reading is 85.81%


In [48]:
# Rows for students who reached the passing score in math AND in reading
met_math = school_data_complete_df["math_score"].ge(passing_score)
met_reading = school_data_complete_df["reading_score"].ge(passing_score)
passing_both_df = school_data_complete_df[met_math & met_reading]
passing_both_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635
9,9,Matthew Greene,M,10th,Huang High School,96,84,0,District,2917,1910635


In [49]:
# Number of students who passed both subjects: non-null student names in the filtered frame
passing_both_total = passing_both_df["student_name"].count()
passing_both_total

25528

In [50]:
# Share of all students who passed both math and reading, expressed as a percentage
both_pct = (passing_both_total / student_count) * 100
print(f"The pct of students who passed both math and reading is {both_pct:.2f}%")

The pct of students who passed both math and reading is 65.17%


In [69]:
# Gather the district-level metrics into one record, keyed by display label ...
district_metrics = {
    "Total Schools": school_count,
    "Total Students": student_count,
    "Total Budget": district_budget,
    "Average Math Score": math_avg,
    "Average Reading Score": reading_avg,
    "% Passing Math": math_pct,
    "% Passing Reading": reading_pct,
    "% Overall Passing": both_pct,
}

# ... and turn that record into a single-row summary DataFrame
district_summary_df = pd.DataFrame([district_metrics])
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [74]:
# Render "Total Students" as text with a comma thousands separator.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
with_thousands_sep = "{:,}".format
district_summary_df["Total Students"] = district_summary_df["Total Students"].map(with_thousands_sep)

district_summary_df["Total Students"]

0    39,170
Name: Total Students, dtype: object

In [75]:
# Render "Total Budget" as text: dollar sign, comma thousands separator, two decimals.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
as_dollars = "${:,.2f}".format
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map(as_dollars)

district_summary_df["Total Budget"]

0    $24,649,428.00
Name: Total Budget, dtype: object

In [76]:
# Render "Average Math Score" as text with one decimal place.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
one_decimal = "{:.1f}".format
district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map(one_decimal)

district_summary_df["Average Math Score"]

0    79.0
Name: Average Math Score, dtype: object

In [77]:
# Render "Average Reading Score" as text with one decimal place.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
one_decimal = "{:.1f}".format
district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map(one_decimal)

district_summary_df["Average Reading Score"]

0    81.9
Name: Average Reading Score, dtype: object

In [70]:
# Render "% Passing Math" as text rounded to the nearest whole number.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
whole_number = "{:.0f}".format
district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map(whole_number)

district_summary_df["% Passing Math"]

0    75
Name: % Passing Math, dtype: object

In [71]:
# Render "% Passing Reading" as text rounded to the nearest whole number.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
whole_number = "{:.0f}".format
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map(whole_number)

district_summary_df["% Passing Reading"]

0    86
Name: % Passing Reading, dtype: object

In [72]:
# Render "% Overall Passing" as text rounded to the nearest whole number.
# The column holds strings afterwards, so run this once per freshly built district_summary_df.
whole_number = "{:.0f}".format
district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map(whole_number)

district_summary_df["% Overall Passing"]

0    65
Name: % Overall Passing, dtype: object

In [78]:
# Check on the updated dataframe with formatting.
# Columns that went through the .map(...format) cells now hold display strings rather than numbers.
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65
