In [1]:
import pandas as pd

In [2]:
# Locations of the two CSV inputs, given relative to the notebook's
# working directory. Later cells read these names when loading the data.
school_data_to_load, student_data_to_load = (
    "Resources/schools_complete.csv",
    "Resources/students_complete.csv",
)

In [3]:
# Load both CSV inputs into pandas DataFrames: the school-level file
# first, then the student-level file (same order as the original reads).
school_data_df, student_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (school_data_to_load, student_data_to_load)
)

In [4]:
# Honorifics and credentials to strip out of student names.
# The embedded spaces are deliberate: titles carry a trailing space and
# credentials a leading one, so removing them leaves no stray whitespace.
prefixes_suffixes = [
    # titles that precede a name
    "Dr. ",
    "Mr. ",
    "Ms. ",
    "Mrs. ",
    "Miss ",
    # credentials that follow a name
    " MD",
    " DDS",
    " DVM",
    " PhD",
]

In [5]:
# Strip every unwanted title/credential from the student names.
# regex=False makes pandas treat each entry as a literal string, so the "."
# in entries such as "Dr. " is matched as a plain period, not a wildcard.
# NOTE(review): a literal replace removes the text wherever it occurs in the
# name, not only at the very start or end - confirm that is acceptable.
cleaned_names = student_data_df["student_name"]
for affix in prefixes_suffixes:
    cleaned_names = cleaned_names.str.replace(affix, "", regex=False)
student_data_df["student_name"] = cleaned_names

In [6]:
# Combine the data into a single dataset: one row per student, with the
# attributes of that student's school appended as extra columns.
#
# - The join key is given once (the original listed "school_name" twice).
# - how="inner" is pandas' default, spelled out here so it is visible that
#   students whose school is missing from school_data_df are dropped.
# - validate="many_to_one" makes pandas raise a MergeError if a school name
#   appears more than once in school_data_df, which would otherwise silently
#   duplicate student rows and inflate every count computed downstream.
school_data_complete_df = pd.merge(
    student_data_df,
    school_data_df,
    how="inner",
    on="school_name",
    validate="many_to_one",
)

# TODO: type, size, and budget are properties of the school, not the student -
# consider renaming them (e.g. with a "school_" prefix) to make that explicit.
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [7]:
# District-wide headline figures.
# Row counts are taken with len(): it counts rows, whereas .count() counts
# the non-null values in each column, so the two only differ when data is
# missing.
total_school_count = len(school_data_df.index)
total_student_count = len(school_data_complete_df.index)

# The budget is summed over the school table (one row per school); the merged
# table repeats each school's budget once per student.
total_budget = school_data_df["budget"].sum()

average_reading_score = school_data_complete_df["reading_score"].mean()
average_math_score = school_data_complete_df["math_score"].mean()

district_report = (
    f"These {total_school_count} schools have {total_student_count:,} students.\n"
    f"Their total budget is ${total_budget:,}.\n"
    f"The average math score is {average_math_score:.2f}, "
    f"and the average reading score is {average_reading_score:.2f}."
)
print(district_report)

These 15 schools have 39,170 students.
Their total budget is $24,649,428.
The average math score is 78.99, and the average reading score is 81.88.


In [8]:
# Many later computations need a per-student pass/fail flag, so the flags
# are stored as boolean columns on the main merged DataFrame.
# A score of 70 or higher counts as passing.
math_scores = school_data_complete_df["math_score"]
reading_scores = school_data_complete_df["reading_score"]
passing_math = math_scores >= 70
passing_reading = reading_scores >= 70

# Columns are added in place, in this order: math, reading, both.
for flag_column, flag_values in (
    ("pass_math", passing_math),
    ("pass_reading", passing_reading),
    ("pass_both", passing_math & passing_reading),
):
    school_data_complete_df[flag_column] = flag_values

school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass_math,pass_reading,pass_both
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True,False,False
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,False,True,False
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False,True,False
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False,False,False
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True,True,True


In [9]:
# Summing a boolean column counts its True values, so each sum below is the
# number of students who passed that subject (or both subjects).
math_pass_count, reading_pass_count, both_pass_count = (
    school_data_complete_df[flag_column].sum()
    for flag_column in ("pass_math", "pass_reading", "pass_both")
)

# Each percentage is the pass count over the total number of students.
pass_rate_report = (
    f"{math_pass_count / total_student_count * 100:.2f}% of students passed math.\n"
    f"{reading_pass_count / total_student_count * 100:.2f}% of students passed reading.\n"
    f"{both_pass_count / total_student_count * 100:.2f}% of students passed both."
)
print(pass_rate_report)

74.98% of students passed math.
85.81% of students passed reading.
65.17% of students passed both.


In [None]:
# Build the one-row district summary table from the figures computed above.
# A list holding a single dict produces a DataFrame with one row whose
# columns are the dict keys.
#
# The cell previously referenced names that no earlier cell defines
# (school_count, student_count, passing_math_percentage, ...), so it raised
# NameError on a fresh kernel. It now uses the variables the notebook actually
# creates and derives the pass percentages from the pass counts.
district_summary_df = pd.DataFrame(
    [
        {
            "Total Schools": total_school_count,
            "Total Students": total_student_count,
            "Total Budget": total_budget,
            "Average Math Score": average_math_score,
            "Average Reading Score": average_reading_score,
            "% Passing Math": math_pass_count / total_student_count * 100,
            "% Passing Reading": reading_pass_count / total_student_count * 100,
            "% Overall Passing": both_pass_count / total_student_count * 100,
        }
    ]
)
district_summary_df