In [200]:
#import pandas, load in as shorthand so we can use it easier- don't have to write pandas every time
import pandas as pd

In [201]:
#read the three input CSV files (test results, student roster, site list) into DataFrames
df_test, df_students, df_sites = (
    pd.read_csv(csv_name)
    for csv_name in ('test_data.csv', 'students.csv', 'sites.csv')
)

In [202]:
#preview the test DataFrame (first 5 rows)
df_test.head(n=5)

Unnamed: 0,test_date,last_name,first_name,score,school_name
0,2017-10-01,Asis,Atif Sifiso,1,Hamilton Senior High School
1,2017-10-01,Savatier,Jaga,3,Hamilton Senior High School
2,2017-10-01,Wruck,Arnfinn,3,Hamilton Senior High School
3,2017-10-01,Venera Kohut,Jolanka,4,Hamilton Senior High School
4,2017-10-01,Benetton,Lorcan,4,Hamilton Senior High School


In [203]:
#preview the students DataFrame (first 5 rows)
df_students.head(n=5)

Unnamed: 0,student_id,last_name,first_name,site_id
0,1,Asis,Atif Sifiso,1
1,2,Savatier,Jaga,1
2,3,Wruck,Arnfinn,1
3,4,Venera Kohut,Jolanka,1
4,5,Benetton,Lorcan,1


In [204]:
#preview the sites DataFrame (first 5 rows)
df_sites.head(n=5)

Unnamed: 0,site_id,site_name
0,1,Hamilton Senior High School
1,2,James K. Polk Preparatory Academy
2,3,Rutherford B Hayes Junior High
3,4,FDR Middle School
4,5,The New School


In [205]:
#attach site information to each student by joining on 'site_id'
#an outer join keeps students without a matching site and sites without any students
df_students_sites = df_students.merge(df_sites, on='site_id', how='outer')

#preview the combined students/sites DataFrame (first 5 rows)
df_students_sites.head(n=5)

Unnamed: 0,student_id,last_name,first_name,site_id,site_name
0,1,Asis,Atif Sifiso,1,Hamilton Senior High School
1,2,Savatier,Jaga,1,Hamilton Senior High School
2,3,Wruck,Arnfinn,1,Hamilton Senior High School
3,4,Venera Kohut,Jolanka,1,Hamilton Senior High School
4,5,Benetton,Lorcan,1,Hamilton Senior High School


In [206]:
#merge the DataFrame of students and sites with the test DataFrame, preserving all unmatched values from both DataFrames
#the 'site_name' and 'school_name' columns are not necessary in the ultimate CSV, so they are left out of the merge inputs
#the columns are dropped on temporary copies (no inplace=True) so this cell can be re-run without a KeyError and the DataFrames displayed in earlier cells stay as shown
#the join keys are spelled out: once those two columns are removed, the name columns are the only ones the two DataFrames share
df_merged = pd.merge(
    df_students_sites.drop(columns=['site_name']),
    df_test.drop(columns=['school_name']),
    how='outer',
    on=['last_name', 'first_name'],
)

In [207]:
#the instructions call for the column name 'test_score', so rename 'score' accordingly
df_merged = df_merged.rename(columns={'score': 'test_score'})

In [208]:
#remove leading/trailing spaces from last name
#the cleanup is applied to df_merged: 'df' is not created until the next cell (so this cell failed on a fresh kernel), and it is rebuilt there from df_merged, which discarded any change made to 'df' here
df_merged = df_merged.assign(last_name=df_merged["last_name"].str.strip())

#group the DataFrame by 'last_name' and assign the first non-null 'student_id' in each group to all rows in that group, so rows that share a last name share one student_id
#NOTE(review): different students who share a last name would also receive the same id - confirm last names are unique in the roster
df_merged = df_merged.assign(
    student_id=df_merged.groupby("last_name")["student_id"].transform("first")
)

#sort the merged DataFrame by 'student_id' from highest to lowest
df_merged = df_merged.sort_values(by='student_id', ascending=False)

In [209]:
#drop duplicates based on 'student_id' and 'test_score' columns, keeping the first occurrence of each duplicate
#NOTE(review): missing student_id values compare as equal here, so unmatched rows that share a test_score collapse into one row - confirm this is intended
#capitalize the first letter of each word in the 'first_name' and 'last_name' columns
#both steps are one chain using .assign(), so the name columns are written into a new DataFrame instead of being assigned onto the drop_duplicates() result (which can raise SettingWithCopyWarning)
df = (
    df_merged
    .drop_duplicates(subset=["student_id", "test_score"])
    .assign(
        last_name=lambda frame: frame['last_name'].str.title(),
        first_name=lambda frame: frame['first_name'].str.title(),
    )
)

#print the whole DataFrame
with pd.option_context('display.max_rows', None):
    print(df)

#check how many rows are in the final DataFrame
len(df)

     student_id              last_name           first_name  site_id  \
20        100.0                Badcock            Caitlin R      5.0   
92         99.0                Nikolov  Omolara Aishwarya V      5.0   
62         98.0               Kendrick             Sostrate      5.0   
61         98.0               Kendrick             Sostrate      5.0   
84         97.0                  Misra          Flavianus H      5.0   
55         96.0               Horsfall              Jenny H      5.0   
131        95.0            Stainthorpe              Helen J      5.0   
39         94.0                  Dries              Odell G      5.0   
143        93.0            Van De Laar             Baxter V      5.0   
96         92.0                  Novak            Astridr Z      5.0   
78         91.0              Mathieson            Tat'Ana N      5.0   
44         90.0                  Fromm             Pravin R      5.0   
94         89.0                 Nikula             Klahan T     

108

In [210]:
#print match counts: rows are grouped by their pattern of non-null columns, so each output line is one True/False combination across the columns and the number of rows that have it
#a row with student_id False is a test result for which no student record was found
print('Match counts: ')
match_counts = df.notna().value_counts()
print( match_counts )

#match rates: each pattern's row count divided by the total number of rows in the DataFrame
print('Match rates: ')
match_rates = match_counts / match_counts.sum()
print( match_rates )

#convert to CSV
#missing values from the outer merge leave student_id as a float column (ids print as 100.0); cast to the nullable integer dtype so ids are written as 100 and missing ids stay blank
df_export = df.astype({'student_id': 'Int64'})
df_export.to_csv('matched_student_data.csv', columns=['student_id', 'test_date', 'test_score'], index=False)

Match counts: 
student_id  last_name  first_name  site_id  test_date  test_score
True        True       True        True     True       True          104
False       True       True        False    True       True            4
Name: count, dtype: int64
student_id  last_name  first_name  site_id  test_date  test_score
True        True       True        True     True       True          0.962963
False       True       True        False    True       True          0.037037
Name: count, dtype: float64
