In [10]:
import pandas as pd

In [11]:
# DataFrames can be combined side by side (merge/join) or stacked on top of each
# other (concat). Two small frames, both indexed by person name, set up the examples.

staff_df = pd.DataFrame({
    'Name': ['Kelly', 'Sally', 'James'],
    'Role': ['Director of HR', 'Course Liasion', 'Grader'],
}).set_index('Name')

student_df = pd.DataFrame({
    'Name': ['James', 'Mike', 'Sally'],
    'School': ['Business', 'Law', 'Engineering'],
}).set_index('Name')

# Sally and James appear in both frames; Kelly is staff only and Mike is a student only.
for frame in (staff_df, student_df):
    print(frame.head())

                 Role
Name                 
Kelly  Director of HR
Sally  Course Liasion
James          Grader
            School
Name              
James     Business
Mike           Law
Sally  Engineering


In [12]:
# Union of the two frames: an outer join keeps every name found in either frame.
# Both frames are indexed by 'Name', so the indices act as the join keys; where a
# person has no row in the other frame, the missing column is left empty (NaN).
staff_df.merge(student_df, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course Liasion,Engineering


In [13]:
# Intersection of the two frames: an inner join keeps only the names present in
# both staff_df and student_df (again matching on the index).
staff_df.merge(student_df, how='inner', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course Liasion,Engineering
James,Grader,Business


In [14]:
# Left join: keep every staff member (the left-hand frame) and attach the student
# details for those who are also students; the rest get an empty School value.
staff_df.merge(student_df, how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course Liasion,Engineering
James,Grader,Business


In [15]:
# Right join: every row of the right-hand frame is kept. staff_df is the right-hand
# frame here, so this lists all staff members together with their school when they
# are also students (Kelly has no school; Mike, who is only a student, is dropped).
# To list all students with their staff role instead, put student_df on the right.
student_df.merge(staff_df, how='right', left_index=True, right_index=True)

Unnamed: 0_level_0,School,Role
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,,Director of HR
Sally,Engineering,Course Liasion
James,Business,Grader


In [16]:
# merge() can join on any column shared by both frames, not only on the index.
# Move 'Name' out of the index and back into a regular column first.
# The guards keep this cell idempotent: calling reset_index() a second time on an
# already-flat frame would add a spurious 'index' column to both frames, and the
# merge below would then return extra 'index_x' / 'index_y' columns.
if staff_df.index.name == 'Name':
    staff_df = staff_df.reset_index()
if student_df.index.name == 'Name':
    student_df = student_df.reset_index()

# Right join on the 'Name' column; the column named in `on` must exist in both frames.
pd.merge(staff_df, student_df, how='right', on='Name')

Unnamed: 0,Name,Role,School
0,James,Grader,Business
1,Mike,,Law
2,Sally,Course Liasion,Engineering


In [18]:
# Rebuild both frames with a 'Location' column whose meaning differs between them:
# for staff it is the office address, for students it is the home address.
staff_df = pd.DataFrame({
    'Name': ['Kelly', 'Sally', 'James'],
    'Role': ['Director of HR', 'Course Liasion', 'Grader'],
    'Location': ['State Street', 'Washington Avenue', 'Washington Avenue'],
})
student_df = pd.DataFrame({
    'Name': ['James', 'Mike', 'Sally'],
    'School': ['Business', 'Law', 'Engineering'],
    'Location': ['1024 Billiard Avenue', 'Fraternity House #22', '512 Wilson Crescent'],
})

# Left join on 'Name'. 'Location' exists in both frames but is not a join key, so
# pandas keeps both copies and tells them apart with its default suffixes:
# Location_x is the left frame's column (staff), Location_y the right frame's (students).
staff_df.merge(student_df, how='left', on='Name')

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course Liasion,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


In [19]:
# `on` also accepts a list of columns, in which case a row matches only when
# every listed column agrees between the two frames.
staff_df = pd.DataFrame({
    'First Name': ['Kelly', 'Sally', 'James'],
    'Last Name': ['Desjardins', 'Brooks', 'Wilde'],
    'Role': ['Director of HR', 'Course liasion', 'Grader'],
})
student_df = pd.DataFrame({
    'First Name': ['James', 'Mike', 'Sally'],
    'Last Name': ['Hammond', 'Smith', 'Brooks'],
    'School': ['Business', 'Law', 'Engineering'],
})

# James Wilde (staff) and James Hammond (student) share a first name only, so the
# inner join drops them; Sally Brooks matches on both columns and is the only row kept.
name_keys = ['First Name', 'Last Name']
staff_df.merge(student_df, how='inner', on=name_keys)

Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


In [25]:
# Concatenation stacks DataFrames on top of each other, e.g. to combine files that
# hold one year of data each into a single frame. Load three consecutive years.
scorecard_paths = (
    'datasets/college_scorecards/MERGED2008_09_PP.csv',
    'datasets/college_scorecards/MERGED2009_10_PP.csv',
    'datasets/college_scorecards/MERGED2010_11_PP.csv',
)
# on_bad_lines='skip' silently drops rows the parser cannot read.
# NOTE(review): the recorded outputs show OPEID both as 100200 and as 00145954, which
# looks like mixed dtype inference; passing an explicit dtype for OPEID may be
# worthwhile -- confirm against the raw CSVs.
df_2008, df_2009, df_2010 = [
    pd.read_csv(csv_path, on_bad_lines='skip') for csv_path in scorecard_paths
]
df_2010.head(3)

  df_2008 = pd.read_csv('datasets/college_scorecards/MERGED2008_09_PP.csv', on_bad_lines = 'skip')
  df_2009 = pd.read_csv('datasets/college_scorecards/MERGED2009_10_PP.csv', on_bad_lines = 'skip')
  df_2010 = pd.read_csv('datasets/college_scorecards/MERGED2010_11_PP.csv', on_bad_lines = 'skip')


Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,


In [26]:
# Row count of each yearly frame, printed in chronological order.
for yearly_df in (df_2008, df_2009, df_2010):
    print(len(yearly_df))

6975
7149
7414


In [29]:
# Stack the three yearly frames vertically (row-wise), in chronological order.
# Each frame keeps its own row labels, so labels repeat in the combined frame.
frames = [df_2008, df_2009, df_2010]
new_df = pd.concat(frames, axis=0)

In [30]:
# Total number of rows in the concatenated frame.
print(new_df.shape[0])

21538


In [31]:
# Sanity check: concatenation must neither drop nor duplicate rows, so the combined
# length has to equal the sum of the three yearly lengths.
sum(map(len, (df_2008, df_2009, df_2010))) == len(new_df)

True

In [32]:
# `keys` adds an outer index level that labels each row with the frame it came from,
# so the result has a two-level index: (year, original row label).
year_labels = ['2008', '2009', '2010']
pd.concat(frames, keys=year_labels)

Unnamed: 0,Unnamed: 1,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
2008,0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
2008,1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2008,2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
2008,3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
2008,4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010,7409,45896402,00145954,1459,Strayer University-Charleston Campus,North Charleston,SC,29418,,,,...,,,,,,,,,,
2010,7410,45897301,00145990,1459,Strayer University-Irving,Irving,TX,75063,,,,...,,,,,,,,,,
2010,7411,45897302,00145992,1459,Strayer University-Katy,Houston,TX,77079,,,,...,,,,,,,,,,
2010,7412,45897303,00145994,1459,Strayer University-Northwest Houston,Houston,TX,77064,,,,...,,,,,,,,,,


In [None]:
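# concat() also accepts join='inner' or join='outer' (the default), which controls how the labels on the other axis (here, the columns) are combined - analogous to the inner and outer joins of the merge() function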