<a href="https://colab.research.google.com/github/brunofbpaula/DataScience-UM-Coursera/blob/main/Pandas/DataFrame/MergingDataFrames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Merging DataFrames

In [9]:
import pandas as pd

# Staff table: one record per employee, indexed by first name.
staff_df = pd.DataFrame(
    [
        {'Name': 'Bruno', 'Role': 'Tech Lead'},
        {'Name': 'Barbara', 'Role': 'Mechanical Engineer'},
        {'Name': 'Lipe', 'Role': 'Software Developer'},
    ]
).set_index('Name')

# Students table: one record per student, indexed by first name as well,
# so the two frames can later be joined on their indices.
students_df = pd.DataFrame(
    [
        {'Name': 'Lipe', 'School': 'Computer Science'},
        {'Name': 'Miguel', 'School': 'Civil Engineering'},
        {'Name': 'Barbara', 'School': 'Mechanical Engineering'},
    ]
).set_index('Name')

# Plain-text preview of both frames, one after the other.
print(staff_df.head(), students_df.head(), sep='\n')

                        Role
Name                        
Bruno              Tech Lead
Barbara  Mechanical Engineer
Lipe      Software Developer
                         School
Name                           
Lipe           Computer Science
Miguel        Civil Engineering
Barbara  Mechanical Engineering


In [10]:
# Outer join = union of the two frames: every name found on either side
# appears in the result, and a column with no matching row is left
# missing (NaN). left_index/right_index tell merge() to use each frame's
# index, rather than a column, as the join key.
staff_df.merge(students_df, left_index=True, right_index=True, how='outer')

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barbara,Mechanical Engineer,Mechanical Engineering
Bruno,Tech Lead,
Lipe,Software Developer,Computer Science
Miguel,,Civil Engineering


In [11]:
# Inner join = intersection: only names present in BOTH frames are kept.
staff_df.merge(students_df, left_index=True, right_index=True, how='inner')

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barbara,Mechanical Engineer,Mechanical Engineering
Lipe,Software Developer,Computer Science


In [12]:
# Left join: keep every row of the left frame (staff) and attach the
# matching row from the right frame (students) where one exists.
staff_df.merge(students_df, left_index=True, right_index=True, how='left')

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bruno,Tech Lead,
Barbara,Mechanical Engineer,Mechanical Engineering
Lipe,Software Developer,Computer Science


In [13]:
# Right join: the mirror image of the left join -- keep every row of the
# right frame (students) and attach the matching staff row where one exists.
staff_df.merge(students_df, left_index=True, right_index=True, how='right')

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Lipe,Software Developer,Computer Science
Miguel,,Civil Engineering
Barbara,Mechanical Engineer,Mechanical Engineering


In [14]:
# Using the `on` parameter: join on a named column instead of the index.
#
# reset_index() moves the index back into an ordinary column. The flattened
# frames are bound to NEW names rather than overwriting staff_df /
# students_df: rebinding `staff_df = staff_df.reset_index()` makes the cell
# non-idempotent, because running it a second time inserts a spurious
# 'index' column into both frames and changes the merge result.
staff_flat = staff_df.reset_index()
students_flat = students_df.reset_index()

# Right join on the shared column: all students, with the staff role
# attached where the name matches.
pd.merge(staff_flat, students_flat, how='right', on='Name')

Unnamed: 0,Name,Role,School
0,Lipe,Software Developer,Computer Science
1,Miguel,,Civil Engineering
2,Barbara,Mechanical Engineer,Mechanical Engineering


In [15]:
# Merging frames that share a non-key column name.

# Staff table: here 'Location' holds a country.
staff_df = pd.DataFrame(
    [
        {'Name': 'Bruno', 'Location': 'Canada'},
        {'Name': 'Barbara', 'Location': 'USA'},
        {'Name': 'Lipe', 'Location': 'Brazil'},
    ]
).set_index('Name')

# Students table: here 'Location' holds a street address.
students_df = pd.DataFrame(
    [
        {'Name': 'Bruno', 'Location': "27 King's College Cir, Toronto"},
        {'Name': 'Lipe', 'Location': 'R. da Reitoria, R. Cidade Universitária, 374 - Butantã, São Paulo '},
        {'Name': 'Barbara', 'Location': '1320 S Dixie Hwy, Coral Gables'},
    ]
).set_index('Name')

# Both frames carry a 'Location' column with a different meaning in each.
# merge() keeps both and tells them apart by appending its default
# suffixes: '_x' for the left frame's column and '_y' for the right's.
staff_df.merge(students_df, on="Name", how="left")

Unnamed: 0_level_0,Location_x,Location_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bruno,Canada,"27 King's College Cir, Toronto"
Barbara,USA,"1320 S Dixie Hwy, Coral Gables"
Lipe,Brazil,"R. da Reitoria, R. Cidade Universitária, 374 -..."


## Multi-indexing and multiple columns

In [16]:
# Joining on several columns at once: pass a list of column names to `on`.
# A row matches only when ALL listed keys are equal on both sides.
#
# Both frames keep 'Name' and 'Last Name' as ordinary columns. Indexing
# only one of them by 'Name' would still merge, but only because pandas
# resolves `on` names against index levels as well as columns; keeping the
# two frames symmetric makes the join keys explicit.
staff_df = pd.DataFrame([{'Name': 'Bruno', 'Last Name': 'de Paula', 'Role': 'Tech Lead'},
                        {'Name': 'Barbara', 'Last Name': 'Geres', 'Role':  'Mechanical Engineer'},
                        {'Name': 'Lipe', 'Last Name': 'Cardoso', 'Role': 'Software Developer'}])

students_df = pd.DataFrame([{'Name': 'Lipe', 'Last Name': 'Cardoso', 'School': 'Computer Science'},
                           {'Name': 'Miguel', 'Last Name': 'de Paula', 'School': 'Civil Engineering'},
                           {'Name': 'Barbara', 'Last Name': 'Gordon', 'School': 'Chemistry'}])

# Only Lipe Cardoso matches on both keys: 'Barbara' appears on both sides
# with different last names, and 'de Paula' with different first names.
pd.merge(staff_df, students_df, how='inner', on=['Name', 'Last Name'])

Unnamed: 0,Name,Last Name,Role,School
0,Lipe,Cardoso,Software Developer,Computer Science
