<a href="https://colab.research.google.com/github/binhvd/Data-Management-2/blob/main/Demo2/Join-Excel-Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
@author: Ashish Chouhan
ETL with Pandas
"""

# Import Libraries required for Processing
import pandas as pd

# Workbook names used throughout the run of this process.
#
# Inputs
#   file_masters  : master workbook holding each person's basic information
#                   and the ID associated with that person
#   file_incoming : country-code workbook holding the country information
#                   associated with an ID
file_masters, file_incoming = 'masters.xlsx', 'country code.xlsx'

# Outputs
#   file_masters_dups : duplicate records present in the master workbook, if any
#   file_out          : complete person and country information for each ID
file_masters_dups, file_out = 'masters_dups.xlsx', 'output.xlsx'

# Read Master File to fetch Person Information

In [2]:
# Load the 'input' worksheet of the master workbook into the master DataFrame
df_masters = pd.read_excel(
    io=file_masters,
    sheet_name='input',
)
df_masters

Unnamed: 0,ID,Lname,Fname,Name
0,100001,Bogey,Hyberts,"Bogey, Hyberts"
1,100002,Mireielle,Lindauer,"Mireielle, Lindauer"
2,100003,Claiborn,Schacter,"Claiborn, Schacter"
3,100004,Hildagard,Eberhart,"Hildagard, Eberhart"
4,100005,Welsh,Westerman,"Welsh, Westerman"
5,100006,Wrennie,Platt,"Wrennie, Platt"
6,100007,Barris,Reynales,"Barris, Reynales"
7,100008,Wendye,Cicinelli,"Wendye, Cicinelli"
8,100009,Ethe,Slasor,"Ethe, Slasor"
9,100010,Valentine,Horsley,"Valentine, Horsley"


In [3]:
# Sort the master records in place by ID, in ascending order.
# A stable algorithm is requested explicitly: the default quicksort does not
# guarantee the relative order of rows that share an ID, and the later
# keep='first' de-duplication relies on that order to retain the record that
# appeared first in the source workbook.
df_masters.sort_values('ID', ascending=True, kind='mergesort', inplace=True)

# The master DataFrame is now arranged in ascending order of ID
df_masters

Unnamed: 0,ID,Lname,Fname,Name
0,100001,Bogey,Hyberts,"Bogey, Hyberts"
1,100002,Mireielle,Lindauer,"Mireielle, Lindauer"
10,100002,Mireielle,Lindauer,"Mireielle, Lindauer"
2,100003,Claiborn,Schacter,"Claiborn, Schacter"
3,100004,Hildagard,Eberhart,"Hildagard, Eberhart"
4,100005,Welsh,Westerman,"Welsh, Westerman"
5,100006,Wrennie,Platt,"Wrennie, Platt"
11,100006,Wrennie,Platt,"Wrennie, Platt"
6,100007,Barris,Reynales,"Barris, Reynales"
7,100008,Wendye,Cicinelli,"Wendye, Cicinelli"


In [4]:
# Collect the duplicate records into their own DataFrame, treating rows that
# share an ID as duplicates and skipping the first occurrence of each ID.
# Matching on ID mirrors the later drop_duplicates(subset='ID') call, so this
# DataFrame holds exactly the rows that will be removed from the master data
# (comparing whole rows would miss records that reuse an ID but differ in
# another column).
df_dups = df_masters[df_masters.duplicated(subset='ID', keep='first')]
df_dups

Unnamed: 0,ID,Lname,Fname,Name
10,100002,Mireielle,Lindauer,"Mireielle, Lindauer"
11,100006,Wrennie,Platt,"Wrennie, Platt"


In [5]:
# Export the duplicate records to a spreadsheet before they are removed from
# the master DataFrame.
# The context manager saves and closes the workbook when the block exits;
# ExcelWriter.save() was removed in pandas 2.0, so calling it fails there.
# sheet_name is passed by keyword because the positional form is deprecated.
with pd.ExcelWriter(file_masters_dups) as writer:
    df_dups.to_excel(writer, sheet_name='output')

In [6]:
# De-duplicate the master DataFrame in place: for every ID only the first
# record is retained and any later record with the same ID is discarded.
df_masters.drop_duplicates(
    subset=['ID'],
    keep='first',
    inplace=True,
)

# Display the master DataFrame, which now holds one record per ID
# (it was sorted by ascending ID in an earlier cell)
df_masters

Unnamed: 0,ID,Lname,Fname,Name
0,100001,Bogey,Hyberts,"Bogey, Hyberts"
1,100002,Mireielle,Lindauer,"Mireielle, Lindauer"
2,100003,Claiborn,Schacter,"Claiborn, Schacter"
3,100004,Hildagard,Eberhart,"Hildagard, Eberhart"
4,100005,Welsh,Westerman,"Welsh, Westerman"
5,100006,Wrennie,Platt,"Wrennie, Platt"
6,100007,Barris,Reynales,"Barris, Reynales"
7,100008,Wendye,Cicinelli,"Wendye, Cicinelli"
8,100009,Ethe,Slasor,"Ethe, Slasor"
9,100010,Valentine,Horsley,"Valentine, Horsley"


In [7]:
# Read Country_Code File to fetch Country Information
# Load the 'input' worksheet of the country-code workbook into the secondary
# DataFrame
df_incoming = pd.read_excel(
    io=file_incoming,
    sheet_name='input',
)
df_incoming

Unnamed: 0,ID,Country
0,100001,USA
1,100002,USA
2,100003,UK
3,100008,UK
4,100009,China
5,100010,USA


# Process to Merge (Join) Two Tables Held in Two Different DataFrames

In [8]:
# Right outer join on ID: the master DataFrame (df_masters) is the left table
# and the secondary DataFrame (df_incoming) is the right table. The result is
# stored in a new DataFrame.
df_join_right = df_masters.merge(df_incoming, how='right', on='ID')
df_join_right

Unnamed: 0,ID,Lname,Fname,Name,Country
0,100001,Bogey,Hyberts,"Bogey, Hyberts",USA
1,100002,Mireielle,Lindauer,"Mireielle, Lindauer",USA
2,100003,Claiborn,Schacter,"Claiborn, Schacter",UK
3,100008,Wendye,Cicinelli,"Wendye, Cicinelli",UK
4,100009,Ethe,Slasor,"Ethe, Slasor",China
5,100010,Valentine,Horsley,"Valentine, Horsley",USA


In [9]:
# Left outer join on ID: the master DataFrame (df_masters) is the left table
# and the secondary DataFrame (df_incoming) is the right table. The result is
# stored in a new DataFrame.
df_join_left = df_masters.merge(df_incoming, how='left', on='ID')
df_join_left

Unnamed: 0,ID,Lname,Fname,Name,Country
0,100001,Bogey,Hyberts,"Bogey, Hyberts",USA
1,100002,Mireielle,Lindauer,"Mireielle, Lindauer",USA
2,100003,Claiborn,Schacter,"Claiborn, Schacter",UK
3,100004,Hildagard,Eberhart,"Hildagard, Eberhart",
4,100005,Welsh,Westerman,"Welsh, Westerman",
5,100006,Wrennie,Platt,"Wrennie, Platt",
6,100007,Barris,Reynales,"Barris, Reynales",
7,100008,Wendye,Cicinelli,"Wendye, Cicinelli",UK
8,100009,Ethe,Slasor,"Ethe, Slasor",China
9,100010,Valentine,Horsley,"Valentine, Horsley",USA


In [10]:
# Remove the 'Name' column from the left-join result, modifying it in place
df_join_left.drop(columns=['Name'], inplace=True)
df_join_left

Unnamed: 0,ID,Lname,Fname,Country
0,100001,Bogey,Hyberts,USA
1,100002,Mireielle,Lindauer,USA
2,100003,Claiborn,Schacter,UK
3,100004,Hildagard,Eberhart,
4,100005,Welsh,Westerman,
5,100006,Wrennie,Platt,
6,100007,Barris,Reynales,
7,100008,Wendye,Cicinelli,UK
8,100009,Ethe,Slasor,China
9,100010,Valentine,Horsley,USA


In [11]:
# Export the result of the left outer join to an Excel file.
# The context manager saves and closes the workbook when the block exits;
# ExcelWriter.save() was removed in pandas 2.0, so calling it fails there.
# sheet_name is passed by keyword because the positional form is deprecated.
with pd.ExcelWriter(file_out) as writer:
    df_join_left.to_excel(writer, sheet_name='output')