## Combining DataFrames

## Part One - Using Concatenation

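NOTE: Concatenation only works cleanly when both DataFrames share the same structure: the same index (row labels) when joining side by side along columns, or the same column names when stacking rows.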

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Two small dicts of labelled placeholder values; each key becomes a column
# and each value is named "<column><row number>".
data_one = {col: [f'{col}{row}' for row in range(4)] for col in 'AB'}
data_two = {col: [f'{col}{row}' for row in range(4)] for col in 'CD'}

In [3]:
# Turn each dict into a DataFrame (default 0..3 integer index).
one, two = map(pd.DataFrame, (data_one, data_two))

In [4]:
# Show the first frame (columns A and B).
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [5]:
# Show the second frame (columns C and D).
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [6]:
# Join the frames side by side: rows are matched on the index, and the
# columns of `two` are appended to the right of those of `one`.
pd.concat([one, two], axis='columns')

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [7]:
# NOTE: Stacking these frames row-wise instead aligns them on column labels.
# Because `one` has A/B and `two` has C/D, every row gets NaN in the columns
# that belong to the other frame.
pd.concat([one, two], axis='index')

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


In [8]:
# To avoid the NaN cells above, give `two` the same column labels as `one`
# so that a row-wise concat lines the columns up.
# NOTE(review): this assignment modifies `two` in place; any earlier cell that
# uses `two` will see columns A/B instead of C/D if it is re-run afterwards.

two.columns = one.columns

In [9]:
# Show `two` again: same values, but the columns are now labelled A and B.
two

Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [10]:
# With matching column labels, row-wise concatenation no longer produces NaN.
# The original index of each frame is kept, so labels 0..3 appear twice.
pd.concat([one, two], axis='index')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [11]:
# To avoid repeated index values, stack the two frames and let pandas assign
# a fresh 0..n-1 index (ignore_index=True) instead of overwriting the index
# by hand after the concat.
mydf = pd.concat([one, two], axis=0, ignore_index=True)

mydf

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,C0,D0
5,C1,D1
6,C2,D2
7,C3,D3


In [12]:
# Creating another example to show concatenation based on rows and columns

In [34]:


# Row labels shared by both frames.
countries = ['USA', 'CANADA', 'MEXICO']

# Column data: one frame with year/pop, one with GDP/Perct (with a missing value).
data1 = dict(year=[1776, 1867, 1821], pop=[328, 38, 126])
data2 = dict(GDP=[20.5, 1.7, 1.22], Perct=['75%', np.nan, '25%'])

# Build both frames on the same country index so they can be joined column-wise.
df1, df2 = (pd.DataFrame(columns, index=countries) for columns in (data1, data2))

In [35]:
# Show the year/pop frame, indexed by country.
df1

Unnamed: 0,year,pop
USA,1776,328
CANADA,1867,38
MEXICO,1821,126


In [36]:
# Show the GDP/Perct frame, indexed by country (CANADA's Perct is missing).
df2

Unnamed: 0,GDP,Perct
USA,20.5,75%
CANADA,1.7,
MEXICO,1.22,25%


In [38]:
# First, column-wise: both frames share the same country index, so their
# columns are placed side by side with one row per country.
pd.concat([df1, df2], axis='columns')

Unnamed: 0,year,pop,GDP,Perct
USA,1776,328,20.5,75%
CANADA,1867,38,1.7,
MEXICO,1821,126,1.22,25%


In [39]:
# Second, row-wise: two frames with identical columns but different countries.
# NOTE(review): MEXICO's year/Pop here (1895 / 29) differ from the earlier
# example (1821 / 126) — treat these as illustrative values; confirm if real
# figures matter.

# Row labels for each frame.
countries1 = ['USA', 'CANADA']
countries2 = ['MEXICO', 'BRAZIL']

# Column data; both dicts use the same keys so the frames stack cleanly.
data1 = dict(year=[1776, 1867], Pop=[328, 38], GDP=[20.5, 1.7])
data2 = dict(year=[1895, 1821], Pop=[29, 126], GDP=[1.22, 5.1])

df3 = pd.DataFrame(data=data1, index=countries1)
df4 = pd.DataFrame(data=data2, index=countries2)

In [40]:
# Show the first frame to stack (USA, CANADA).
df3

Unnamed: 0,year,Pop,GDP
USA,1776,328,20.5
CANADA,1867,38,1.7


In [41]:
# Show the second frame to stack (MEXICO, BRAZIL).
df4

Unnamed: 0,year,Pop,GDP
MEXICO,1895,29,1.22
BRAZIL,1821,126,5.1


In [42]:
# Stack the two frames: the columns match, so the rows of df4 are simply
# appended below those of df3 and the country labels are kept as the index.
pd.concat([df3, df4], axis='index')

Unnamed: 0,year,Pop,GDP
USA,1776,328,20.5
CANADA,1867,38,1.7
MEXICO,1895,29,1.22
BRAZIL,1821,126,5.1


## Part Two - Using Inner Merge

NOTE: When the DataFrames do not share the same structure (row order or columns), we combine them with `merge` instead. There are three kinds of merge:
1. Inner Merge
2. Outer Merge
3. Left or Right Merge

1. Inner Merge: keeps only the rows whose key value appears in both DataFrames.

STEPS:
1. Decide which column to merge on. To do this:
    1. Look for a column that uniquely identifies each row.
    2. Check that a column with the same name is present in both DataFrames.
2. Decide how to merge by passing `how='inner'`, `'outer'`, `'left'` or `'right'`.

In [43]:
# Example data for the inner merge.

# Both tables have a `name` column, but their id columns and the set/order
# of names differ.
data1 = dict(reg_id=[1, 2, 3, 4], name=['Andrew', 'Bob', 'Charlie', 'David'])
data2 = dict(log_id=[1, 2, 3, 4], name=['Xavier', 'Andrew', 'Yolando', 'Bob'])

registrations, logins = pd.DataFrame(data1), pd.DataFrame(data2)

In [44]:
# Show the registrations table (reg_id, name).
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bob
2,3,Charlie
3,4,David


In [45]:
# Show the logins table (log_id, name).
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolando
3,4,Bob


In [47]:
# Print the built-in documentation for pd.merge (signature and parameters).
help(pd.merge)

Help on function merge in module pandas.core.reshape.merge:

merge(left: 'DataFrame | Series', right: 'DataFrame | Series', how: 'str' = 'inner', on: 'IndexLabel | None' = None, left_on: 'IndexLabel | None' = None, right_on: 'IndexLabel | None' = None, left_index: 'bool' = False, right_index: 'bool' = False, sort: 'bool' = False, suffixes: 'Suffixes' = ('_x', '_y'), copy: 'bool' = True, indicator: 'bool' = False, validate: 'str | None' = None) -> 'DataFrame'
    Merge DataFrame or named Series objects with a database-style join.
    
    A named Series object is treated as a DataFrame with a single named column.
    
    The join is done on columns or indexes. If joining columns on
    columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed on.
    When performing a cross merge, no column specifications to merge on are
    allowed.
    
    
        If both key columns contain rows where t

In [51]:
# `name` is chosen as the key because its values are unique within each
# table and the column is present in both DataFrames.
# NOTE: the left frame's columns come first, so column order depends on the
# order in which the frames are passed (it may also differ across pandas
# versions).
registrations.merge(logins, how='inner', on='name')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bob,4


In [52]:
# Same inner merge with the frames swapped: the matched rows are the same,
# but the columns from `logins` now come first.
logins.merge(registrations, how='inner', on='name')

Unnamed: 0,log_id,name,reg_id
0,2,Andrew,1
1,4,Bob,2
