# Data Merging
#### Table of Contents
1. Merging
2. Joining
3. Concatenating

In [1]:
import pandas as pd

## Merging

In [2]:
# Demo frames for the merge examples.
# df1 and df2 carry the same columns and values but are indexed by different
# years; df3 shares df1's index and its 'HPI' column and adds two new columns.
df1 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Int_rate=[2, 3, 2, 2],
        US_GDP=[50, 55, 65, 55],
    ),
    index=[2001, 2002, 2003, 2004],
)

df2 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Int_rate=[2, 3, 2, 2],
        US_GDP=[50, 55, 65, 55],
    ),
    index=[2005, 2006, 2007, 2008],
)

df3 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Unemployment=[7, 8, 9, 6],
        Low_tier_HPI=[50, 52, 50, 53],
    ),
    index=[2001, 2002, 2003, 2004],
)

In [3]:
# Merge on the shared 'HPI' column only. The other overlapping columns come
# back twice with _x / _y suffixes, and the repeated HPI value 85 pairs every
# matching row of df1 with every matching row of df2.
print(df1.merge(df2, on='HPI'))

   HPI  Int_rate_x  US_GDP_x  Int_rate_y  US_GDP_y
0   80           2        50           2        50
1   85           3        55           3        55
2   85           3        55           2        55
3   85           2        55           3        55
4   85           2        55           2        55
5   88           2        65           2        65


In [4]:
# Merge on both 'HPI' and 'Int_rate': rows must agree on the pair of keys,
# so only 'US_GDP' remains duplicated (US_GDP_x / US_GDP_y).
print(df1.merge(df2, on=['HPI', 'Int_rate']))

   HPI  Int_rate  US_GDP_x  US_GDP_y
0   80         2        50        50
1   85         3        55        55
2   88         2        65        65
3   85         2        55        55


In [9]:
# Rebuild df1 and df3 with 'Year' as an ordinary column. The two frames only
# partly overlap: 2002 appears in df1 alone and 2005 in df3 alone.
df1 = pd.DataFrame(
    dict(
        Year=[2001, 2002, 2003, 2004],
        Int_rate=[2, 3, 2, 2],
        US_GDP=[50, 55, 65, 55],
    )
)

df3 = pd.DataFrame(
    dict(
        Year=[2001, 2003, 2004, 2005],
        Unemployment=[7, 8, 9, 6],
        Low_tier_HPI=[50, 52, 50, 53],
    )
)

# With no `how` argument the merge keeps only the years present in both frames.
merged = df1.merge(df3, on='Year')
print(merged)

   Year  Int_rate  US_GDP  Unemployment  Low_tier_HPI
0  2001         2      50             7            50
1  2003         2      65             8            52
2  2004         2      55             9            50


In [10]:
# Promote 'Year' from a regular column to the row index of the merged frame.
merged = merged.set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50             7            50
2003         2      65             8            52
2004         2      55             9            50


In [11]:
# how='left': keep every Year found in df1; years missing from df3 (2002)
# get NaN in df3's columns.
merged = df1.merge(df3, on='Year', how='left').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50           7.0          50.0
2002         3      55           NaN           NaN
2003         2      65           8.0          52.0
2004         2      55           9.0          50.0


In [12]:
# how='right': keep every Year found in df3; years missing from df1 (2005)
# get NaN in df1's columns.
merged = df1.merge(df3, on='Year', how='right').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001       2.0    50.0             7            50
2003       2.0    65.0             8            52
2004       2.0    55.0             9            50
2005       NaN     NaN             6            53


In [13]:
# how='inner': keep only the years that appear in both df1 and df3.
merged = df1.merge(df3, on='Year', how='inner').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50             7            50
2003         2      65             8            52
2004         2      55             9            50


In [14]:
# how='outer': keep the union of years from both frames, filling whichever
# side has no data for a year with NaN.
merged = df1.merge(df3, on='Year', how='outer').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001       2.0    50.0           7.0          50.0
2002       3.0    55.0           NaN           NaN
2003       2.0    65.0           8.0          52.0
2004       2.0    55.0           9.0          50.0
2005       NaN     NaN           6.0          53.0


## Joining

In [5]:
# Re-create the year-indexed frames for the join examples, so this section
# starts from frames that still contain the 'HPI' column.
df1 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Int_rate=[2, 3, 2, 2],
        US_GDP=[50, 55, 65, 55],
    ),
    index=[2001, 2002, 2003, 2004],
)

df2 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Int_rate=[2, 3, 2, 2],
        US_GDP=[50, 55, 65, 55],
    ),
    index=[2005, 2006, 2007, 2008],
)

df3 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Unemployment=[7, 8, 9, 6],
        Low_tier_HPI=[50, 52, 50, 53],
    ),
    index=[2001, 2002, 2003, 2004],
)

In [7]:
# Use 'HPI' as the row index of both frames so they share an index to join on.
df1 = df1.set_index('HPI')
df3 = df3.set_index('HPI')

In [8]:
# Index-on-index join (left join, the default spelled out). The index value 85
# occurs twice in each frame, so it yields every pairing of those rows.
joined = df1.join(df3, how='left')
print(joined)

     Int_rate  US_GDP  Unemployment  Low_tier_HPI
HPI                                              
80          2      50             7            50
85          3      55             8            52
85          3      55             6            53
85          2      55             8            52
85          2      55             6            53
88          2      65             9            50


In [None]:
# Year-keyed frames for the join example; 2002 exists only in df1 and 2005
# only in df3.
df1 = pd.DataFrame({'Year' : [2001, 2002, 2003, 2004],
                    'Int_rate' : [2, 3, 2, 2],
                    'US_GDP' : [50, 55, 65, 55]})

df3 = pd.DataFrame({'Year' : [2001, 2003, 2004, 2005],
                    'Unemployment' : [7, 8, 9, 6],
                    'Low_tier_HPI' : [50, 52, 50, 53]})

# Index both frames by 'Year'. These frames have no 'HPI' column, so indexing
# by 'HPI' raised KeyError and the cell could never run.
df1 = df1.set_index('Year')
df3 = df3.set_index('Year')

# join() aligns on the index and keeps df1's rows; years absent from df3
# (2002) come back as NaN in df3's columns.
joined = df1.join(df3)
print(joined)