# Data Merging
#### Table of Contents
1. Merging
2. Joining
3. Concatenating
4. Appending

## Basics
- ```join()``` - use when index matters
- ```merge()``` - use when the index doesn't matter
- ```concat()``` or ```append()``` - use to stack rows of one object onto the end of another

In [1]:
import pandas as pd

## Merging

In [2]:
# Sample frames for the merge examples.
# df1 and df2 hold the same columns and values but are indexed by different years;
# df3 shares only the HPI column with them and uses the same years as df1.
df1 = pd.DataFrame(
    dict(HPI=[80, 85, 88, 85], Int_rate=[2, 3, 2, 2], US_GDP=[50, 55, 65, 55]),
    index=[2001, 2002, 2003, 2004],
)

df2 = pd.DataFrame(
    dict(HPI=[80, 85, 88, 85], Int_rate=[2, 3, 2, 2], US_GDP=[50, 55, 65, 55]),
    index=[2005, 2006, 2007, 2008],
)

df3 = pd.DataFrame(
    dict(HPI=[80, 85, 88, 85], Unemployment=[7, 8, 9, 6], Low_tier_HPI=[50, 52, 50, 53]),
    index=[2001, 2002, 2003, 2004],
)

In [3]:
# Merge on HPI only: the value 85 occurs twice in each frame, so those rows are
# paired in every combination, and the remaining shared columns get _x/_y suffixes.
print(df1.merge(df2, on='HPI'))

   HPI  Int_rate_x  US_GDP_x  Int_rate_y  US_GDP_y
0   80           2        50           2        50
1   85           3        55           3        55
2   85           3        55           2        55
3   85           2        55           3        55
4   85           2        55           2        55
5   88           2        65           2        65


In [4]:
# Merge on the (HPI, Int_rate) pair: rows match only when both key columns agree.
print(df1.merge(df2, on=['HPI', 'Int_rate']))

   HPI  Int_rate  US_GDP_x  US_GDP_y
0   80         2        50        50
1   85         3        55        55
2   88         2        65        65
3   85         2        55        55


In [9]:
# Rebuild the frames with Year as an ordinary column so it can be used as the merge key.
df1 = pd.DataFrame(
    dict(Year=[2001, 2002, 2003, 2004], Int_rate=[2, 3, 2, 2], US_GDP=[50, 55, 65, 55])
)

df3 = pd.DataFrame(
    dict(Year=[2001, 2003, 2004, 2005], Unemployment=[7, 8, 9, 6], Low_tier_HPI=[50, 52, 50, 53])
)

# No `how` is given, so only the Years present in both frames are kept.
merged = df1.merge(df3, on='Year')
print(merged)

   Year  Int_rate  US_GDP  Unemployment  Low_tier_HPI
0  2001         2      50             7            50
1  2003         2      65             8            52
2  2004         2      55             9            50


In [10]:
# Promote Year from a column to the index.
# The result is bound to a new name instead of mutating `merged` in place, so
# this cell can be re-run safely: a second in-place set_index would raise
# KeyError because 'Year' would no longer be a column of `merged`.
merged_by_year = merged.set_index('Year')
print(merged_by_year)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50             7            50
2003         2      65             8            52
2004         2      55             9            50


In [11]:
# Left merge: every Year of df1 is kept; Years missing from df3 are filled with NaN.
merged = df1.merge(df3, on='Year', how='left').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50           7.0          50.0
2002         3      55           NaN           NaN
2003         2      65           8.0          52.0
2004         2      55           9.0          50.0


In [12]:
# Right merge: every Year of df3 is kept; Years missing from df1 are filled with NaN.
merged = df1.merge(df3, on='Year', how='right').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001       2.0    50.0             7            50
2003       2.0    65.0             8            52
2004       2.0    55.0             9            50
2005       NaN     NaN             6            53


In [13]:
# Inner merge (the default `how`): only Years found in both frames survive.
merged = df1.merge(df3, on='Year', how='inner').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50             7            50
2003         2      65             8            52
2004         2      55             9            50


In [14]:
# Outer merge: the union of Years from both frames; gaps on either side become NaN.
merged = df1.merge(df3, on='Year', how='outer').set_index('Year')
print(merged)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001       2.0    50.0           7.0          50.0
2002       3.0    55.0           NaN           NaN
2003       2.0    65.0           8.0          52.0
2004       2.0    55.0           9.0          50.0
2005       NaN     NaN           6.0          53.0


## Joining

In [5]:
# Fresh year-indexed frames for the join examples.
# df1 and df2 carry identical columns and values under different year labels;
# df3 has HPI plus two other columns and is indexed by the same years as df1.
df1 = pd.DataFrame(
    dict(HPI=[80, 85, 88, 85], Int_rate=[2, 3, 2, 2], US_GDP=[50, 55, 65, 55]),
    index=[2001, 2002, 2003, 2004],
)

df2 = pd.DataFrame(
    dict(HPI=[80, 85, 88, 85], Int_rate=[2, 3, 2, 2], US_GDP=[50, 55, 65, 55]),
    index=[2005, 2006, 2007, 2008],
)

df3 = pd.DataFrame(
    dict(HPI=[80, 85, 88, 85], Unemployment=[7, 8, 9, 6], Low_tier_HPI=[50, 52, 50, 53]),
    index=[2001, 2002, 2003, 2004],
)

In [7]:
# Move HPI from a column into the index of both frames so they can be joined on it.
df1 = df1.set_index('HPI')
df3 = df3.set_index('HPI')

In [8]:
# Index-on-index join; "left" is join()'s default `how`, spelled out here.
# HPI 85 appears twice in each frame, so each left 85-row pairs with each right 85-row.
joined = df1.join(df3, how="left")
print(joined)

     Int_rate  US_GDP  Unemployment  Low_tier_HPI
HPI                                              
80          2      50             7            50
85          3      55             8            52
85          3      55             6            53
85          2      55             8            52
85          2      55             6            53
88          2      65             9            50


In [2]:
import pandas as pd

# Build both frames and move Year into the index straight away.
df1 = pd.DataFrame(
    dict(Year=[2001, 2002, 2003, 2004], Int_rate=[2, 3, 2, 2], US_GDP=[50, 55, 65, 55])
).set_index('Year')

df3 = pd.DataFrame(
    dict(Year=[2001, 2003, 2004, 2005], Unemployment=[7, 8, 9, 6], Low_tier_HPI=[50, 52, 50, 53])
).set_index('Year')

# Default join keeps every Year of df1; Years missing from df3 come back as NaN.
joined = df1.join(df3)
print(joined)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50           7.0          50.0
2002         3      55           NaN           NaN
2003         2      65           8.0          52.0
2004         2      55           9.0          50.0


In [6]:
# Print the signature and docstring of DataFrame.join.
help(df1.join)

Help on method join in module pandas.core.frame:

join(other: 'DataFrame | Series', on: 'IndexLabel | None' = None, how: 'str' = 'left', lsuffix: 'str' = '', rsuffix: 'str' = '', sort: 'bool' = False) -> 'DataFrame' method of pandas.core.frame.DataFrame instance
    Join columns of another DataFrame.
    
    Join columns with `other` DataFrame either on index or on a key
    column. Efficiently join multiple DataFrame objects by index at once by
    passing a list.
    
    Parameters
    ----------
    other : DataFrame, Series, or list of DataFrame
        Index should be similar to one of the columns in this one. If a
        Series is passed, its name attribute must be set, and that will be
        used as the column name in the resulting joined DataFrame.
    on : str, list of str, or array-like, optional
        Column or index level name(s) in the caller to join on the index
        in `other`, otherwise joins index-on-index. If multiple
        values given, the `other` DataFr

In [7]:
# Right join: the Years of df3 drive the result; Years missing from df1 become NaN.
joined = df1.join(other=df3, how="right")
print(joined)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001       2.0    50.0             7            50
2003       2.0    65.0             8            52
2004       2.0    55.0             9            50
2005       NaN     NaN             6            53


In [8]:
# Inner join: keep only the Years that appear in both indexes.
joined = df1.join(other=df3, how="inner")
print(joined)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001         2      50             7            50
2003         2      65             8            52
2004         2      55             9            50


In [9]:
# Outer join: keep the union of Years from both indexes, with NaN where a side has no row.
joined = df1.join(other=df3, how="outer")
print(joined)

      Int_rate  US_GDP  Unemployment  Low_tier_HPI
Year                                              
2001       2.0    50.0           7.0          50.0
2002       3.0    55.0           NaN           NaN
2003       2.0    65.0           8.0          52.0
2004       2.0    55.0           9.0          50.0
2005       NaN     NaN           6.0          53.0


## Appending

In [11]:
import pandas as pd 
import numpy as np
# Height and weight Series for five students, both labelled s1 through s5.
heights_A = pd.Series(
    data=[176.2, 158.4, 167.6, 156.2, 161.4],
    index=["s1", "s2", "s3", "s4", "s5"],
)
weights_A = pd.Series(
    data=[85.1, 90.2, 76.8, 80.4, 78.9],
    index=["s1", "s2", "s3", "s4", "s5"],
)
data = {"Student_height": heights_A, "Student_weight": weights_A}

# Assemble the frame and attach the Gender column as the last column.
df_A = pd.DataFrame(data).assign(Gender=["M", "F", "M", "M", "F"])
print(df_A)

    Student_height  Student_weight Gender
s1           176.2            85.1      M
s2           158.4            90.2      F
s3           167.6            76.8      M
s4           156.2            80.4      M
s5           161.4            78.9      F


In [12]:
# One more student's record, keyed by the frame's column names and named "s6".
s = pd.Series(
    {"Student_height": 165.4, "Student_weight": 82.7, "Gender": "F"},
    name="s6",
)
print(s)

Student_height    165.4
Student_weight     82.7
Gender                F
Name: s6, dtype: object


In [13]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# new row is added with pd.concat instead.
# s.to_frame().T turns the Series into a one-row frame whose row label is the
# Series name ("s6"); infer_objects() converts the height/weight cells back to
# numeric dtypes, since the mixed-type Series stores everything as object.
df_AA = pd.concat([df_A, s.to_frame().T.infer_objects()])
print(df_AA)

    Student_height  Student_weight Gender
s1           176.2            85.1      M
s2           158.4            90.2      F
s3           167.6            76.8      M
s4           156.2            80.4      M
s5           161.4            78.9      F
s6           165.4            82.7      F


  df_AA = df_A.append(s)
