# Data Frame Merge

In [1]:
import numpy as np
import pandas as pd

## Append
Combine Series and DataFrames row-wise (stacking one on top of the other).

In [2]:
# Load the California housing training split from CSV.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that file exists.
train_data = pd.read_csv("C:/Users/LENOVO/Python/california_housing_train.csv")

In [3]:
# Load the California housing test split from CSV.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that file exists.
test_data = pd.read_csv("C:/Users/LENOVO/Python/california_housing_test.csv")

In [4]:
# Dimensions of the training frame as (n_rows, n_columns).
train_data.shape

(17000, 9)

In [5]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [6]:
# Dimensions of the test frame as (n_rows, n_columns).
test_data.shape

(3000, 9)

In [7]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [8]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames row-wise. Like the old append, this keeps each frame's original
# row labels, so labels from train_data and test_data may repeat in the result.
full_data = pd.concat([train_data, test_data])

In [9]:
# Stack train and test row-wise and renumber the rows 0..n-1 so labels are unique.
full_data = pd.concat(
    [train_data, test_data],
    axis="index",
    ignore_index=True,
)

In [10]:
# Row count should equal the train and test row counts added together; column count is unchanged.
full_data.shape

(20000, 9)

In [11]:
# Preview the first rows of the combined frame (these come from train_data, which was listed first).
full_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


## Concatenate

In [12]:
# Two small toy records: both carry a "country" field, but the second
# field differs ("year" in the first, "city" in the second).
data1 = dict(country=["JPY", "CHN", "IND"], year=[2019, 2019, 2020])

data2 = dict(country=["JPY", "CHN", "KOR"], city=["Tokyo", "Beijing", "Seoul"])


In [13]:
# Turn both dicts into DataFrames (one column per dict key), rebinding the same names.
data1, data2 = pd.DataFrame(data1), pd.DataFrame(data2)

In [14]:
# Show both frames as plain text, one after the other.
print(data1, data2, sep="\n")

  country  year
0     JPY  2019
1     CHN  2019
2     IND  2020
  country     city
0     JPY    Tokyo
1     CHN  Beijing
2     KOR    Seoul


In [15]:
# Side-by-side concatenation: rows are aligned on the index and the columns of
# both frames are kept, so "country" appears twice in the result.
df_column = pd.concat(objs=[data1, data2], axis="columns")
df_column

Unnamed: 0,country,year,country.1,city
0,JPY,2019,JPY,Tokyo
1,CHN,2019,CHN,Beijing
2,IND,2020,KOR,Seoul


In [16]:
# Row-wise concatenation: columns are aligned by name, and a column missing
# from one frame is filled with NaN for that frame's rows. Original row labels
# are kept, so 0..2 appears twice.
df_row = pd.concat(objs=[data1, data2], axis="index")
df_row

Unnamed: 0,country,year,city
0,JPY,2019.0,
1,CHN,2019.0,
2,IND,2020.0,
0,JPY,,Tokyo
1,CHN,,Beijing
2,KOR,,Seoul


## Join

In [17]:
# Load the population table (one row per country and year) and preview it.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that file exists.
population = pd.read_csv("C:/Users/LENOVO/Python/worldbank_pope.csv")
population.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Afghanistan,AFG,1960,9035043.0
1,Afghanistan,AFG,1961,9214083.0
2,Afghanistan,AFG,1962,9404406.0
3,Afghanistan,AFG,1963,9604487.0
4,Afghanistan,AFG,1964,9814318.0


In [18]:
# Load the GDP table (same column layout as the population table) and preview it.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that file exists.
gdp = pd.read_csv("C:/Users/LENOVO/Python/worldbank_gdpe.csv")
gdp.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Afghanistan,AFG,2000,3521418000.0
1,Afghanistan,AFG,2001,2813572000.0
2,Afghanistan,AFG,2002,3825701000.0
3,Afghanistan,AFG,2003,4520947000.0
4,Afghanistan,AFG,2004,5224897000.0


### Outer join

In [19]:
# Outer join: keep every (country, year) key found in either table. The two
# "Value" columns collide, so pandas suffixes them: Value_x (population) and
# Value_y (gdp).
world_bank_outer = pd.merge(
    population,
    gdp,
    how="outer",
    on=["Country Code", "Country Name", "Year"],
)

In [20]:
# Display the outer-join result; rows without a GDP match show NaN in Value_y.
world_bank_outer

Unnamed: 0,Country Name,Country Code,Year,Value_x,Value_y
0,Aruba,ABW,1960,54922.0,
1,Aruba,ABW,1961,55578.0,
2,Aruba,ABW,1962,56320.0,
3,Aruba,ABW,1963,57002.0,
4,Aruba,ABW,1964,57619.0,
...,...,...,...,...,...
16925,Zimbabwe,ZWE,2019,15271368.0,2.183223e+10
16926,Zimbabwe,ZWE,2020,15526888.0,2.150970e+10
16927,Zimbabwe,ZWE,2021,15797210.0,2.837124e+10
16928,Zimbabwe,ZWE,2022,16069056.0,2.736663e+10


### Inner join

In [21]:
# Inner join: keep only the (country, year) keys present in both tables.
world_bank_inner = pd.merge(
    population,
    gdp,
    how="inner",
    on=["Country Code", "Country Name", "Year"],
)
world_bank_inner

Unnamed: 0,Country Name,Country Code,Year,Value_x,Value_y
0,Afghanistan,AFG,2000,20130327.0,3.521418e+09
1,Afghanistan,AFG,2001,20284307.0,2.813572e+09
2,Afghanistan,AFG,2002,21378117.0,3.825701e+09
3,Afghanistan,AFG,2003,22733049.0,4.520947e+09
4,Afghanistan,AFG,2004,23560654.0,5.224897e+09
...,...,...,...,...,...
13974,Zimbabwe,ZWE,2019,15271368.0,2.183223e+10
13975,Zimbabwe,ZWE,2020,15526888.0,2.150970e+10
13976,Zimbabwe,ZWE,2021,15797210.0,2.837124e+10
13977,Zimbabwe,ZWE,2022,16069056.0,2.736663e+10


### Left join

In [22]:
# Left join: keep every row of `population`; GDP is NaN where no match exists.
world_bank_left = pd.merge(
    population,
    gdp,
    how="left",
    on=["Country Code", "Country Name", "Year"],
)
world_bank_left

Unnamed: 0,Country Name,Country Code,Year,Value_x,Value_y
0,Afghanistan,AFG,1960,9035043.0,
1,Afghanistan,AFG,1961,9214083.0,
2,Afghanistan,AFG,1962,9404406.0,
3,Afghanistan,AFG,1963,9604487.0,
4,Afghanistan,AFG,1964,9814318.0,
...,...,...,...,...,...
16925,Zimbabwe,ZWE,2019,15271368.0,2.183223e+10
16926,Zimbabwe,ZWE,2020,15526888.0,2.150970e+10
16927,Zimbabwe,ZWE,2021,15797210.0,2.837124e+10
16928,Zimbabwe,ZWE,2022,16069056.0,2.736663e+10


### Right join

In [23]:
# Right join: keep every row of `gdp`, attaching population where a match exists.
world_bank_right = pd.merge(
    population,
    gdp,
    how="right",
    on=["Country Code", "Country Name", "Year"],
)
world_bank_right

Unnamed: 0,Country Name,Country Code,Year,Value_x,Value_y
0,Afghanistan,AFG,2000,20130327.0,3.521418e+09
1,Afghanistan,AFG,2001,20284307.0,2.813572e+09
2,Afghanistan,AFG,2002,21378117.0,3.825701e+09
3,Afghanistan,AFG,2003,22733049.0,4.520947e+09
4,Afghanistan,AFG,2004,23560654.0,5.224897e+09
...,...,...,...,...,...
13974,Zimbabwe,ZWE,2019,15271368.0,2.183223e+10
13975,Zimbabwe,ZWE,2020,15526888.0,2.150970e+10
13976,Zimbabwe,ZWE,2021,15797210.0,2.837124e+10
13977,Zimbabwe,ZWE,2022,16069056.0,2.736663e+10


### Merge on multiple keys

In [24]:
# Merge on a subset of the shared columns. Only "Country Code" and "Year" are
# join keys here (the original list named "Country Code" twice by mistake), so
# the other shared columns -- "Country Name" and "Value" -- are carried over
# from both sides with the default _x / _y suffixes.
world_bank_multiple = population.merge(gdp,
                                    on = ["Country Code", "Year"],
                                    how = "inner")  # "inner" is merge's default, spelled out for clarity
world_bank_multiple

Unnamed: 0,Country Name_x,Country Code,Year,Value_x,Country Name_y,Value_y
0,Afghanistan,AFG,2000,20130327.0,Afghanistan,3.521418e+09
1,Afghanistan,AFG,2001,20284307.0,Afghanistan,2.813572e+09
2,Afghanistan,AFG,2002,21378117.0,Afghanistan,3.825701e+09
3,Afghanistan,AFG,2003,22733049.0,Afghanistan,4.520947e+09
4,Afghanistan,AFG,2004,23560654.0,Afghanistan,5.224897e+09
...,...,...,...,...,...,...
13974,Zimbabwe,ZWE,2019,15271368.0,Zimbabwe,2.183223e+10
13975,Zimbabwe,ZWE,2020,15526888.0,Zimbabwe,2.150970e+10
13976,Zimbabwe,ZWE,2021,15797210.0,Zimbabwe,2.837124e+10
13977,Zimbabwe,ZWE,2022,16069056.0,Zimbabwe,2.736663e+10


In [25]:
# Display `population` again; the merge cells above assign to new names and do not rebind it.
population

Unnamed: 0,Country Name,Country Code,Year,Value
0,Afghanistan,AFG,1960,9035043.0
1,Afghanistan,AFG,1961,9214083.0
2,Afghanistan,AFG,1962,9404406.0
3,Afghanistan,AFG,1963,9604487.0
4,Afghanistan,AFG,1964,9814318.0
...,...,...,...,...
16925,Zimbabwe,ZWE,2019,15271368.0
16926,Zimbabwe,ZWE,2020,15526888.0
16927,Zimbabwe,ZWE,2021,15797210.0
16928,Zimbabwe,ZWE,2022,16069056.0
