In [1]:
import pandas as pd

all_08 = pd.read_csv('./DataSet/all_alpha_08_fixed_datatypes.csv')
all_18 = pd.read_csv('./DataSet/all_alpha_18_fixed_datatypes.csv')

In [2]:
all_08.shape, all_18.shape

((987, 13), (832, 13))

## Merging the two Datasets together

The four types of merges in pandas. Below, "key" refers to common columns in both dataframes that we're joining on.

1. **Inner Join** - Use intersection of keys from both frames.
1. **Outer Join** - Use union of keys from both frames.
1. **Left Join** - Use keys from left frame only.
1. **Right Join** - Use keys from right frame only.

Below are diagrams to visualize each type.

In [6]:
# Renaming the `2008` dataset columns to have `_2008` suffix
all_08.rename(columns=lambda x: f'{x}_2008', inplace=True)

In [11]:
all_08.columns

Index(['model_2008', 'displ_2008', 'cyl_2008', 'trans_2008', 'drive_2008',
       'fuel_2008', 'veh_class_2008', 'air_pollution_score_2008',
       'city_mpg_2008', 'hwy_mpg_2008', 'cmb_mpg_2008',
       'greenhouse_gas_score_2008', 'smartway_2008'],
      dtype='object')

### Inner Join
  
When performing inner join, the data points has to present in both datasets to get merged

In [14]:
inner = pd.merge(all_08, all_18, how='inner', left_on='model_2008', right_on='model', indicator=True)

In [15]:
# Check inner Join shape
inner.shape

(922, 27)

In [18]:
# Check the `_merge` column, we see the result both
# b/c the data points found in both datasets
inner.head()

Unnamed: 0,model_2008,displ_2008,cyl_2008,trans_2008,drive_2008,fuel_2008,veh_class_2008,air_pollution_score_2008,city_mpg_2008,hwy_mpg_2008,...,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway,_merge
0,ACURA RDX,2.3,4,Auto-S5,4WD,Gasoline,SUV,7.0,17.0,22.0,...,2WD,Gasoline,small SUV,3.0,20.0,28.0,23.0,5,No,both
1,ACURA RDX,2.3,4,Auto-S5,4WD,Gasoline,SUV,7.0,17.0,22.0,...,4WD,Gasoline,small SUV,3.0,19.0,27.0,22.0,4,No,both
2,AUDI A3,2.0,4,Man-6,2WD,Gasoline,station wagon,7.0,21.0,29.0,...,4WD,Gasoline,small car,7.0,24.0,31.0,27.0,6,No,both
3,AUDI A3,2.0,4,Man-6,2WD,Gasoline,station wagon,7.0,21.0,29.0,...,2WD,Gasoline,small car,7.0,26.0,35.0,29.0,6,No,both
4,AUDI A3,2.0,4,Auto-S6,2WD,Gasoline,station wagon,7.0,22.0,29.0,...,4WD,Gasoline,small car,7.0,24.0,31.0,27.0,6,No,both


### Saving the result

In [20]:
inner.drop(columns='_merge').to_csv('./DataSet/Inner_Merged.csv', index=False)

### Outer Join
  
When performing outer join, all the datapoints found in both datasets will be included

In [21]:
outer = pd.merge(all_08, all_18, how='outer', left_on='model_2008', right_on='model', indicator=True)

In [22]:
# Check outer Join shape
outer.shape

(2263, 27)

In [23]:
# Check the `_merge` column, we see the result both
# b/c the data points found in both datasets
inner.head()

Unnamed: 0,model_2008,displ_2008,cyl_2008,trans_2008,drive_2008,fuel_2008,veh_class_2008,air_pollution_score_2008,city_mpg_2008,hwy_mpg_2008,...,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway,_merge
0,ACURA RDX,2.3,4,Auto-S5,4WD,Gasoline,SUV,7.0,17.0,22.0,...,2WD,Gasoline,small SUV,3.0,20.0,28.0,23.0,5,No,both
1,ACURA RDX,2.3,4,Auto-S5,4WD,Gasoline,SUV,7.0,17.0,22.0,...,4WD,Gasoline,small SUV,3.0,19.0,27.0,22.0,4,No,both
2,AUDI A3,2.0,4,Man-6,2WD,Gasoline,station wagon,7.0,21.0,29.0,...,4WD,Gasoline,small car,7.0,24.0,31.0,27.0,6,No,both
3,AUDI A3,2.0,4,Man-6,2WD,Gasoline,station wagon,7.0,21.0,29.0,...,2WD,Gasoline,small car,7.0,26.0,35.0,29.0,6,No,both
4,AUDI A3,2.0,4,Auto-S6,2WD,Gasoline,station wagon,7.0,22.0,29.0,...,4WD,Gasoline,small car,7.0,24.0,31.0,27.0,6,No,both
