## 4.6.2 Merging and Exporting - Products

### This script contains the following points:

#### 1. Import orders_products_combined.pkl
#### 2. Check the dimensions of the imported dataframe
#### 3. Merge the orders_products_combined df with df_prods (from the cleaned products_checked file)
#### 4. Use merge flag frequency to check whether there was a full match between the two dataframes
#### 5. Export the merged file in pickle format

### Importing libraries

In [2]:
#import libraries
import pandas as pd
import numpy as np
import os

### 01. & 02. Importing data & Checking dimensions for each df

In [11]:
#root folder of the project; every data path below is built from it
path = r'/Users/AngieUS/Desktop/Instacart Project'

#load the combined orders/products dataframe (orders_products_combined.pkl)
df_ords_prods_comb = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

#preview the first rows of df_ords_prods_comb
df_ords_prods_comb.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2398795,1,2,3,7,15.0,196,1,1,both
1,2398795,1,2,3,7,15.0,10258,2,0,both
2,2398795,1,2,3,7,15.0,12427,3,1,both
3,2398795,1,2,3,7,15.0,13176,4,0,both
4,2398795,1,2,3,7,15.0,26088,5,1,both


In [7]:
#load the cleaned products data (products_checked.csv) into df_prods
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index_col=False,
)

#drop the 'Unnamed: 0' column (a saved copy of the index, not product data)
df_prods = df_prods.drop(columns=['Unnamed: 0'])

#preview the first rows of df_prods
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [19]:
#display dimensions of df_ords_prods_comb as (rows, columns)
#NOTE(review): the column count depends on whether the _merge column has already been dropped
#(section 03); the recorded output appears to come from a run made after that drop - confirm
df_ords_prods_comb.shape

(30356421, 9)

In [20]:
#display dimensions of df_prods as (rows, columns)
df_prods.shape

(49672, 5)

### 03. Merge df_ords_prods_comb and df_prods on the product_id key - drop the previous indicator column first

In [15]:
#remove the previous indicator column (_merge) so the next merge can add its own indicator;
#errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError
#because the column is already gone
df_ords_prods_comb = df_ords_prods_comb.drop(columns=['_merge'], errors='ignore')

In [17]:
#inner join on the shared 'product_id' key: rows whose product_id is missing from either
#dataframe are dropped; indicator=True adds a _merge column recording each row's source
df_ords_prods_merge = pd.merge(
    df_ords_prods_comb,
    df_prods,
    how='inner',
    on='product_id',
    indicator=True,
)

#preview the first rows of the merged dataframe
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,117,19,3.0,both
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,4.4,both
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,10.3,both
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both


In [18]:
#display dimensions of df_ords_prods_merge as (rows, columns);
#compare the row count with df_ords_prods_comb to see how many rows the merge dropped
df_ords_prods_merge.shape

(30328763, 14)

### 04. Check merge flag frequency

In [21]:
#an inner join keeps only rows whose product_id exists in both dataframes, so the _merge
#flag shown below can only ever be 'both' and cannot reveal a mismatch on its own;
#count the unmatched keys on each side directly to see what the join dropped
n_orders_unmatched = (~df_ords_prods_comb['product_id'].isin(df_prods['product_id'])).sum()
n_prods_unmatched = (~df_prods['product_id'].isin(df_ords_prods_comb['product_id'])).sum()
print(f'order rows without a match in df_prods: {n_orders_unmatched}')
print(f'products without a match in df_ords_prods_comb: {n_prods_unmatched}')

#display merge flag frequency for df_ords_prods_merge
df_ords_prods_merge['_merge'].value_counts()

_merge
both          30328763
left_only            0
right_only           0
Name: count, dtype: int64

### 05. Export data

In [22]:
#save the merged dataframe as ords_prods_merge.pkl in the Prepared Data folder
df_ords_prods_merge.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl')
)