## 4.6 Merging and exporting data - Part 2

### This script contains the following points:

#### 1. Merge orders_products_combined and df_prods_checked
#### 2. Export the merged dataframe

### The following dataframes were manipulated/created:

#### a) 1 column deleted from 'df_prods' (extra index). Exported as 'prods_checked.csv' (overwritten previous df_prods_checked)
#### b) new dataset created: 'orders_products_merged.pkl' (inner join of 'orders_products_combined' and 'df_prods_checked')

In [18]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [19]:
# Import dataframes
# Project root folder; the os.path.join calls below build every data file location from it

path = r'C:\Users\bruna\Career Foundry\08-2023 Instacart Basket Analysis'

In [20]:
# Load the combined orders/products dataframe from the 'Prepared Data' folder
df_ords_prods_combined = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

In [21]:
# (rows, columns) of the combined orders/products dataframe, for comparison after the merge
df_ords_prods_combined.shape

(32434489, 11)

In [22]:
# Load the checked products table; index_col=False tells pandas not to use
# the first CSV column as the index
df_prods_checked = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'prods_checked.csv'),
    index_col=False,
)

In [23]:
# (rows, columns) of the products dataframe
df_prods_checked.shape

(49672, 6)

In [24]:
# Visualising dataframes: last rows of the combined orders/products dataframe
df_ords_prods_combined.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,time_of_order,days_since_prior_order,first_order_flag,product_id,add_to_cart_order,reordered,_merge
32434484,2977660,206209,13,1,12,7.0,False,14197,5,1,both
32434485,2977660,206209,13,1,12,7.0,False,38730,6,0,both
32434486,2977660,206209,13,1,12,7.0,False,31477,7,0,both
32434487,2977660,206209,13,1,12,7.0,False,6567,8,0,both
32434488,2977660,206209,13,1,12,7.0,False,22920,9,0,both


In [25]:
# Last rows of the products dataframe (shows the leftover 'Unnamed: 0' column)
df_prods_checked.tail()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49667,49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49690,49686,Artisan Baguette,112,3,7.8
49670,49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49671,49692,49688,Fresh Foaming Cleanser,73,11,13.5


In [26]:

# Removing the leftover 'Unnamed: 0' column from df_prods_checked

df_prods_checked = df_prods_checked.drop('Unnamed: 0', axis=1)

In [27]:
# Confirm the 'Unnamed: 0' column is gone
df_prods_checked.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49667,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49686,Artisan Baguette,112,3,7.8
49670,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49671,49688,Fresh Foaming Cleanser,73,11,13.5



## 1. Merge orders_products_combined and df_prods_checked

In [28]:
# Double-check merge rates using an outer join.
# The indicator column is named 'exists' because df_ords_prods_combined
# already carries a '_merge' column from an earlier merge.
df_ords_prods_merged = pd.merge(
    df_ords_prods_combined,
    df_prods_checked,
    how='outer',
    on='product_id',
    indicator='exists',
)

In [29]:
# Last rows of the outer-joined dataframe
df_ords_prods_merged.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,time_of_order,days_since_prior_order,first_order_flag,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,exists
32435065,,,,,,,,37703,,,,Ultra Sun Blossom Liquid 90 loads Fabric Enhan...,75.0,17.0,14.3,right_only
32435066,,,,,,,,43725,,,,Sweetart Jelly Beans,100.0,21.0,8.1,right_only
32435067,,,,,,,,45971,,,,12 Inch Taper Candle White,101.0,17.0,9.8,right_only
32435068,,,,,,,,46625,,,,Single Barrel Kentucky Straight Bourbon Whiskey,31.0,7.0,1.7,right_only
32435069,,,,,,,,49540,,,,Pure Squeezed Lemonade,31.0,7.0,9.1,right_only


In [30]:
# Count rows per merge outcome: both / left_only / right_only
df_ords_prods_merged['exists'].value_counts()

both          32404859
left_only        30200
right_only          11
Name: exists, dtype: int64

In [31]:
# (rows, columns) of the outer-joined dataframe
df_ords_prods_merged.shape

(32435070, 16)

In [32]:
# Visualising left only: order rows whose product_id has no match in df_prods_checked.
# The indicator column holds exact labels, so compare against 'left_only'
# directly instead of substring-matching with .str.contains('left').
print(df_ords_prods_merged[df_ords_prods_merged['exists'] == 'left_only'])


           order_id   user_id  order_number  orders_day_of_week  \
12039117     7099.0      27.0          63.0                 3.0   
12039118  1837192.0      27.0          80.0                 2.0   
12039119  2915432.0     298.0           1.0                 4.0   
12039120   613874.0     298.0           3.0                 1.0   
12039121   690386.0     479.0           1.0                 3.0   
...             ...       ...           ...                 ...   
32413293  1514902.0  179816.0           4.0                 5.0   
32413294  3287190.0  179816.0           5.0                 6.0   
32413295  2943026.0  179816.0           8.0                 0.0   
32413296  2108832.0  179816.0          15.0                 2.0   
32413297  1077336.0  202849.0          52.0                 1.0   

          time_of_order  days_since_prior_order first_order_flag  product_id  \
12039117           10.0                     1.0            False        6799   
12039118            8.0            

In [33]:
# Checking for duplicates in 'product_id'
# keep=False flags every occurrence of a repeated product_id, not only the later ones

duplicates_all_occurences = df_prods_checked['product_id'].duplicated(keep=False)

In [34]:
# Show every product row that shares its product_id with another row
print(df_prods_checked[duplicates_all_occurences])

       product_id                                       product_name  \
6784         6800                          Revive Zero Vitamin Water   
6785         6800                 Sprouted Quinoa Flakes Baby Cereal   
26504       26520  Clinical Advanced Solid Ultimate Fresh Anti-Pe...   
26505       26520       Cheese Shredded Sharp Cheddar Reduced Fat 2%   

       aisle_id  department_id  prices  
6784         64              7     6.4  
6785         92             18    14.0  
26504        80             11    10.6  
26505        21             16     2.9  


In [35]:
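# Keeping these duplicated product_ids for now: I don't have access to the correct product_id and only a very small % of rows is affected.
# NOTE: every order row for product_id 6800 or 26520 matches two product rows, so the merge duplicates it:
# 'both' + 'left_only' (32,404,859 + 30,200 = 32,435,059) exceeds the 32,434,489 input rows by 570.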

In [36]:

# Doing an inner join instead, so only rows whose product_id is present in
# both dataframes are kept
df_ords_prods_merged = df_ords_prods_combined.merge(
    df_prods_checked,
    how='inner',
    on='product_id',
    indicator='exists',
)

In [37]:
# Confirm that only 'both' rows remain after the inner join
df_ords_prods_merged['exists'].value_counts()

both          32404859
left_only            0
right_only           0
Name: exists, dtype: int64

In [38]:
# (rows, columns) of the inner-joined dataframe
df_ords_prods_merged.shape

(32404859, 16)

In [39]:
# Last rows of the inner-joined dataframe
df_ords_prods_merged.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,time_of_order,days_since_prior_order,first_order_flag,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,exists
32404854,1320836,202557,17,2,15,1.0,False,43553,2,1,both,Orange Energy Shots,64,7,3.7,both
32404855,31526,202557,18,5,11,3.0,False,43553,2,1,both,Orange Energy Shots,64,7,3.7,both
32404856,758936,203436,1,2,7,,True,42338,4,0,both,"Zucchini Chips, Pesto",50,19,6.9,both
32404857,2745165,203436,2,3,5,15.0,False,42338,16,1,both,"Zucchini Chips, Pesto",50,19,6.9,both
32404858,3093936,205420,1,4,14,,True,28818,8,0,both,Hot Oatmeal Multigrain Raisin,130,14,10.3,both


## 2. Exporting Dataframes

In [40]:
# Exporting the merged dataframe as 'orders_products_merged.pkl' in 'Prepared Data'

df_ords_prods_merged.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
)