# Combining & Exporting Data

## This script contains the following points:
1. Loading csv files to practice on
2. Combining dataframes 
3. Confirming results
4. Exporting results in suitable format

# 4.6 Importing libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Creating path to root folder
path = r"C:\Users\cavba\Documents\Instacart Basket Analysis"

In [3]:
# importing orders.csv
df_ords=pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.csv'), index_col = False)

In [4]:
# Check the output
df_ords.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,order_id,user_id,order_number,order_days_of_week,order_hour_of_day,days_since_prior_order,ordered_today
0,0,0,2539329,1,1,2,8,,True
1,1,1,2398795,1,2,3,7,15.0,False
2,2,2,473747,1,3,3,12,21.0,False
3,3,3,2254736,1,4,4,7,29.0,False
4,4,4,431534,1,5,4,15,28.0,False


In [5]:
df_ords.shape

(3421083, 9)

In [7]:
df_ords.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])

Unnamed: 0,order_id,user_id,order_number,order_days_of_week,order_hour_of_day,days_since_prior_order,ordered_today
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0,False
3421079,1854736,206209,11,4,10,30.0,False
3421080,626363,206209,12,1,12,18.0,False
3421081,2977660,206209,13,1,12,7.0,False


In [8]:
df_ords.shape

(3421083, 9)

In [9]:
#Removing extra index columns
df_ords1 = df_ords.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])

In [10]:
df_ords1.shape

(3421083, 7)

In [11]:
# Importing orders_products_prior.csv
df_ords_prior = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'order_products__prior.csv'), index_col = False)

In [12]:
# Check the output
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [13]:
df_ords_prior.shape

(32434489, 4)

In [14]:
df_ords_prior.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [15]:
df_ords_prior.duplicated()

0           False
1           False
2           False
3           False
4           False
            ...  
32434484    False
32434485    False
32434486    False
32434487    False
32434488    False
Length: 32434489, dtype: bool

In [21]:
# Importing products.csv
df_prods=pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col= 0)

In [22]:
# Check output
df_prods.shape

(49672, 5)

 ## Task 4.6 Step 3: IMPORTING COMBINED PICKLE FILE

In [23]:
# Importing pickle file
df_ords_prods_combo=pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

## Task 4.6 Step 4: Checking shape

In [24]:
# Checking shape to confirm it is the same as before
df_ords_prods_combo.shape

(32434489, 11)

In [25]:
# Checking output
df_ords_prods_combo.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,prior,1,2,8,,196,1,0,both
1,2539329,1,prior,1,2,8,,14084,2,0,both
2,2539329,1,prior,1,2,8,,12427,3,0,both
3,2539329,1,prior,1,2,8,,26088,4,0,both
4,2539329,1,prior,1,2,8,,26405,5,0,both


In [26]:
# Checking shape of products data set
df_prods.shape

(49672, 5)

In [27]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [28]:
df_ords_prods_combo.isnull().sum()

order_id                        0
user_id                         0
eval_set                        0
order_number                    0
order_dow                       0
order_hour_of_day               0
days_since_prior_order    2078068
product_id                      0
add_to_cart_order               0
reordered                       0
_merge                          0
dtype: int64

In [32]:
df_ords_prods_combo.drop(columns = ['_merge'])

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,0
1,2539329,1,prior,1,2,8,,14084,2,0
2,2539329,1,prior,1,2,8,,12427,3,0
3,2539329,1,prior,1,2,8,,26088,4,0
4,2539329,1,prior,1,2,8,,26405,5,0
...,...,...,...,...,...,...,...,...,...,...
32434484,2977660,206209,prior,13,1,12,7.0,14197,5,1
32434485,2977660,206209,prior,13,1,12,7.0,38730,6,0
32434486,2977660,206209,prior,13,1,12,7.0,31477,7,0
32434487,2977660,206209,prior,13,1,12,7.0,6567,8,0


In [29]:
df_prods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49672 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int64  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int64  
 3   department_id  49672 non-null  int64  
 4   prices         49672 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 2.3+ MB


## Task 4.6 Step 5 Merging datasets

Product_id is the matching key in both data frames. Since the number of columns do not match wide format concatenating would be the best way to combine the data.

In [34]:
df_merged_large = df_prods.merge(df_ords_prods_combo, on = 'product_id')

In [36]:
df_merged_large.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,4,17,9.0,11,1,both


In [37]:
df_merged_large.shape

(32404859, 15)

In [38]:
df_merged_large.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 15 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int64   
 1   product_name            object  
 2   aisle_id                int64   
 3   department_id           int64   
 4   prices                  float64 
 5   order_id                int64   
 6   user_id                 int64   
 7   eval_set                object  
 8   order_number            int64   
 9   order_dow               int64   
 10  order_hour_of_day       int64   
 11  days_since_prior_order  float64 
 12  add_to_cart_order       int64   
 13  reordered               int64   
 14  _merge                  category
dtypes: category(1), float64(2), int64(10), object(2)
memory usage: 3.7+ GB


In [39]:
#Confirming results with the merge flag
df_merged_large['_merge'].value_counts()

both          32404859
right_only           0
left_only            0
Name: _merge, dtype: int64

## Task 4.6 Step 7

In [40]:
# Export data to pickle
df_merged_large.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))