# 4.6 revised Combining & Exporting Data
## Contents
## 1. Import libraries & data sets
## 2. Consistency Checks of df_ords_prior
## 3. Export the merged file in pickle format 

## 1. Import libraries & data sets

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [5]:
# Root folder of the Instacart project; the data file paths in this notebook are built from it
path = r'/Users/yevgeniyaem/Documents/Weiterbildung Data Analytics/11-2024 Instacart Basket Analysis'

In [7]:
# Display the project folder path to confirm it was set correctly
path

'/Users/yevgeniyaem/Documents/Weiterbildung Data Analytics/11-2024 Instacart Basket Analysis'

In [9]:
# Load the prepared orders data ("orders_checked.csv") into df_ords
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index_col=False,
)

In [54]:
# Inspect the columns, dtypes and memory usage of df_ords after the import
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                int64  
 2   user_id                 int64  
 3   customers_order_count   int64  
 4   orders_day_of_week      int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(6)
memory usage: 182.7 MB


In [13]:
# Load the prepared products data ("products_checked.csv") into df_prods
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index_col=False,
)

In [15]:
# Confirm the number of rows and columns loaded into df_prods
df_prods.shape

(49672, 6)

In [17]:
# Load the original order/product data ("orders_products_prior.csv") into df_ords_prior
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'),
    index_col=False,
)

In [19]:
# Confirm the number of rows and columns loaded into df_ords_prior
df_ords_prior.shape

(32434489, 4)

## 2. Consistency Checks of df_ords_prior

In [22]:
# Preview the first five rows of df_ords_prior
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [24]:
# Inspect the dtypes of df_ords_prior before any conversion
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


### "order_id" should be stored as string values instead of numerical ones, because the numbers are identifiers, not quantities to calculate with.

In [27]:
# Store "order_id" as text rather than as a number
df_ords_prior = df_ords_prior.astype({'order_id': 'str'})

In [29]:
# Verify that "order_id" is no longer int64 after the conversion
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   order_id           object
 1   product_id         int64 
 2   add_to_cart_order  int64 
 3   reordered          int64 
dtypes: int64(3), object(1)
memory usage: 989.8+ MB


In [31]:
# Mixed-type check for df_ords_prior: print the name of every column that
# contains at least one value whose Python type differs from the type of the
# value in the first row
for column_name in df_ords_prior.columns.tolist():
    cell_types = df_ords_prior[[column_name]].map(type)
    first_row_type = df_ords_prior[[column_name]].iloc[0].apply(type)
    differs_from_first = (cell_types != first_row_type).any(axis=1)
    if differs_from_first.any():
        print(column_name)

### No mixed data types found.

In [34]:
# Review the dtype of every column in df_ords_prior
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   order_id           object
 1   product_id         int64 
 2   add_to_cart_order  int64 
 3   reordered          int64 
dtypes: int64(3), object(1)
memory usage: 989.8+ MB


### No mixed-type data were found in any of the columns/variables of the df_ords_prior dataframe.

In [37]:
# Count the missing values in each column of df_ords_prior
df_ords_prior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

### No missing values found.

In [43]:
# Look for duplicates: collect every row that fully repeats an earlier row
# of df_ords_prior into its own dataframe, df_ords_prior_dups
df_ords_prior_dups = df_ords_prior[df_ords_prior.duplicated(keep='first')]

In [49]:
# Display the duplicate rows that were found (an empty frame means none exist)
df_ords_prior_dups

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


### The dataframe created to check for duplicates is empty, meaning there are no duplicate rows in the data set.

In [58]:
# "order_id" in df_ords should be text instead of numerical as well. The
# conversion was already done in 4.5 but was not saved, so it is applied once
# more here.
df_ords = df_ords.astype({'order_id': 'str'})

In [60]:
# Verify that "order_id" in df_ords is no longer int64 after the conversion
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                object 
 2   user_id                 int64  
 3   customers_order_count   int64  
 4   orders_day_of_week      int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


In [80]:
# Join df_ords with df_ords_prior on "order_id" (pandas' default inner join).
# indicator=True adds a "_merge" column recording in which frame(s) each
# row's key was found.
df_merged_large = pd.merge(df_ords, df_ords_prior, on='order_id', indicator=True)

In [92]:
# Preview the first five rows of the merged dataframe
df_merged_large.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,customers_order_count,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,0,2539329,1,1,2,8,0.0,196,1,0,both
1,0,2539329,1,1,2,8,0.0,14084,2,0,both
2,0,2539329,1,1,2,8,0.0,12427,3,0,both
3,0,2539329,1,1,2,8,0.0,26088,4,0,both
4,0,2539329,1,1,2,8,0.0,26405,5,0,both


In [66]:
# Check the merge: count how many rows are flagged both / left_only / right_only
df_merged_large['_merge'].value_counts()

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

In [72]:
# Double-check the merge with how='outer': rows flagged left_only or
# right_only reveal order_ids that exist in only one of the two dataframes.
#
# The result is deliberately NOT assigned to df_merged_large. Overwriting it
# here would make a top-to-bottom run of the notebook export the outer merge
# instead of the merge created for this exercise. Only the key column is
# merged, because the "_merge" indicator depends on the join keys alone; this
# avoids building a second full-width copy of the merged data.
df_ords[['order_id']].merge(
    df_ords_prior[['order_id']], on='order_id', how='outer', indicator=True
)['_merge'].value_counts()

In [2]:
# Check the merge indicator counts.
# Requires df_merged_large to exist in the current kernel session; run the
# merge cell first, otherwise this raises a NameError.
df_merged_large['_merge'].value_counts()

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


NameError: name 'df_merged_large' is not defined

## 3. Export the merged file in pickle format 

In [86]:
# Save the merged dataframe as a pickle file named "orders_products_combined.pkl"
df_merged_large.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

In [88]:
# Inspect the columns, dtypes and memory usage of the exported dataframe
df_merged_large.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 11 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   Unnamed: 0              int64   
 1   order_id                object  
 2   user_id                 int64   
 3   customers_order_count   int64   
 4   orders_day_of_week      int64   
 5   order_hour_of_day       int64   
 6   days_since_prior_order  float64 
 7   product_id              int64   
 8   add_to_cart_order       int64   
 9   reordered               int64   
 10  _merge                  category
dtypes: category(1), float64(1), int64(8), object(1)
memory usage: 2.4+ GB
