## GroupBy: Data Pre-processing GroupBy JSON Data

### Drive Set Up


In [None]:
# Mount Google Drive so the files stored there are reachable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
# The path below should point to the directory containing this notebook and the associated utility files
# Change it if necessary
# Later cells open and write their data files by bare file name, so they resolve against this directory.
os.chdir('/content/drive/MyDrive/GroupBy/')

### Import Library

In [None]:
import json
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
    # pandas >= 1.0 exposes json_normalize at the top level; the
    # pandas.io.json location was deprecated in that release
    from pandas import json_normalize
except ImportError:
    # fallback for older pandas that only ships it under pandas.io.json
    from pandas.io.json import json_normalize #package for flattening json in pandas df

warnings.filterwarnings('ignore')

### Load and Flatten JSON into Dataframe


In [None]:
### load GroupBy dataset
# Each line of the file is parsed as its own JSON document.
# The file is streamed line by line so the raw text and the parsed objects
# are not both held in memory, and blank lines are skipped because
# json.loads would raise on them.
with open('fourthbrain_project_data.json', encoding="utf8") as json_file:
    json_data = [json.loads(line) for line in json_file if line.strip()]

In [None]:
# Flatten the nested JSON into one row per entry found under hits -> product.
# The hit-level and top-level fields listed below are carried onto every
# such row as metadata columns.
product_record_path = ['hits', 'product']

hit_level_meta = [
    ['hits', 'eventType', 'category'],
    ['hits', 'serverTime'],
    ['hits', 'hitSequence'],
    ['hits', 'isEntrance'],
    ['hits', 'isExit'],
    ['hits', 'totalitems'],
    ['hits', 'totalquantity'],
    ['hits', 'totalprice'],
]

top_level_meta = [
    'customerId',
    'customerArea',
    'customerVisitorId',
    'customerLoginId',
    'customerSessionId',
    'sessionStartTime',
    'sessionEndTime',
    'customerSessionNumber',
    'totals',
]

# errors='ignore' keeps going when a metadata key is absent from a record
df_json = json_normalize(
    json_data,
    product_record_path,
    hit_level_meta + top_level_meta,
    errors='ignore',
)

In [None]:
# Expand every row's 'totals' value into separate columns, then append those
# columns to the right of the flattened frame (the 'totals' column itself is kept)
totals_expanded = df_json['totals'].apply(pd.Series)
df_json_concat = pd.concat([df_json, totals_expanded], axis=1)

In [None]:
# Load a CSV file from the working directory and show its (rows, columns) shape.
# NOTE(review): this rebinds df_json, but none of the following cells read
# df_json again (they work on df_json_concat) -- confirm this cell is still needed.
df_json = pd.read_csv('Copy of df_json_final_ver_2.csv')
df_json.shape

(1578809, 35)

### Explore Data

In [None]:
# preview the first two rows of the flattened frame, including the expanded 'totals' columns
df_json_concat.head(2)

Unnamed: 0,sku,ID,name,price,collection,attributedSearches,quantity,listLocation,listPosition,hits.eventType.category,hits.serverTime,hits.hitSequence,hits.isEntrance,hits.isExit,hits.totalitems,hits.totalquantity,hits.totalprice,customerId,customerArea,customerVisitorId,customerLoginId,customerSessionId,sessionStartTime,sessionEndTime,customerSessionNumber,totals,timeOnSite,events,newSession,bounce,totalSearches,totalNavigations,totalRefinements,totalViewProducts,totalViewProductRevenue,totalNullSearches,uniqueViewProducts,totalAddToCarts,totalAddToCartQty,totalAddToCartRevenue,uniqueAddToCarts,uniqueSearches,queriesSearched,uniqueRefinements,uniqueNavigations,totalOrders,totalOrderQty,totalOrderRevenue,uniqueOrders
0,BR013,BR013,nutritional yeast seasoning,4.99,Bragg,[],,,,viewProduct,2020-12-02 03:03:04.214607 UTC,1,True,True,,,,swansonhealth,Production,cki6s4tr200013bdpkaye9mdn,anonymous,cki6s4tr200013bdpkaye9mdn1606878184,2020-12-02 03:03:04.214607 UTC,2020-12-02 03:03:04.214607 UTC,1,"{'timeOnSite': '00:00:00', 'events': '1', 'new...",00:00:00,1,True,1,0,0,0,1,4.99,0,1,,,,,,,,,,,,
1,DRB004,DRB004,pure castile bar soap peppermint,4.5,Dr. Bronner's,[],,,,viewProduct,2020-12-02 02:06:37.430706 UTC,1,True,True,,,,swansonhealth,Production,cki6rrijb000125c5cdol7ahd,anonymous,cki6rrijb000125c5cdol7ahd1606874797,2020-12-02 02:06:37.430706 UTC,2020-12-02 02:06:37.430706 UTC,1,"{'timeOnSite': '00:00:00', 'events': '1', 'new...",00:00:00,1,True,1,0,0,0,1,4.5,0,1,,,,,,,,,,,,


In [None]:
# Count how many distinct event types occur within each customer session
df_unique_cust = (
    df_json_concat
    .groupby('customerSessionId')['hits.eventType.category']
    .nunique()
    .reset_index()
)

# Keep only the sessions that contain more than three distinct event types
has_over_three_types = df_unique_cust['hits.eventType.category'] > 3
product_id_filter_non_unique = df_unique_cust.loc[has_over_three_types]
product_id_filter_non_unique

Unnamed: 0,customerSessionId,hits.eventType.category
8,cjo07r72i00013mb61nrkd6wu1606868238,4
11,cjo0a04ca000128c76p115l5c1606880706,4
20,cjo68ggna00013gb9eeiw6m3y1606879169,4
25,cjo6avakq00013abemkr948co1606870622,4
30,cjo6e7rhs000141b5ql9ueg0k1606888043,4
...,...,...
46916,cki7nbfr900013e99kl2s1e8f1606927941,4
46960,cki7ncx6q00013qa660p3y7od1606928105,4
47033,cki7nflxa00013hczzsjcjlgs1606927999,4
47048,cki7ngbtk00013pd8ipu7v7g31606928057,4


In [None]:
# print the frame summary: index range, column names and dtypes
df_json_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1714266 entries, 0 to 1714265
Data columns (total 49 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   sku                      object 
 1   ID                       object 
 2   name                     object 
 3   price                    float64
 4   collection               object 
 5   attributedSearches       object 
 6   quantity                 object 
 7   listLocation             object 
 8   listPosition             object 
 9   hits.eventType.category  object 
 10  hits.serverTime          object 
 11  hits.hitSequence         object 
 12  hits.isEntrance          object 
 13  hits.isExit              object 
 14  hits.totalitems          object 
 15  hits.totalquantity       object 
 16  hits.totalprice          object 
 17  customerId               object 
 18  customerArea             object 
 19  customerVisitorId        object 
 20  customerLoginId          object 
 21  customer

In [None]:
# Count the missing values per column and show only the columns that have any
null_counts = df_json_concat.isnull().sum()
has_nulls = (null_counts > 0).to_numpy()
null_columns = df_json_concat.columns[has_nulls]
null_counts[null_columns]

sku                        1573530
name                       1573529
price                      1573530
quantity                   1674638
listLocation                140737
listPosition                140737
hits.totalitems            1674638
hits.totalquantity         1674638
hits.totalprice            1674638
totalViewProducts           272037
totalViewProductRevenue     272037
uniqueViewProducts          272037
totalAddToCarts             676849
totalAddToCartQty           676849
totalAddToCartRevenue       676849
uniqueAddToCarts            676849
uniqueSearches               48522
queriesSearched              48522
uniqueRefinements          1577690
uniqueNavigations          1577690
totalOrders                1153060
totalOrderQty              1153060
totalOrderRevenue          1153060
uniqueOrders               1153060
dtype: int64

In [None]:
# Percentage of missing values per column, rounded to two decimals
missing_share = df_json_concat.isnull().sum() * 100 / len(df_json_concat)
percent_missing = missing_share.round(2)

# Boolean mask of the columns where more than 30% of the values are missing,
# plus the corresponding percentages for just those columns
over_30_mask = percent_missing > 30
over_30_percent = percent_missing[over_30_mask]

# the per-column True/False mask is what gets displayed
over_30_mask

sku                         True
ID                         False
name                        True
price                       True
collection                 False
attributedSearches         False
quantity                    True
listLocation               False
listPosition               False
hits.eventType.category    False
hits.serverTime            False
hits.hitSequence           False
hits.isEntrance            False
hits.isExit                False
hits.totalitems             True
hits.totalquantity          True
hits.totalprice             True
customerId                 False
customerArea               False
customerVisitorId          False
customerLoginId            False
customerSessionId          False
sessionStartTime           False
sessionEndTime             False
customerSessionNumber      False
totals                     False
timeOnSite                 False
events                     False
newSession                 False
bounce                     False
totalSearc

In [None]:
# Look for add-to-cart rows that have no product price
price_missing = df_json_concat['price'].isnull()
is_add_to_cart = df_json_concat['hits.eventType.category'].eq('addToCart')
df_cart_price = df_json_concat.loc[price_missing & is_add_to_cart]
df_cart_price.head()

Unnamed: 0,sku,ID,name,price,collection,attributedSearches,quantity,listLocation,listPosition,hits.eventType.category,hits.serverTime,hits.hitSequence,hits.isEntrance,hits.isExit,hits.totalitems,hits.totalquantity,hits.totalprice,customerId,customerArea,customerVisitorId,customerLoginId,customerSessionId,sessionStartTime,sessionEndTime,customerSessionNumber,totals,timeOnSite,events,newSession,bounce,totalSearches,totalNavigations,totalRefinements,totalViewProducts,totalViewProductRevenue,totalNullSearches,uniqueViewProducts,totalAddToCarts,totalAddToCartQty,totalAddToCartRevenue,uniqueAddToCarts,uniqueSearches,queriesSearched,uniqueRefinements,uniqueNavigations,totalOrders,totalOrderQty,totalOrderRevenue,uniqueOrders
1549306,,SW854,n-acetyl cysteine,,default,[],1,,,addToCart,2020-12-02 02:19:24.589035 UTC,11,False,False,1,1,0,swansonhealth,Production,cki6s473200013b98h0wdev3q,anonymous,cki6s473200013b98h0wdev3q1606875386,2020-12-02 02:16:26.595421 UTC,2020-12-02 04:00:15.900761 UTC,1,"{'timeOnSite': '01:43:49', 'events': '222', 'n...",01:43:49,222,True,0,77,21,0,97,1521.9,0,54,48,48,655.58,47,12,; astaxanthin; fenugreek; gaia tea; magnesium;...,,,,,,


In [None]:
# Look for order rows whose session-level order revenue is missing
order_revenue_missing = df_json_concat['totalOrderRevenue'].isnull()
is_order = df_json_concat['hits.eventType.category'].eq('order')
df_total_order = df_json_concat.loc[order_revenue_missing & is_order]
df_total_order.head()

Unnamed: 0,sku,ID,name,price,collection,attributedSearches,quantity,listLocation,listPosition,hits.eventType.category,hits.serverTime,hits.hitSequence,hits.isEntrance,hits.isExit,hits.totalitems,hits.totalquantity,hits.totalprice,customerId,customerArea,customerVisitorId,customerLoginId,customerSessionId,sessionStartTime,sessionEndTime,customerSessionNumber,totals,timeOnSite,events,newSession,bounce,totalSearches,totalNavigations,totalRefinements,totalViewProducts,totalViewProductRevenue,totalNullSearches,uniqueViewProducts,totalAddToCarts,totalAddToCartQty,totalAddToCartRevenue,uniqueAddToCarts,uniqueSearches,queriesSearched,uniqueRefinements,uniqueNavigations,totalOrders,totalOrderQty,totalOrderRevenue,uniqueOrders


In [None]:
# Look for add-to-cart rows whose session-level add-to-cart revenue is missing
cart_revenue_missing = df_json_concat['totalAddToCartRevenue'].isnull()
is_add_to_cart = df_json_concat['hits.eventType.category'].eq('addToCart')
df_total_cart = df_json_concat.loc[cart_revenue_missing & is_add_to_cart]
df_total_cart.head()

Unnamed: 0,sku,ID,name,price,collection,attributedSearches,quantity,listLocation,listPosition,hits.eventType.category,hits.serverTime,hits.hitSequence,hits.isEntrance,hits.isExit,hits.totalitems,hits.totalquantity,hits.totalprice,customerId,customerArea,customerVisitorId,customerLoginId,customerSessionId,sessionStartTime,sessionEndTime,customerSessionNumber,totals,timeOnSite,events,newSession,bounce,totalSearches,totalNavigations,totalRefinements,totalViewProducts,totalViewProductRevenue,totalNullSearches,uniqueViewProducts,totalAddToCarts,totalAddToCartQty,totalAddToCartRevenue,uniqueAddToCarts,uniqueSearches,queriesSearched,uniqueRefinements,uniqueNavigations,totalOrders,totalOrderQty,totalOrderRevenue,uniqueOrders


In [None]:
# For every product 'ID', count the distinct prices and distinct names seen.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple (['price','name'] without the inner brackets) is deprecated
# and rejected by newer pandas.
df_unique_ID = (
    df_json_concat
    .groupby('ID')[['price', 'name']]
    .nunique()
    .add_prefix('num_')
    .reset_index()
)

# keep the IDs that appear with more than one distinct price
filter_ID = df_unique_ID[df_unique_ID['num_price'] > 1]
filter_ID

Unnamed: 0,ID,num_price,num_name
14,ABA005,2,1
15,ABA006,4,2
27,ABA037,3,1
31,ABA045,2,1
33,ABA047,2,2
...,...,...,...
17456,ZR041,2,1
17459,ZR044,2,1
17467,ZR052,2,1
17477,ZR062,2,1


In [None]:
# For every 'customerSessionId', count the distinct prices, names and event types.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple is deprecated and rejected by newer pandas.
df_cust_sess_id = (
    df_json_concat
    .groupby('customerSessionId')[['price', 'name', 'hits.eventType.category']]
    .nunique()
    .add_prefix('num_')
    .reset_index()
)

# sessions that saw more than one distinct price (computed, but not the frame displayed below)
filter_cust_sess = df_cust_sess_id[df_cust_sess_id['num_price'] > 1]
df_cust_sess_id.head()

Unnamed: 0,customerSessionId,num_price,num_name,num_hits.eventType.category
0,cjo06cah000013ea3jqwotb8g1606873564,1,1,2
1,cjo06cah000013ea3jqwotb8g1606885819,0,0,1
2,cjo06cah000013ea3jqwotb8g1606920471,0,0,1
3,cjo06esrd00013gblo1n3haw71606927978,1,1,1
4,cjo071cbe00014kbixrluiaso1606871144,3,3,3


### Update or Drop Column Names

In [None]:
# Drop the columns that are not needed for the rest of the pre-processing.
# NOTE(review): 'hits.totalitems' is not in this list and stays in the frame
# -- confirm that is intended.
unneeded_columns = [
    'sku',
    'attributedSearches',
    'listLocation',
    'listPosition',
    'customerArea',
    'totals',
    'bounce',
    'totalSearches',
    'totalNavigations',
    'totalRefinements',
    'totalNullSearches',
    'totalAddToCartQty',
    'uniqueSearches',
    'queriesSearched',
    'uniqueRefinements',
    'uniqueNavigations',
    'totalOrderQty',
    'hits.totalquantity',
    'customerLoginId',
    'customerId',
]
df_json_drop_cols = df_json_concat.drop(columns=unneeded_columns)

In [None]:
# Give the product, session-position, cart-total and event-time columns
# descriptive snake_case names; df_json_concat is rebound to the renamed frame
descriptive_names = {
    'ID': 'product_id',
    'name': 'product_name',
    'price': 'product_price',
    'collection': 'product_brand',
    'hits.isExit': 'last_session_event',
    'hits.isEntrance': 'first_session_event',
    'hits.totalprice': 'total_price_cart',
    'events': 'totals_events',
    'hits.serverTime': 'event_time',
}
df_json_concat = df_json_drop_cols.rename(columns=descriptive_names)

In [None]:
# One-hot encode the event type: the original column is replaced by one
# indicator column per distinct event type value
categorical_columns = ['hits.eventType.category']
df_json_concat = pd.get_dummies(data=df_json_concat, columns=categorical_columns)

In [None]:
# Shorten the one-hot event-type column names and drop the 'hits.' prefix
# from the hit sequence column.
# NOTE(review): 'hits.eventType.category_search' is not renamed here and keeps
# its raw name in the final frame -- confirm that is intended.
event_type_names = {
    'hits.eventType.category_addToCart': 'event_type_cart',
    'hits.eventType.category_order': 'event_type_purchase',
    'hits.eventType.category_viewProduct': 'event_type_view',
    'hits.hitSequence': 'hitSequence',
}
df_json_concat = df_json_concat.rename(columns=event_type_names)

In [None]:
# Take an independent copy for addressing missing values. A plain assignment
# would only create a second name for the same object, so the in-place fills
# applied to df_json_clean would silently modify df_json_concat as well.
df_json_clean = df_json_concat.copy()

### Address Missing Values

###### Missing values for Product Price

In [None]:
##### Reduce missing values for Product Price #####

# rows that do have a price are the reference data for the lookup
df_no_nan_prod_price = df_json_clean.dropna(subset=['product_price'])

# lookup table: product_id -> mean of the prices observed for that product
price_dict = (
    df_no_nan_prod_price
    .groupby('product_id')['product_price']
    .agg(pd.Series.mean)
    .to_dict()
)

# fill each missing price with the mean price recorded for the same product_id;
# products that never appear with a price stay missing
prices_from_lookup = df_json_clean['product_id'].map(price_dict)
df_json_clean['product_price'] = df_json_clean['product_price'].fillna(prices_from_lookup)

###### Missing values for Product Name

In [None]:
##### Reduce missing values for Product Name #####

# rows that do have a name are the reference data for the lookup
df_no_nan_prod_name = df_json_clean.dropna(subset=['product_name'])

# lookup table: product_id -> most common product_name for that product.
# Series.mode() returns every tied value (sorted), so the first one is taken
# explicitly; aggregating with pd.Series.mode directly would hand back an
# array (or fail) for products whose names are tied, and that array would
# then be written into product_name.
prod_name_dict = (
    df_no_nan_prod_name
    .groupby('product_id')['product_name']
    .agg(lambda names: names.mode().iloc[0])
    .to_dict()
)

# fill each missing name with the most common name recorded for the same
# product_id; products that never appear with a name stay missing
names_from_lookup = df_json_clean['product_id'].map(prod_name_dict)
df_json_clean['product_name'] = df_json_clean['product_name'].fillna(names_from_lookup)

###### Missing values for numerical columns 

      It is expected for these columns to contain NaN values. Consequently, the NaN values can be replaced with 0.

In [None]:
# columns whose missing values can be filled in with 0
cols_fill_0 = [
    'totalViewProducts',
    'totalViewProductRevenue',
    'totalAddToCarts',
    'totalAddToCartRevenue',
    'total_price_cart',
    'totalOrders',
    'totalOrderRevenue',
    'uniqueViewProducts',
    'uniqueAddToCarts',
    'uniqueOrders',
]

# write the zero-filled columns back into the frame in place
zero_filled = df_json_clean[cols_fill_0].fillna(0)
df_json_clean[cols_fill_0] = zero_filled

###### Final updates to address missing values

In [None]:
# Re-check: count the remaining missing values, showing only columns that still have any
null_counts = df_json_clean.isnull().sum()
has_nulls = (null_counts > 0).to_numpy()
null_columns = df_json_clean.columns[has_nulls]
null_counts[null_columns]

product_name        135457
product_price       135457
quantity           1674638
hits.totalitems    1674638
dtype: int64

In [None]:
# Re-check: percentage of missing values per column, rounded to two decimals
missing_share = df_json_clean.isnull().sum() * 100 / len(df_json_clean)
percent_missing = missing_share.round(2)
percent_missing

product_id                         0.00
product_name                       7.90
product_price                      7.90
product_brand                      0.00
quantity                          97.69
event_time                         0.00
hitSequence                        0.00
first_session_event                0.00
last_session_event                 0.00
hits.totalitems                   97.69
total_price_cart                   0.00
customerVisitorId                  0.00
customerSessionId                  0.00
sessionStartTime                   0.00
sessionEndTime                     0.00
customerSessionNumber              0.00
timeOnSite                         0.00
totals_events                      0.00
newSession                         0.00
totalViewProducts                  0.00
totalViewProductRevenue            0.00
uniqueViewProducts                 0.00
totalAddToCarts                    0.00
totalAddToCartRevenue              0.00
uniqueAddToCarts                   0.00


In [None]:
# Drop the rows that still lack a product_name or a product_price, then drop
# the 'quantity' column, which is almost entirely missing.
# NOTE(review): 'hits.totalitems' is missing at the same rate as 'quantity'
# in the re-check above but is kept -- confirm that is intended.
df_json_clean = (
    df_json_clean
    .dropna(subset=['product_name', 'product_price'])
    .drop(columns=['quantity'])
)

In [None]:
# Re-check after the drops: count the remaining missing values per affected column
null_counts = df_json_clean.isnull().sum()
has_nulls = (null_counts > 0).to_numpy()
null_columns = df_json_clean.columns[has_nulls]
null_counts[null_columns]

hits.totalitems    1539181
dtype: int64

In [None]:
#transition into final dataframe
# NOTE: this is a plain rebinding, not a copy -- df_json_final and
# df_json_clean refer to the same DataFrame object after this line.
df_json_final = df_json_clean

### Create Time Based Features

In [None]:
# Convert the three timestamp columns from text to datetime, at whole-second
# resolution: everything from the first "." onwards (the fractional seconds
# and the trailing zone label) is cut off before parsing.
# A value without fractional seconds contains no "." and would keep its
# " UTC" label, so the label is also removed explicitly; otherwise such rows
# would parse as timezone-aware while all the others parse as naive.
for time_col in ('event_time', 'sessionStartTime', 'sessionEndTime'):
    whole_seconds = (
        df_json_final[time_col]
        .str.split(".").str.get(0)
        .str.replace(' UTC', '', regex=False)
    )
    df_json_final[time_col] = pd.to_datetime(whole_seconds)

In [None]:
# Replace the original 'timeOnSite' column with the session length in seconds,
# derived from the parsed session start and end timestamps.
# The old column is dropped first, so the new one is appended as the last column.
session_length = df_json_final['sessionEndTime'] - df_json_final['sessionStartTime']
df_json_final = (
    df_json_final
    .drop(columns=['timeOnSite'])
    .assign(timeOnSite=session_length.dt.total_seconds())
)

In [None]:
# Derive one calendar/clock feature column per datetime component of event_time.
# Keys are the new column names, values the matching `.dt` attribute; the
# columns are added in the order listed.
# Note: event_time was cut to whole seconds when it was parsed, so the
# microsecond feature is always 0.
event_dt = df_json_final['event_time'].dt
time_components = {
    'event_year': 'year',
    'event_month': 'month',
    'event_day_of_week': 'dayofweek',
    'event_day_of_month': 'day',
    'event_day_of_year': 'dayofyear',
    'event_hour': 'hour',
    'event_minute': 'minute',
    'event_second': 'second',
    'event_micorsecond': 'microsecond',
}
for feature_name, dt_attribute in time_components.items():
    df_json_final[feature_name] = getattr(event_dt, dt_attribute)

In [None]:
# Weekend flag: 1 when the event falls on a Saturday or Sunday, otherwise 0
day_names = df_json_final['event_time'].dt.day_name()
is_weekend = day_names.isin(['Saturday', 'Sunday'])
df_json_final['weekend'] = is_weekend.astype('int64')

In [None]:
# Holiday flag: True when the event's calendar day is a US holiday.
import holidays
us_holidays = holidays.US()

# The holiday lookup is a Python-level call, so it is done once per distinct
# calendar day and the result is mapped back onto the rows, instead of being
# repeated for every row.
event_days = df_json_final['event_time'].dt.normalize()
holiday_by_day = {
    day: day in us_holidays
    for day in pd.DatetimeIndex(event_days.unique())
}
df_json_final['holiday'] = event_days.map(holiday_by_day)

In [None]:
# preview the first rows to check the newly added time-based feature columns
df_json_final.head()

Unnamed: 0,product_id,product_name,product_price,product_brand,event_time,hitSequence,first_session_event,last_session_event,hits.totalitems,total_price_cart,customerVisitorId,customerSessionId,sessionStartTime,sessionEndTime,customerSessionNumber,totals_events,newSession,totalViewProducts,totalViewProductRevenue,uniqueViewProducts,totalAddToCarts,totalAddToCartRevenue,uniqueAddToCarts,totalOrders,totalOrderRevenue,uniqueOrders,event_type_cart,event_type_purchase,hits.eventType.category_search,event_type_view,timeOnSite,event_year,event_month,event_day_of_week,event_day_of_month,event_day_of_year,event_hour,event_minute,event_second,event_micorsecond,weekend,holiday
0,BR013,nutritional yeast seasoning,4.99,Bragg,2020-12-02 03:03:04,1,True,True,,0.0,cki6s4tr200013bdpkaye9mdn,cki6s4tr200013bdpkaye9mdn1606878184,2020-12-02 03:03:04,2020-12-02 03:03:04,1,1,True,1,4.99,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,3,4,0,0,False
1,DRB004,pure castile bar soap peppermint,4.5,Dr. Bronner's,2020-12-02 02:06:37,1,True,True,,0.0,cki6rrijb000125c5cdol7ahd,cki6rrijb000125c5cdol7ahd1606874797,2020-12-02 02:06:37,2020-12-02 02:06:37,1,1,True,1,4.5,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,2,6,37,0,0,False
2,SUS022,100% daily value multi-vitamin,8.39,Superior Source,2020-12-02 03:16:23,1,True,True,,0.0,cki6uboil00013he8jwwj434w,cki6uboil00013he8jwwj434w1606878983,2020-12-02 03:16:23,2020-12-02 03:16:23,1,1,True,1,8.39,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,16,23,0,0,False
3,NLS002,natural elderberry concentrate blend,11.99,Natural Sources,2020-12-02 03:47:05,1,True,True,,0.0,cki6vcrm500013vcysh1hk38z,cki6vcrm500013vcysh1hk38z1606880825,2020-12-02 03:47:05,2020-12-02 03:47:05,1,1,True,1,11.99,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,47,5,0,0,False
4,MOM002,organic hair remover with tea tree refill jar,10.49,Moom,2020-12-02 03:49:57,1,True,True,,0.0,cki6vggk900013xe68nl4dao3,cki6vggk900013xe68nl4dao31606880997,2020-12-02 03:49:57,2020-12-02 03:49:57,1,1,True,1,10.49,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,49,57,0,0,False


### Final Updates

In [None]:
# Drop the raw timestamp columns now that the time-based features have been derived from them
raw_timestamp_columns = ['event_time', 'sessionStartTime', 'sessionEndTime']
df_json_final = df_json_final.drop(columns=raw_timestamp_columns)

In [None]:
# Encode the boolean flag columns as 1 / 0.
# Each column is rewritten in two passes, True -> 1 first and False -> 0 second.
boolean_flag_columns = (
    'newSession',
    'holiday',
    'first_session_event',
    'last_session_event',
)
for flag_col in boolean_flag_columns:
    df_json_final.loc[df_json_final[flag_col] == True, flag_col] = 1
    df_json_final.loc[df_json_final[flag_col] == False, flag_col] = 0

In [None]:
# preview the first rows of the final frame after dropping the timestamps and encoding the flags
df_json_final.head()

Unnamed: 0,product_id,product_name,product_price,product_brand,hitSequence,first_session_event,last_session_event,hits.totalitems,total_price_cart,customerVisitorId,customerSessionId,customerSessionNumber,totals_events,newSession,totalViewProducts,totalViewProductRevenue,uniqueViewProducts,totalAddToCarts,totalAddToCartRevenue,uniqueAddToCarts,totalOrders,totalOrderRevenue,uniqueOrders,event_type_cart,event_type_purchase,hits.eventType.category_search,event_type_view,timeOnSite,event_year,event_month,event_day_of_week,event_day_of_month,event_day_of_year,event_hour,event_minute,event_second,event_micorsecond,weekend,holiday
0,BR013,nutritional yeast seasoning,4.99,Bragg,1,1,1,,0.0,cki6s4tr200013bdpkaye9mdn,cki6s4tr200013bdpkaye9mdn1606878184,1,1,1,1,4.99,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,3,4,0,0,0
1,DRB004,pure castile bar soap peppermint,4.5,Dr. Bronner's,1,1,1,,0.0,cki6rrijb000125c5cdol7ahd,cki6rrijb000125c5cdol7ahd1606874797,1,1,1,1,4.5,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,2,6,37,0,0,0
2,SUS022,100% daily value multi-vitamin,8.39,Superior Source,1,1,1,,0.0,cki6uboil00013he8jwwj434w,cki6uboil00013he8jwwj434w1606878983,1,1,1,1,8.39,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,16,23,0,0,0
3,NLS002,natural elderberry concentrate blend,11.99,Natural Sources,1,1,1,,0.0,cki6vcrm500013vcysh1hk38z,cki6vcrm500013vcysh1hk38z1606880825,1,1,1,1,11.99,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,47,5,0,0,0
4,MOM002,organic hair remover with tea tree refill jar,10.49,Moom,1,1,1,,0.0,cki6vggk900013xe68nl4dao3,cki6vggk900013xe68nl4dao31606880997,1,1,1,1,10.49,1,0,0.0,0,0,0.0,0,0,0,0,1,0.0,2020,12,2,2,337,3,49,57,0,0,0


### Store in CSV File


In [None]:
# save the final frame to CSV in the current working directory, without writing the index as a column
df_json_final.to_csv('df_json_final.csv', index=False)