# IC 4.7 Deriving New Variables

# Table of Contents
##### 1. Import libraries
##### 2. Import data
##### 3. Check data shape
##### 4. 4. Derive new variables
    4.1 Create 'price_label' column
    4.2 Create 'busiest_day' column
    4.3 Create 'Busiest_days' column
    4.4 Create 'busiest_period_of_day' column
##### 5. Export data

## 1. Import libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

## 2. Import data

In [3]:
# Create path
path = r'C:\Users\ashle\Documents\07-2024 Instacart Basket Analysis' # Assigns the master file path to variable 'path'

# Import data
df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

## 3. Check data shape

In [4]:
# Check shape of df_ords_prods_merge
df_ords_prods_merge.shape

(32404859, 15)

In [7]:
# Check column names to reference when deriving new variables
df_ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'first_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', '_merge'],
      dtype='object')

In [9]:
# Drop '_merge' flag created during previous merge. It is no longer relevant
df_ords_prods_merge.drop(columns=['_merge'], inplace=True)

In [11]:
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0


## 4. Derive new variables

### 4.1 Create 'price_label' column

In [13]:
# Check frequency of 'prices' column

df_ords_prods_merge['prices'].value_counts(dropna = False)

prices
12.3    624261
10.3    510496
4.0     447272
6.3     438467
1.3     425691
         ...  
20.2       123
22.7        99
21.5        85
18.3        11
21.0         8
Name: count, Length: 242, dtype: int64

In [15]:
# Create 'High-range product' label and add to 'price_label' column

df_ords_prods_merge.loc[df_ords_prods_merge['prices'] > 15, 'price_label'] = 'High-range product'

In [16]:
# Create 'Mid-range product' label and add to 'price_label' column

df_ords_prods_merge.loc[(df_ords_prods_merge['prices'] <= 15) & (df_ords_prods_merge['prices'] > 5), 'price_label'] = 'Mid-range product'

In [19]:
# Create 'Low-range product' label and add to 'price_label' column

df_ords_prods_merge.loc[df_ords_prods_merge['prices'] < 5, 'price_label'] = 'Low-range product'

In [21]:
# Check frequency of new 'price_label' column

df_ords_prods_merge['price_label'].value_counts(dropna = False)

price_label
Mid-range product     21860860
Low-range product      9900417
High-range product      417678
NaN                     225904
Name: count, dtype: int64

### 4.2 Create 'busiest_day' column

In [23]:
# Check frequency of order_dow column

df_ords_prods_merge['order_dow'].value_counts(dropna = False)

order_dow
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [25]:
# Create a result list for the order day of week column showing busiest day, least busy, or regularly busy
result = []

for value in df_ords_prods_merge['order_dow']:
    if value == 0:
        result.append('Busiest day')
    elif value == 4:
        result.append('Least busy')
    else:
        result.append('Regularly busy')

In [26]:
# Assign result list to a new column called 'busiest_day'

df_ords_prods_merge['busiest_day'] = result

In [29]:
# Check frequency of new column 'busiest_day' compared to the order_dow column. 

df_ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

### 4.3 Create 'Busiest_days' column

In [31]:
# Check frequency on 'order_dow' column

df_ords_prods_merge['order_dow'].value_counts(dropna = False)

order_dow
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

0 = Saturday, 1 = Sunday, 2 = Monday, 3 = Tuesday, 4 = Wednesday, 5 = Thursday, 6 = Friday

In [33]:
# Create a result list for 'busiest days', 'slowest days', 'regular days'

result2 = []

for value in df_ords_prods_merge['order_dow']:
    if value == 0 or value == 1:
        result2.append('Busiest days')
    elif value == 3 or value == 4:
        result2.append('Slowest days')
    else:
        result2.append('Regular days')
    

In [35]:
# Assign result2 list to a new column 'busiest_days'

df_ords_prods_merge['busiest_days'] = result2

In [37]:
df_ords_prods_merge['busiest_days'].value_counts(dropna = False)

busiest_days
Regular days    12916111
Busiest days    11864412
Slowest days     7624336
Name: count, dtype: int64

The frequency counts in the new 'busiest_days' column are accurate compared to the frequency counts for each day in the 'order_dow' column as shown in above cells using the .value_counts function. 

### 4.4 Create 'busiest_period_of_day' column

In [39]:
# Check frequency of order_hour_of_day column

df_ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [41]:
# Create result3 list to hold 'Most orders', 'Average orders' and 'Fewest orders'.

result3 = []

for value in df_ords_prods_merge['order_hour_of_day']: 
    if value == 10:
        result3.append('Most orders')
    elif value == 3:
        result3.append('Fewest orders')
    else:
        result3.append('Average orders')

In [43]:
# Assign result3 to a new column called 'busiest_period_of_day'

df_ords_prods_merge['busiest_period_of_day'] = result3

In [45]:
# Check frequency of busiest_period_of_day column

df_ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Average orders    29591818
Most orders        2761760
Fewest orders        51281
Name: count, dtype: int64

In [47]:
# Check shape of df_ords_prods_merge before exporting
df_ords_prods_merge.shape

(32404859, 18)

## 5. Export Data

In [49]:
df_ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merged_with_flags.pkl'))