## 4.7 Deriving New Variables

### This script contains the following points:

#### 1. Import data ords_prods_merge.pkl
#### 2. Check data dimensions and create subset
#### 3. Create condition code for data filter: Low, Mid, and High range product pricing
#### 4. Using If-Statements with the loc() function
#### 5. Using If-Statements with For-Loops
#### 6. Task questions
#### 7. Export file

### Importing libraries

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

### 01. & 02. Importing data & Checking dimensions

In [2]:
#create string path for main project folder
path = r'/Users/AngieUS/Desktop/Instacart Project'

#import ords_prods_merge data
df= pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

#check df
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,117,19,3.0,both
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,4.4,both
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,10.3,both
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both


In [3]:
#display dimensions of df - rows, columns
df.shape    

(30328763, 14)

In [4]:
#create a subset of df
df_sub = df[:1000000]

#display dimensions of df_sub - rows, columns
df_sub.shape    

(1000000, 14)

### 03. Create user-defined function to sort data into different categories

In [5]:
#define function
def price_label(row):

  if row['prices'] <= 5:
    return 'Low range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid range product'
  elif row['prices'] > 15:
    return 'High range product'
  else: return 'Not enough data'

In [6]:
#apply function
df_sub['price_range'] = df_sub.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['price_range'] = df_sub.apply(price_label, axis=1)


In [7]:
#check values 
df_sub['price_range'].value_counts(dropna = False)

price_range
Mid range product     672722
Low range product     314975
High range product     12303
Name: count, dtype: int64

#### All records have been filtered into Low, Mid, and High (totals = 1000000)

In [8]:
#check highest priced record
df_sub['prices'].describe()

count    1000000.000000
mean          11.215621
std          412.753297
min            1.000000
25%            4.200000
50%            7.300000
75%           11.300000
max        99999.000000
Name: prices, dtype: float64

In [9]:
df_sub['prices'].max()

99999.0

### 04. Using If-Statements with the loc() function

In [10]:
#change df_sub from a slice to a true copy
df_sub = df_sub.copy()

#create High condition
df_sub.loc[df_sub['prices'] > 15, 'price_range_loc'] = 'High range product'

In [11]:
#create Mid condition
df_sub.loc[(df_sub['prices'] <= 15) & (df_sub['prices'] > 5), 'price_range_loc'] = 'Mid range product' 

In [12]:
#create Low condition
df_sub.loc[df_sub['prices'] <= 5, 'price_range_loc'] = 'Low range product'

In [13]:
#check values
df_sub['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid range product     672722
Low range product     314975
High range product     12303
Name: count, dtype: int64

### Repeat process using full dataframe (df) of the ords_prods_merge.pkl data set

In [14]:
#create High condition
df.loc[df['prices'] > 15, 'price_range'] = 'High-range product'

In [15]:
#create Mid condition
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range'] = 'Mid-range product' 

In [16]:
#create Low condition
df.loc[df['prices'] <= 5, 'price_range'] = 'Low-range product'

In [17]:
#check values
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product     20462144
Low-range product      9476774
High-range product      389845
Name: count, dtype: int64

#### All records have been filtered into Low, Mid, and High (totals = 30328763)

### 05. Using If-Statements with For-Loops

In [18]:
#display the frequency of orders for each day of the week
df['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    5779087
1    5303718
6    4190948
5    3952326
2    3947564
3    3600589
4    3554531
Name: count, dtype: int64

In [19]:
#create new 'busiest day' column and filter df
result = []

for value in df["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [20]:
#display result of sort/filter
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Reg

In [21]:
#add results column to df
df['busiest day'] = result

#display the frequency of 'busiest day' column in df
df['busiest day'].value_counts(dropna = False)

busiest day
Regularly busy    20995145
Busiest day        5779087
Least busy         3554531
Name: count, dtype: int64

#### All records have been filtered into Busiest, Least, and Regularly (totals = 30328763)

### 06. Task Questions

#### Note: I imported the ords_prods_merge.pkl data file as 'df' and used 'df_sub' as my subset dataframe of [:1000000] for the lesson so you will see me use 'df' to complete the task questions as that is my dataframe for the full data set.

### #2 Update 'Busiest day' column to 'Busiest days' and filter by the <u>two</u> busiest days. Create a new column for 'Slowest days' for the <u>two</u> slowest days.

In [22]:
#display ords_prods_merge (df)
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest day
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,117,19,3.0,both,Low-range product,Regularly busy
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,10.3,both,Mid-range product,Regularly busy
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy


In [23]:
#display dimensions ords_prods_merge (df)
df.shape

(30328763, 16)

In [24]:
#display descriptive analysis of ords_prods_merge (df)
df.describe().round(2) 

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices
count,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0,30328763.0
mean,1710734.25,102933.9,18.25,2.74,13.41,11.1,25596.49,8.36,0.63,71.19,9.92,11.98
std,987243.34,59468.47,17.59,2.09,4.25,8.78,14081.89,7.13,0.48,38.21,6.28,496.37
min,2.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
25%,856142.5,51409.0,6.0,1.0,10.0,5.0,13541.0,3.0,0.0,31.0,4.0,4.2
50%,1710898.0,102599.0,12.0,3.0,13.0,8.0,25272.0,6.0,1.0,83.0,9.0,7.4
75%,2565509.0,154385.0,25.0,5.0,16.0,15.0,37947.0,11.0,1.0,107.0,16.0,11.3
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,99999.0


In [25]:
#display the frequency of orders for each day of the week
df['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    5779087
1    5303718
6    4190948
5    3952326
2    3947564
3    3600589
4    3554531
Name: count, dtype: int64

In [26]:
#renaming df column
df.rename(columns = {'busiest days' : 'busiest_days'}, inplace = True)

#check df columns
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest day
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,117,19,3.0,both,Low-range product,Regularly busy
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,10.3,both,Mid-range product,Regularly busy
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy


In [27]:
#change filter to 2 busiest & 2 slowest days and update results to 'busiest_days' column
result = []
for value in df["orders_day_of_week"]:
  if value in [0, 1]:
    result.append("Busiest day")
  elif value in [3, 4]:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

df['busiest_days'] = result

In [28]:
#display the frequency of 'busiest days' column in df
df['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12090838
Busiest day       11082805
Least busy         7155120
Name: count, dtype: int64

#### Checks on Busiest and Slowest now total both days (11082805 = 5779087+5303718) and (7155120 = 3600589+3554531). Also, for efficiency and to avoid redundancies, I've renamed 'busiest_day' to 'busiest_days' and wrote the updated results to that column. In line 25 you can see where I created the column practical.

In [29]:
#display the frequency of orders for each hour of the day
df['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2593725
11    2564597
14    2517238
15    2487586
13    2487500
12    2445841
16    2364969
9     2311334
17    1943858
8     1622394
18    1520954
19    1169224
20     910005
7      844665
21     746254
22     592432
23     375889
6      274801
0      203460
1      108110
5       82706
2       63961
4       49400
3       47860
Name: count, dtype: int64

#### selecting 8 highest values for 'most' aligns with >2M, selecting the 8 lowest values for 'fewest' aligns with <500K.

In [30]:
#create new filter & column for the busiest hours of the day by most, average, and fewest orders
result = []
for value in df["order_hour_of_day"]:
  if value in [10, 11, 14, 15, 13, 12, 16, 9]:
    result.append("Most orders")
  elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
    result.append("Fewest orders")
  else:
    result.append("Average orders")

df['busiest_period_of_day'] = result

In [31]:
#check df columns
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest day,busiest_days,busiest_period_of_day
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Average orders
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,117,19,3.0,both,Low-range product,Regularly busy,Least busy,Average orders
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy,Least busy,Average orders
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,10.3,both,Mid-range product,Regularly busy,Least busy,Average orders
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy,Least busy,Average orders


# display the frequency of 'busiest_period_of_day' column in df
df['busiest_period_of_day'].value_counts(dropna = False)

#### Checks on 'Most' and 'Fewest' align as all 8 hours selected match (Most = 19772790, Fewest = 1206187).

### 06. Export file

In [33]:
#export data (pkl file) as ords_prods_merge_nv
df.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_merge_nv.pkl'))