# Contents

## 01 Set up

## 02 Generating a sample

## 03 Working with user defined functions

## 04 Working with the loc() function from the python librariesf

## 05 If statements with For loops

## 06 Handling a change request from a customer.

## 07 Ad Hoc Request - Determining Busiest Period of the Day

## 08 Creating a flag by merging a list to the dataframe

## 09 Exporting df to pickle file

### 01 Set up

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Defining the main project path
proj = r'C:\Users\bfd_6\Documents\Career Foundry Project\Course Part 2\A4 Python\Instacart Basket Analysis'

In [3]:
# Creating a data frame for the orders_products_merged dataset
df_op_merged = pd.read_pickle(os.path.join(proj, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

### 02 Generating a sample

In [4]:
# Creating a sample for the steps in this exercise
df_sample = df_op_merged[:1000000]

In [5]:
# Checking on the sample
df_sample.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


#### Dropping the merge flag from the sample to make it more efficient.

In [6]:
# Dropping the _merge flag for the sample for efficiency. It is not needed for this practice session.
df_sample = df_sample.drop(columns = ['_merge'])

In [7]:
df_sample.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0


In [8]:
df_sample.shape

(1000000, 13)

### 03 Working with user defined functions

In [9]:
# Defining a new function
def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [10]:
# Applying the new function and creating a new column in the sample df
df_sample['price_range'] = df_sample.apply(price_label, axis = 1)

In [11]:
# Checking results for the new function
df_sample['price_range'].value_counts(dropna = False)

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

In [12]:
# Checking the sample for max price
df_sample['prices'].max()

14.8

### 04 Working with the loc() function from the python libraries

#### Trying loc() function

In [13]:
df_sample.loc[df_sample['prices'] > 15, 'price_range_loc'] = 'High-range product'

  df_sample.loc[df_sample['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [14]:
df_sample.loc[(df_sample['prices'] <= 15) & (df_sample['prices'] > 5), 'price_range_loc'] = 'Mid range product'

In [15]:
df_sample.loc[df_sample['prices'] <= 5, 'price_range_loc'] = 'Low range product'

In [16]:
df_sample['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid range product    756450
Low range product    243550
Name: count, dtype: int64

#### Applying the loc() function method to the full dataframe

In [17]:
df_op_merged.loc[df_op_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

  df_op_merged.loc[df_op_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [18]:
df_op_merged.loc[(df_op_merged['prices'] <= 15) & (df_op_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [19]:
df_op_merged.loc[df_op_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [20]:
df_op_merged['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

### 05 If statements with For loops

#### Developing a flag to create subgroups of busiest days of the week.

In [21]:
# Finding out which days are the busiest
df_op_merged['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [22]:
# Creating a list that will be merged to create a new column
result = []

for elf in df_op_merged["orders_day_of_week"]:
    if elf == 0:
        result.append("Busiest Day")
    elif elf == 4:
        result.append("Least Busy")
    else:
        result.append("Regularly Busy")

In [23]:
# Printing first 10 lines of list rather than millions of lines
result[:10]

['Regularly Busy',
 'Regularly Busy',
 'Regularly Busy',
 'Least Busy',
 'Least Busy',
 'Regularly Busy',
 'Regularly Busy',
 'Regularly Busy',
 'Regularly Busy',
 'Least Busy']

In [24]:
# Creating the "Busiest Day" column by merging the list "result"
df_op_merged['busiest_day'] = result

In [25]:
df_op_merged['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly Busy    22416875
Busiest Day        6204182
Least Busy         3783802
Name: count, dtype: int64

### 06 Handling a change request from a customer.

#### Generally, this would be a replacement of busiest_day with busiest_days. I'm electing to keep both for the purposes of this exercise. I'm interpreting the request to mean that the customer wants both Saturday and Sunday to have the "Busiest days" tag, and Tuesday and Wednesday to have the "Slowest Days" tag. In the field, I would clarify this with the client or my boss. 

In [26]:
# Creating a list that will be merged to create the new column requested by the client
n_list = []

for grinch in df_op_merged["orders_day_of_week"]:
    if grinch == 0 or grinch == 1:
        n_list.append("Busiest Days")
    elif grinch == 4 or grinch == 3:
        n_list.append("Slowest Days")
    else:
        n_list.append("Regular Day")

In [27]:
n_list[:10]

['Regular Day',
 'Slowest Days',
 'Slowest Days',
 'Slowest Days',
 'Slowest Days',
 'Regular Day',
 'Busiest Days',
 'Busiest Days',
 'Busiest Days',
 'Slowest Days']

In [28]:
# Creating the "Busiest Days" column by merging the list "n_list"
df_op_merged['busiest_days'] = n_list

In [29]:
df_op_merged['busiest_days'].value_counts(dropna = False)

busiest_days
Regular Day     12916111
Busiest Days    11864412
Slowest Days     7624336
Name: count, dtype: int64

#### The results are as expected. Busiest days reflect the sum of Saturday and Sunday, Slowest days reflect the sum of Tuesday and Wednesday and the balance is reflected by Regular Day

### 07 Ad Hoc Request - Determining Busiest Period of the Day

In [30]:
# Finding out which hours are busiest
df_op_merged['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

#### The time ranges were not specified in the client request. I would have clarified this with either my boss or the client. However, for the purposes of this exercise, I am specifying value_count less than 400,000 as Fewest, between 400,000 and less than 2 million as Average, and 2 million or greater as Most. That translates to less than 7 for Fewest, 7 - 9 am and 6 pm - midnight for Average, and 9 am until 6 pm for Most. 

In [31]:
# Creating a list that will be merged to create the new busiest_period_of_day column requested by the client
sc_list = []

for Santa in df_op_merged["order_hour_of_day"]:
    if Santa < 7:
        sc_list.append("Fewest Orders")
    elif (Santa >= 7 and Santa < 9) or Santa >= 18:
        sc_list.append("Average Orders")
    else:
        sc_list.append("Most Orders")

In [32]:
# Checking a sample of the list
sc_list[100:120]

['Most Orders',
 'Most Orders',
 'Average Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Most Orders',
 'Average Orders',
 'Most Orders',
 'Most Orders']

In [33]:
# Checking to see if the "Fewest Orders" value is present in sc_list
print("Fewest Orders" in sc_list)

True


### 08 Creating a flag by merging a list to the dataframe

In [34]:
# Creating the "Busiest Period of Day" column by merging the list "sc_list"
df_op_merged['busiest_period_of_day'] = sc_list

In [35]:
# Reviewing the frequency of the busiest period of day column
df_op_merged['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most Orders       23205725
Average Orders     8312313
Fewest Orders       886821
Name: count, dtype: int64

In [36]:
df_op_merged.head(20)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Regular Day,Average Orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Slowest Days,Average Orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Slowest Days,Most Orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least Busy,Slowest Days,Average Orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least Busy,Slowest Days,Most Orders
5,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Regular Day,Average Orders
6,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Busiest Days,Most Orders
7,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Busiest Days,Most Orders
8,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,both,Mid-range product,Regularly Busy,Busiest Days,Most Orders
9,2550362,1,10,4,8,30.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least Busy,Slowest Days,Average Orders


### 09 Exporting df to pickle file

In [37]:
# Exporting pickle file to prepared data folder (columns added)
df_op_merged.to_pickle(os.path.join(proj, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))