## If-Statements
### If-Statements with User-Defined Functions
1. A definition of the name and arguments the function will take
2. What the function is meant to do

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Set display options for better viewing

pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)  # Limit columns
pd.set_option('display.max_rows', 50)      # Limit rows

In [3]:
# Creating shortcut for data file
path = r'/Users/anjanpakhrin/Documents/Instacart Basket Analysis/'

In [4]:
# Create path to data file

df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

In [5]:
# Checking output

df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0


#### Applying following criteria:

* If price <= 5$, then "low-range product"

* If 5$ < price <= 15$, then "mid-range product"

* If price > 15$, then "high-range product"

In [6]:
# Count number of 99999.0 withing df
count_99999= (df_ords_prods_merge['prices'] == 99999).sum()
print(f"Number of rows with price = 99999: {count_99999}")

Number of rows with price = 99999: 698


In [7]:
# Create subset of the first one million rows
# Colon (:) after a number, e.g., 20: includes all the numbers before 20.
# Colom (:) before a number, e.g., :20 includes all the number after 20.
df = df_ords_prods_merge[:1000000]

#### Defining **price label**

In [8]:
# Define function

def price_label(row):

    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range product'
    else: 
        return 'Not enoguh data'

#### Applying function **"price_label"** --> flag price label to every row

In [9]:
# create a new column "price_range" within the df
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [10]:
# Checking the frequency of each price label
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product     673183
Low-range product     314392
High-range product     12425
Name: count, dtype: int64

In [11]:
# Checking most expensive product within this subset
df['prices'].max()

99999.0

In [12]:
# Count number of 99999.0 within df
count_99999= (df['prices'] == 99999).sum()
print(f"Number of rows with price = 99999: {count_99999}")

Number of rows with price = 99999: 15


### If-Statements with the loc() Function

#### Creating conditions for **subset**

In [13]:
# If price > 15$, then "high-range product"

df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [14]:
# If 5 < price <=, then "mid-range product"

df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [15]:
# If price > 15$, then "high-range product"

df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

#### Applying conditions within whole dataframe **ords_prods_merge**

In [16]:
# If price > 15$, then "high-range product"

df_ords_prods_merge.loc[df_ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [17]:
# If 5 < price <=, then "mid-range product"

df_ords_prods_merge.loc[(df_ords_prods_merge['prices'] <= 15) & (df_ords_prods_merge['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [18]:
# If price > 15$, then "high-range product"

df_ords_prods_merge.loc[df_ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [19]:
df_ords_prods_merge['price_range_loc'].value_counts(dropna=False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

In [20]:
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product


### If-Statements with For-Loops

In [21]:
# Frequency of the "order_day_of_week"
df_ords_prods_merge['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

#### Create new column to flag **"Busiest day", "Least busy" & "Regularly busy"**

In [22]:
# Define function

result = []
for value in df_ords_prods_merge['order_day_of_week']:
    if value == 0:
        result.append('Busiest day')
    elif value == 4:
        result.append('Least busy')
    else:
        result.append('Regularly busy')

In [23]:
# Create a new column "busiest_day"

df_ords_prods_merge['busiest_day'] = result

In [24]:
# Checking frequency of "busiest_day"
df_ords_prods_merge['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

In [25]:
# Checking output
df_ords_prods_merge.head(10)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117,19,3.0,Low-range product,Regularly busy
7,2398795,1,2,3,7,15.0,False,12427,3,1,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy
8,2398795,1,2,3,7,15.0,False,13176,4,0,Bag of Organic Bananas,24,4,10.3,Mid-range product,Regularly busy
9,2398795,1,2,3,7,15.0,False,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy


### Task

#### **Step 2:** Creating flag for two "**Busiest days**" and two "**Slowest days**"

In [26]:
# Step 1: Getting frequency of the "order_day_of_week"
df_ords_prods_merge['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

From frequency count busiest days of the week can be identified:
- **two_busiest_days:** 0 --> Saturday & 1 --> Sunday
- **two_slowest_days:** 3 --> Tuesday & 4 --> Wednesday

In [27]:
# Step 2: Identifying top two busiest and slowest days
two_busiest_days = [0, 1]
two_slowest_days = [3, 4]

# Define loop
result = []
for value in df_ords_prods_merge['order_day_of_week']:
    if value in two_busiest_days:
        result.append('Busiest days')
    elif value in two_slowest_days:
        result.append('Slowest days')
    else:
        result.append('Regularly busy')

In [28]:
# Step3: Add a new column "busiest days"
df_ords_prods_merge['busiest_days'] = result

In [29]:
# Checking frequency of "busiests_day"
df_ords_prods_merge['busiest_days'].value_counts(dropna=False)

busiest_days
Regularly busy    12916111
Busiest days      11864412
Slowest days       7624336
Name: count, dtype: int64

In [30]:
# Checking output
df_ords_prods_merge.groupby('busiest_days').head(2)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Regularly busy
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117,19,3.0,Low-range product,Regularly busy,Slowest days
33,550135,1,7,1,9,20.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days
34,550135,1,7,1,9,20.0,False,10258,2,1,Pistachios,117,19,3.0,Low-range product,Regularly busy,Busiest days


In [31]:
# Number of rows
print(len(df_ords_prods_merge))

32404859


In [32]:
# Sum of busiest_days
print(df_ords_prods_merge['busiest_days'].value_counts().sum())

32404859


#### **Step 3:** Accuracy check & Observations

In [33]:
# Checking total number of rows in dataframe
total_rows = len(df_ords_prods_merge)

In [34]:
# Calculate the sum of "busiest_days"
total_sum = df_ords_prods_merge['busiest_days'].value_counts().sum()

In [35]:
# Comparing the totals
if total_sum == total_rows:
    print('Values in column "busiest_days" are correct')
else:
    print('Values in column "busiest_days" are not correct')

Values in column "busiest_days" are correct


#### **Observations**
- After check it is proved that the values displayed in the column "busiest_days" are accurate. The sum of three different categories equals to the total number of rows in the dataframe.

- Weekends **(Saturday/Sunday)** have ca. 11.9M orders, which is fewer than the "Regular weekdays **(Mon/Thu/Fri)**" with ca. 12.9M orders.

- With only about 7.6M orders **Tuesday** and **Wednesday** are the "Slowest days" of the week.

- This confirms that orders midweek are significantly fewer.

#### **Step 4:** Labeling **"busiet period of day"**
##### Categories:
    - Most orders
    - Average orders
    - Fewest orders

In [36]:
# Step 1: Getting frequency of the "order_hour_of_day"
df_ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

Based on the frequency output the hours of the day can be categorized, as suggested, into
- **Most orders:** Top hours with highest order counts (typically top 1/3)
- **Fewest orders:** Bottom hours with lowest order counts (typically bottom 1/3)
- **Average orders:** Middle hours with moderate order counts

In [37]:
# Step2: Labeling based on order counts
most_orders = [10, 11, 14, 15, 13, 12, 16, 9]
fewest_orders = [23, 6, 0, 1, 5, 2, 4, 3]
average_orders = [17, 8, 18, 19, 20, 7, 21, 22]

##### **Strategic hour grouping**
- **Most orders:** Hours 9-16 (8 hours covering 9 AM to 4 PM) --> alligns with traditional business and lunch perios
- **Fewest orders:** Hours 0-6, 23 (8 hours covering midnight to 6 AM and 11 PM) --> overnight hours with minimal activity
- **Average orders:** Hours 7-8, 17-22 (8 hours covering 7-8 AM and 5-10 PM) --> average activity which follows early morning and evening pattern

In [38]:
# Step 3: Define loop
result = []
for value in df_ords_prods_merge['order_hour_of_day']:
    if value in most_orders:
        result.append('Most orders')
    elif value in fewest_orders:
        result.append('Fewest orders')
    else:
        result.append('Average orders')

In [39]:
# Step 4: Adding flag column "busiest_period_od_day"
df_ords_prods_merge['busiest_period_of_day'] = result

In [40]:
# Checking output
df_ords_prods_merge.groupby('busiest_period_of_day').head(2)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Regularly busy,Average orders
11,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most orders
12,473747,1,3,3,12,21.0,False,12427,2,1,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Slowest days,Most orders
617,600894,8,1,6,0,,True,21903,1,0,Organic Baby Spinach,123,4,8.2,Mid-range product,Regularly busy,Regularly busy,Fewest orders
618,600894,8,1,6,0,,True,28985,2,0,Michigan Organic Kale,83,4,13.4,Mid-range product,Regularly busy,Regularly busy,Fewest orders


#### **Step 5:** Print the frequency for the new column **"busiest_period_of_day"**

In [41]:
# Printing frequency
print(df_ords_prods_merge['busiest_period_of_day'].value_counts(dropna=False))

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64


##### - Most orders ca. 21.1M took place in business and lunch periods, which is 9-16.
##### - Average orders (ca. 10M) were placed in early morning and evening hours, i.e., 7-8, 17-22.
##### - Fewest orders were placed in overnight hours as expected, i.e., 0-6, 23.

#### **Step 7:** Export datafraime as pickle file

In [42]:
# Export data as pickle
df_ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_merge_4-7.pkl'))