# Table of Contents

## 1. Importing Libraries
## 2. Creating Experiment Data
## 3. Data Wrangling & Subsetting
### 3.1 Data Wrangling
### 3.1.1 Dropping Columns
### 3.1.2 Counting Values
### 3.1.3 Renaming Columns
### 3.1.4 ChangingData Types
### 3.1.5 Transposing Data
## 3.2 Data Dictionary
## 3.3 Subsetting
## 3.4 Exporting Data
## 4. Remaining Tasks
## 5. Exporting Data

# 1. Importing Libraries

In [134]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [136]:
# Importing data set orders.csv
df_ords = pd.read_csv(r'/Users/xxx/Documents/Instacart Basket Analysis - 2025-01-05/02 - Data/Original Data/orders.csv', index_col = False)

In [137]:
# Importing data set products.csv
df_prods = pd.read_csv(r'/Users/xxx/Documents/Instacart Basket Analysis - 2025-01-05/02 - Data/Original Data/products.csv', index_col = False)

# 3. Data Wrangling and Subsetting

## 3.1 Data Wrangling

### 3.1.1 Dropping Column "eval_set"

In [141]:
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


### 3.1.2 Counting Values

In [143]:
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

### 3.1.3 Renaming Columns

In [145]:
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [146]:
print(df_ords.head()) 

   order_id  user_id eval_set  order_number  orders_day_of_week  \
0   2539329        1    prior             1                   2   
1   2398795        1    prior             2                   3   
2    473747        1    prior             3                   3   
3   2254736        1    prior             4                   4   
4    431534        1    prior             5                   4   

   order_hour_of_day  days_since_prior_order  
0                  8                     NaN  
1                  7                    15.0  
2                 12                    21.0  
3                  7                    29.0  
4                 15                    28.0  


### 3.1.4 Changing A Variables Data Type

In [148]:
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [149]:
df_ords['order_id'].dtypes

dtype('O')

### 3.1.5 Transposing Data

In [151]:
# Importing data set departments.csv
df_dep = pd.read_csv(r'/Users/xxx/Documents/Instacart Basket Analysis - 2025-01-05/02 - Data/Original Data/departments.csv', index_col = False)

In [152]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [153]:
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [154]:
df_dep_t = df_dep.T

In [155]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [156]:
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [157]:
# Take the first row of df_dep_t for header
new_header = df_dep_t.iloc[0]

In [158]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [159]:
# Delete Copy of First Row
df_dep_t_new = df_dep_t[1:]

In [160]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [161]:
# Set Header Row as New Header
df_dep_t_new.columns = new_header

In [162]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 3.2 Data Dictionaries

In [164]:
# Turning df_dep_t_new frame into a dictionary
data_dict = df_dep_t_new.to_dict('index')

In [165]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [166]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [167]:
# Look Up Department ID
print(data_dict.get('19'))

{'department': 'snacks'}


## 3.3 Subsetting

In [169]:
# Creating a Subset for 'snacks'
df_snacks =  df_prods[df_prods['department_id']==19]

In [170]:
df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [171]:
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [172]:
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [173]:
df_snacks =  df_prods[df_prods['department_id']==19]

In [174]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [175]:
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [176]:
df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [177]:
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [178]:
df_snacks_3.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


## 3.4 Exporting Data Frames

In [180]:
# Export Data Frame
df_ords.to_csv(os.path.join('/Users', 'xxx', 'Documents', 'Instacart Basket Analysis - 2025-01-05', '02 - Data','Prepared Data', 'orders_wrangled.csv'))

# 4. Remaining Tasks

### 4.1 Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [183]:
# Find numeric variable and change it to a suitable format
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [184]:
# Change data type for 'order_hour_of_day' to time format
# df_ords['order_hour_formatted'] = df_ords['order_hour_of_day_time'].apply(lambda x: (pd.Timestamp('today') + x).strftime('%H:%M'))

In [185]:
df_ords['order_hour_of_day_time'] = pd.to_timedelta(df_ords['order_hour_of_day'], unit='h')

In [186]:
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,order_hour_of_day_time
0,2539329,1,prior,1,2,8,,0 days 08:00:00
1,2398795,1,prior,2,3,7,15.0,0 days 07:00:00
2,473747,1,prior,3,3,12,21.0,0 days 12:00:00
3,2254736,1,prior,4,4,7,29.0,0 days 07:00:00
4,431534,1,prior,5,4,15,28.0,0 days 15:00:00
...,...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0,0 days 18:00:00
3421079,1854736,206209,prior,11,4,10,30.0,0 days 10:00:00
3421080,626363,206209,prior,12,1,12,18.0,0 days 12:00:00
3421081,2977660,206209,prior,13,1,12,7.0,0 days 12:00:00


In [187]:
# Delete 'order_hour_of_day_time' column
df_ords = df_ords.drop(columns = ['order_hour_of_day_time'])

In [188]:
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [189]:
# Delete 'order_hour_formatted' column
# df_ords = df_ords.drop(columns = ['order_hour_formatted'])

In [190]:
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [191]:
# Change data type of "days_since_prior_order" from floating to integer after replacing any NaN values with 0
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0).astype(int)

In [192]:
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,0
1,2398795,1,prior,2,3,7,15
2,473747,1,prior,3,3,12,21
3,2254736,1,prior,4,4,7,29
4,431534,1,prior,5,4,15,28
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29
3421079,1854736,206209,prior,11,4,10,30
3421080,626363,206209,prior,12,1,12,18
3421081,2977660,206209,prior,13,1,12,7


### 4.2 Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

In [194]:
# Change unintuitive variable name without overwriting the dataframe
df_ords.rename(columns = {'order_dow' : 'order_weekday'}, inplace = False)
df_ords.rename(columns = {'order_hour_of_day' : 'order_hour'}, inplace = False)
df_ords.rename(columns = {'days_since_prior_order' : 'days_since_last_order'}, inplace = False)


Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,prior,1,2,8,0
1,2398795,1,prior,2,3,7,15
2,473747,1,prior,3,3,12,21
3,2254736,1,prior,4,4,7,29
4,431534,1,prior,5,4,15,28
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29
3421079,1854736,206209,prior,11,4,10,30
3421080,626363,206209,prior,12,1,12,18
3421081,2977660,206209,prior,13,1,12,7


In [213]:
df_ords.shape

(3421083, 7)

### 4.3 Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [196]:
# Calculate frequency of orders for each hour
hourly_frequency = df_ords['order_hour_of_day'].value_counts().sort_index()

# Find the busiest hour (the hour with the maximum frequency)
busiest_hour = hourly_frequency.idxmax()
busiest_hour_count = hourly_frequency.max()

# Display the results
print("Hourly Order Frequencies:")
print(hourly_frequency)
print(f"\nBusiest Hour: {busiest_hour} with {busiest_hour_count} orders")

Hourly Order Frequencies:
order_hour_of_day
0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: count, dtype: int64

Busiest Hour: 10 with 288418 orders


### 4.4 Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [198]:
# Look Up Department ID #4
print(data_dict.get('4'))

{'department': 'produce'}


### 4.5 The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [200]:
# Creating a Subset for 'breakfast'
df_breakfast =  df_prods[df_prods['department_id']==14]

In [201]:
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [202]:
# Join df_breakfast and df_ords to get order numbbers for breakfast items
# Reset the index on df_breakfast to use it for the merge
df_breakfast_reset = df_breakfast.reset_index()

# Merge df_breakfast_reset with df_ords on the first column, which serves as a logical product identifier
breakfast_orders = df_breakfast_reset.merge(df_ords, left_on='product_id', right_index=True, how='inner')

# Display the resulting subset
print(breakfast_orders.head())

   index  product_id                                      product_name  \
0     27          28                                 Wheat Chex Cereal   
1     33          34                                               NaN   
2     67          68                           Pancake Mix, Buttermilk   
3     89          90                                      Smorz Cereal   
4    210         211  Gluten Free Organic Cereal Coconut Maple Vanilla   

   aisle_id  department_id  prices order_id  user_id eval_set  order_number  \
0       121             14    10.1  3002854        3    prior             3   
1       121             14    12.2  3160850        3    prior             9   
2       130             14    13.7  3120740        7    prior            15   
3       121             14     3.9  1170872       11    prior             2   
4       130             14     3.6  2808909       19    prior             4   

   orders_day_of_week  order_hour_of_day  days_since_prior_order  
0            

In [203]:
# Calculate total sales value for breakfast items
# Calculate sales for each row
breakfast_orders['sales'] = breakfast_orders['order_number'] * breakfast_orders['prices']

# Calculate the total sales for all breakfast items
total_breakfast_sales = breakfast_orders['sales'].sum()

# Output the result
print(f"Total sales for breakfast items: ${total_breakfast_sales:.2f}")

Total sales for breakfast items: $153331.80


### 4.6 They’d also like to see details about products that customers might use to throw dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [205]:
# List of department IDs to include in the dinner party subset
department_ids = [5, 7, 12, 20] 

# Creating a Subset for dinner party items "alcohol, deli, beverages, meat seafood"
df_dinnerparty = df_prods[df_prods['department_id'].isin(department_ids)]

# Display the resulting subset
print(df_dinnerparty.head())

    product_id                                    product_name  aisle_id  \
2            3            Robust Golden Unsweetened Oolong Tea        94   
6            7                  Pure Coconut Water With Orange        98   
9           10  Sparkling Orange Juice & Prickly Pear Beverage       115   
10          11                               Peach Mango Juice        31   
16          17                               Rendered Duck Fat        35   

    department_id  prices  
2               7     4.5  
6               7     4.4  
9               7     8.4  
10              7     2.8  
16             12    17.1  


In [206]:
It’s important that you keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

Object `have` not found.


In [207]:
# Total Number of Rows for df_dinnerparty 
total_rows = df_dinnerparty.shape[0]

# Print total number of rows
print(f"Total number of rows: {total_rows}")

Total number of rows: 7650


### 4.7 Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

In [209]:
# Filter df_ords for user_id 1
customer_orders = df_ords[df_ords['user_id'] == 1]

# Display the customer's order information
print(customer_orders)

   order_id  user_id eval_set  order_number  orders_day_of_week  \
0   2539329        1    prior             1                   2   
1   2398795        1    prior             2                   3   
2    473747        1    prior             3                   3   
3   2254736        1    prior             4                   4   
4    431534        1    prior             5                   4   
5   3367565        1    prior             6                   2   
6    550135        1    prior             7                   1   
7   3108588        1    prior             8                   1   
8   2295261        1    prior             9                   1   
9   2550362        1    prior            10                   4   
10  1187899        1    train            11                   4   

    order_hour_of_day  days_since_prior_order  
0                   8                       0  
1                   7                      15  
2                  12                      21  
3  

In [210]:
# Filter df_ords for user_id 1 to get customer-specific orders
customer_orders = df_ords[df_ords['user_id'] == 1]

# Align df_prods with customer_orders using the index
aligned_prods = df_prods.loc[customer_orders.index]

# Combine order data with aligned product information based on their index positions
customer_order_details = customer_orders.join(aligned_prods, rsuffix='_prod')

# Display the complete information about the customer's orders
print(customer_order_details)

   order_id  user_id eval_set  order_number  orders_day_of_week  \
0   2539329        1    prior             1                   2   
1   2398795        1    prior             2                   3   
2    473747        1    prior             3                   3   
3   2254736        1    prior             4                   4   
4    431534        1    prior             5                   4   
5   3367565        1    prior             6                   2   
6    550135        1    prior             7                   1   
7   3108588        1    prior             8                   1   
8   2295261        1    prior             9                   1   
9   2550362        1    prior            10                   4   
10  1187899        1    train            11                   4   

    order_hour_of_day  days_since_prior_order  product_id  \
0                   8                       0           1   
1                   7                      15           2   
2           

In [211]:
# Importing data set departments.csv
df_dep = pd.read_csv(r'/Users/xxx/Documents/Instacart Basket Analysis - 2025-01-05/02 - Data/Original Data/departments.csv', index_col = False)

# Transposing
df_dep.T
df_dep_trans = df_dep.T

# Index Reset
df_dep_trans.reset_index()


# Assign the first row of the DataFrame as the header
n_header = df_dep_trans.iloc[0] # grab the first row for the header


# Assign the first row to be the header and remove it from the DataFrame
df_dep_trans_new = df_dep_trans[1:] # take the data less the header row
df_dep_trans_new.columns = n_header # set the header row as the df header

# Print the DataFrame to verify
print(df_dep_trans_new.head())


department_id department
1                 frozen
2                  other
3                 bakery
4                produce
5                alcohol


In [212]:
# Use a left join to retain all records from customer_order_details
customer_complete_info = customer_order_details.merge(df_dep_t_new, on='department_id', how='left')

# Display the complete information about the customer's orders including department names
print(customer_complete_info)

KeyError: 'department_id'

### 4.8 You also need to provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

In [None]:
customer_order_details.drop(columns = ['eval_set'])

In [None]:
#Basic Stats:
#How often did they order:
row_count = customer_order_details.shape[0]
print(f"Number of rows: {row_count}")

#On which days did they order: Monday, Tuesday, Wednesday, Thursday
#At what times did they order: Between 7am and 4pm
#What products did they order:  
#How many items do they order in one order:

In [None]:
#How much did they spend in total, average, min, max:
# Calculate the total revenue for customer_id 1
total_revenue = customer_order_details['prices'].sum()

# Calculate the average revenue for customer_id 1
average_revenue = customer_order_details['prices'].mean()

# Calculate the minimum revenue for customer_id 1
min_revenue = customer_order_details['prices'].min()

# Calculate the maximum revenue for customer_id 1
max_revenue = customer_order_details['prices'].max()

# Print the results
print(f"Total Revenue for Customer 1: {total_revenue}")
print(f"Average Revenue for Customer 1: {average_revenue}")
print(f"Minimum Revenue for Customer 1: {min_revenue}")
print(f"Maximum Revenue for Customer 1: {max_revenue}")

# 5. Exporting Data Frames

In [None]:
# Export df_ords Data Frame
df_ords.to_csv(os.path.join('/Users', 'xxx', 'Documents', 'Instacart Basket Analysis - 2025-01-05', '02 - Data','Prepared Data', 'orders_wrangled.csv'))

In [None]:
# Export df_dep_t_new Data Frame
df_dep_t_new.to_csv(os.path.join('/Users', 'xxx', 'Documents', 'Instacart Basket Analysis - 2025-01-05', '02 - Data','Prepared Data', 'departments_wrangled.csv'))