## Data Wrangling 

### 1. Dropping unnecessary columns
### 2. Finding missing values (NaN)
### 3. Transposing imported table 
### 4. Changing Data Type
### 5. Renaming a column

## Analysis 

### 6. Using value_counts 
### 7. Creating a subset from the data 
### 8. Information about user_id : 1

In [1]:
## importing libraries & data 
import pandas as pd 
import numpy as np 
import os 

## Project root folder; every data file below is located relative to it
path = r'/Users/aahamoustafa/Desktop/Data Projects/Instacart Basket Analysis  12-2024'

## Folder that holds the original (unmodified) CSV files
original_data_dir = os.path.join(path, '02 Data', 'Original Data')

## index_col=False tells pandas not to use the first CSV column as the index
df_ords = pd.read_csv(os.path.join(original_data_dir, 'orders.csv'), index_col=False)
df_prods = pd.read_csv(os.path.join(original_data_dir, 'products.csv'), index_col=False)
df_dep = pd.read_csv(os.path.join(original_data_dir, 'departments.csv'), index_col=False)

## Data Wrangling

### 1. Dropping the eval_set column 

In [30]:
## Drop the eval_set column.
## errors='ignore' keeps this cell safe to re-run: once the column is gone,
## running the cell again is a no-op instead of raising a KeyError.
df_ords = df_ords.drop(columns=['eval_set'], errors='ignore')

### 2. Finding Missing Values 

In [10]:
## Frequency table for days_since_prior_order.
## dropna=False keeps missing values (NaN) as their own row instead of hiding them.

days_since_prior = df_ords['days_since_prior_order']
days_since_prior.value_counts(dropna=False)

## Result: 206,209 rows are NaN (missing values)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

### 3. Transposing Department Table

In [11]:
## Swap the rows and columns of the department table

df_dep_t = df_dep.transpose()

In [12]:
## Preview the transposed table (at most the first 30 rows)
df_dep_t.head(n=30)

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [13]:
## Fix the header issue shown above: after transposing, the first row
## (labelled 'department_id') holds the text that should be the column header,
## and the real data only starts on the second row.
##
## The earlier bare `df_dep_t.reset_index()` call was removed: reset_index
## returns a new DataFrame and its result was discarded, so it had no effect.

# The first row of the transposed table carries the column label
new_header = df_dep_t.iloc[0]

# Every row after the header row is data; .copy() gives an independent frame
# so relabelling its columns does not act on a slice of df_dep_t
df_dep_t_new = df_dep_t.iloc[1:].copy()

df_dep_t_new.columns = new_header

df_dep_t_new.head(30)

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# Exercise Tasks 

### 4. Changing Data type from int to str 

In [32]:
## user_id is an identifier, not a quantity, so store it as text
user_id_as_text = df_ords['user_id'].astype(str)
df_ords['user_id'] = user_id_as_text

## Confirm the new dtype of each column
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 object 
 2   order_number            int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 156.6+ MB


### 5. Renaming order_dow column

In [34]:
## Give order_dow a self-explanatory name

column_renames = {'order_dow': 'order_day_of_week'}
df_ords = df_ords.rename(columns=column_renames)

df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### 6. Busiest hour of the day for placing orders?

In [17]:
## Orders per hour of day, most frequent first (NaN kept as its own row)
order_hours = df_ords['order_hour_of_day']
order_hours.value_counts(dropna=False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

#### We can see the busiest hour is 10 (i.e. 10–11 AM), with 288,418 orders

### 7. Creating subsets from the data

In [43]:
## Subset of products that belong to the breakfast department (department_id = 14)

is_breakfast = df_prods['department_id'].eq(14)
df_breakfast_sales = df_prods[is_breakfast]

df_breakfast_sales

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


#### The breakfast department subset (department_id = 14) contains 1116 rows (products)

In [21]:
## Subset of products from the departments used for dinner parties
## (alcohol, deli, beverages, and meat/seafood)

dinner_party_department_ids = [5, 20, 7, 12]
in_dinner_party_department = df_prods['department_id'].isin(dinner_party_department_ids)

df_dinner_parties = df_prods.loc[in_dinner_party_department]

df_dinner_parties.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


#### Result is 7650 rows of products

### 8. Information about user_id : 1

In [49]:
## Profile of user 1 (user_id was cast to str earlier, hence the quoted '1')

# Step 1. Keep only the rows that belong to this user
user_data = df_ords.loc[df_ords['user_id'] == '1']

## 1. How many orders did they make?
num_orders = user_data['order_id'].nunique()
print(f"Total number of orders made by user 1: {num_orders}")

## 2. Day of the week with the most orders?
# Count order_ids per day of the week; idxmax returns the first label on a tie
order_counts_day = user_data['order_id'].groupby(user_data['order_day_of_week']).count()

max_orders_date, max_orders_count = order_counts_day.idxmax(), order_counts_day.max()

print(f"The day of the week with most orders for user 1 is {max_orders_date} with {max_orders_count} orders.")

## 3. Hour of the day with the most orders?
# Same approach as above, grouped by hour; max_orders_count is reused for the hourly maximum
order_counts_hour = user_data['order_id'].groupby(user_data['order_hour_of_day']).count()

max_orders_hour, max_orders_count = order_counts_hour.idxmax(), order_counts_hour.max()

print(f"The hour of the day with most orders for user 1 is {max_orders_hour} with {max_orders_count} orders.")



Total number of orders made by user 1: 11
The day of the week with most orders for user 1 is 4 with 4 orders.
The hour of the day with most orders for user 1 is 7 with 3 orders.


In [44]:
## Export the wrangled tables to the 'Prepared Data' folder

prepared_data_dir = os.path.join(path, '02 Data', 'Prepared Data')

# to_csv is called with its default index=True, so each frame's index
# is written as the first column of the CSV file
df_ords.to_csv(os.path.join(prepared_data_dir, 'orders_wrangled.csv'))

df_dep_t_new.to_csv(os.path.join(prepared_data_dir, 'departments_wrangled.csv'))
                                 