### 4.4: Data Wrangling & Subsetting

In [1]:
# Import libraries
import pandas as pd
import numpy as nm
import os

In [2]:
# Configure pandas' display limits so wide previews stay readable
for display_option, display_limit in (
    ('display.width', 1000),
    ('display.max_columns', 100),  # cap on columns shown
    ('display.max_rows', 50),      # cap on rows shown
):
    pd.set_option(display_option, display_limit)

In [3]:
# Root folder of the project; every data file below is located relative to it.
# Set the INSTACART_PATH environment variable to run the notebook on another
# machine; when it is not set, the original local folder is used.
path = os.environ.get('INSTACART_PATH', r'/Users/anjanpakhrin/Documents/Instacart Basket Analysis')

In [4]:
# Load the orders and products tables from the "Original Data" folder
orders_csv = os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_ords = pd.read_csv(orders_csv, index_col = False)
df_prods = pd.read_csv(products_csv, index_col = False)

#### Data Wrangling - Dropping Columns
##### df.drop(columns = ['variable'])

In [5]:
# Drop the "eval_set" column from the orders data
df_ords = df_ords.drop('eval_set', axis = 1)

In [6]:
# Preview the first five rows; days_since_prior_order is missing (NaN) in row 0
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [7]:
# Investigating missing values - frequency of each value in "days_since_prior_order";
# dropna = False keeps NaN in the tally
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

#### Data Wrangling - Renaming Columns
##### df.rename(columns = {'old_name' : 'new_name'}, inplace = True)

In [8]:
# Give "order_dow" a more descriptive name, then preview the result
df_ords = df_ords.rename(columns = {'order_dow' : 'orders_days_of_week'})
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_days_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


#### Data Wrangling - Changing Data Types
##### df_ords['order_id'] = df_ords['order_id'].astype('str')

In [9]:
# Convert the order_id column to the string data type
df_ords = df_ords.astype({'order_id' : 'str'})

In [10]:
# Confirm the conversion; the column now reports dtype('O') (object)
df_ords['order_id'].dtype

dtype('O')

#### Data Wrangling - Transposing Data

In [11]:
# Load the departments table from the "Original Data" folder
departments_csv = os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col = False)

In [12]:
# Preview the raw departments file: a single row with one column per department id
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,bulk,personal care,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [13]:
# Swap rows and columns so that each department sits on its own row
df_dep_t = df_dep.transpose()

In [14]:
# Display the transposed frame; the former column labels are now the index
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [15]:
# Show the transposed frame with its index moved into a regular column.
# The result is only displayed, not assigned, so df_dep_t itself is unchanged.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


#### Creating new header for transposed dataframe
##### 1) Create a new header
##### 2) Remove the first row in the dataframe
##### 3) Add the new header

In [16]:
# Take the first row of df_dep_t (the "department_id" row); its value
# "department" becomes the column header further down
new_header = df_dep_t.iloc[0]

In [17]:
# Inspect the header row that was just extracted
new_header

0    department
Name: department_id, dtype: object

In [18]:
# Keep every row after the first one (the header row) as the new dataframe
df_dep_t_new = df_dep_t.iloc[1:]

In [19]:
# Display the frame without the header row; its only column is still labelled 0
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [20]:
# Assign new header: the single column is relabelled "department"
df_dep_t_new.columns = new_header # set the header row as the df header

In [21]:
# Display the frame again to confirm the new "department" header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


####  Data Dictionary

In [22]:
# Build the data dictionary: one entry per index label (department id),
# each mapping column name -> value
data_dict = df_dep_t_new.to_dict(orient = 'index')

In [23]:
# Display the data dictionary (keys are the department ids as strings)
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [24]:
# Preview the products data (it includes a department_id column)
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [25]:
# Look up a single department by id (the dictionary keys are strings, e.g. '19')
print(data_dict.get('19'))  # access only a single key

{'department': 'snacks'}


In [26]:
# Look up several department ids in one go
keys_i_want = ['5', '8', '13', '15']
values = list(map(data_dict.get, keys_i_want))
print(values)

[{'department': 'alcohol'}, {'department': 'pets'}, {'department': 'pantry'}, {'department': 'canned goods'}]


#### Subsetting

In [27]:
# Boolean mask: True for every product row whose department_id equals 19
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [28]:
# Keep only the products in department 19 ("snacks" in the data dictionary)
df_snacks = df_prods.loc[df_prods['department_id'] == 19]

In [29]:
# Preview the snacks subset; every row shown has department_id 19
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


#### Exporting Dataframes
##### df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'))

### Data Wrangling & Subsetting - "orders.csv"

In [30]:
# Print the data type of each column in "df_ords"
print(df_ords.dtypes)

order_id                   object
user_id                     int64
order_number                int64
orders_days_of_week         int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object


#### Step 2: Changing user_id data type to "string"

In [31]:
# Convert the "user_id" column from "int64" to the string data type
df_ords = df_ords.astype({'user_id' : 'str'})

In [32]:
# Confirm the conversion; the column now reports dtype('O') (object)
df_ords['user_id'].dtype

dtype('O')

#### Step 3: Unintuitive name (orders_days_of_week)

In [33]:
# Replace the unintuitive column name with "order_day_of_week", then preview
df_ords = df_ords.rename(columns = {'orders_days_of_week' : 'order_day_of_week'})
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


#### Step 4: Identify busiest hour of the day

In [34]:
# Frequency of hourly orders: number of orders placed in each hour of the day
hourly_orders = df_ords['order_hour_of_day'].value_counts()

In [35]:
# Identify the busiest hour and the number of orders placed in it
busiest_hour, busiest_count = hourly_orders.idxmax(), hourly_orders.max()
print(f'busiest hour: {busiest_hour}:00')
print(f'busiest count:{busiest_count}')

busiest hour: 10:00
busiest count:288418


#### Step 5: Determine meaning of value 4 in "department_id"

In [36]:
# Recalling Data Dictionary (department id -> department name) built earlier
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [37]:
# Find the department stored under id 4 (keys are strings, hence '4')
print(data_dict.get('4'))

{'department': 'produce'}


#### Step 6: Creating subset for breakfast items

In [38]:
# Keep only the products in department 14 ("breakfast" in the data dictionary)
df_breakfast = df_prods.loc[df_prods['department_id'] == 14]

In [39]:
# Display the breakfast subset (pandas truncates long frames to the configured row limit)
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


#### Step 7: Creating Subset of Party Items

In [40]:
# Department ids that make up the party items, as named in the data dictionary
dep_list = [
    5,   # alcohol
    7,   # beverages
    12,  # meat seafood
    20,  # deli
]

In [41]:
# Filter dataframe - keep only rows in the alcohol, beverages, meat seafood, and deli departments
in_party_department = df_prods['department_id'].isin(dep_list)
df_party_subset = df_prods[in_party_department]
print(df_party_subset.head(10))

    product_id                                       product_name  aisle_id  department_id  prices
2            3               Robust Golden Unsweetened Oolong Tea        94              7     4.5
6            7                     Pure Coconut Water With Orange        98              7     4.4
9           10     Sparkling Orange Juice & Prickly Pear Beverage       115              7     8.4
10          11                                  Peach Mango Juice        31              7     2.8
16          17                                  Rendered Duck Fat        35             12    17.1
19          20     Pomegranate Cranberry & Aloe Vera Enrich Drink        98              7     6.0
22          23                             Organic Turkey Burgers        49             12     8.2
34          35     Italian Herb Porcini Mushrooms Chicken Sausage       106             12    15.1
38          39           Daily Tangerine Citrus Flavored Beverage        64              7    12.5
39        

In [42]:
# Saving party subset as csv; the bare file name is resolved against the
# current working directory, and index = False leaves out the row labels
df_party_subset.to_csv('party_product.csv', index = False)

In [43]:
# Export the party subset to the project's "Prepared Data" folder.
# The file name carries the .csv extension so the file is recognised as a CSV,
# and index = False leaves out the row labels inherited from df_prods, which
# are not part of the data.
df_party_subset.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'party_products.csv'), index = False)

#### Step 8: Total counts of party subset

In [44]:
# Count the rows in the party subset
row_count = len(df_party_subset)
print('Number of rows:', row_count)

Number of rows: 7650


In [45]:
# Number of rows and columns, reported as a (rows, columns) tuple
print('Party subset (rows, columns):', df_party_subset.shape)

Party subset (rows, columns): (7650, 5)


#### Step 9: Extracting all information for user_id 1

In [46]:
# Select every order placed by user 1; user_id was converted to string,
# so the comparison uses '1' rather than the number 1
user_1_orders = df_ords.loc[df_ords['user_id'] == '1']
print(user_1_orders)
print('rows, columns:', user_1_orders.shape)

   order_id user_id  order_number  order_day_of_week  order_hour_of_day  days_since_prior_order
0   2539329       1             1                  2                  8                     NaN
1   2398795       1             2                  3                  7                    15.0
2    473747       1             3                  3                 12                    21.0
3   2254736       1             4                  4                  7                    29.0
4    431534       1             5                  4                 15                    28.0
5   3367565       1             6                  2                  7                    19.0
6    550135       1             7                  1                  9                    20.0
7   3108588       1             8                  1                 14                    14.0
8   2295261       1             9                  1                 16                     0.0
9   2550362       1            10       

In [47]:
# Count of user 1 orders (number of rows in the filtered frame)
print(len(user_1_orders))

11


In [48]:
# Lowest and highest order_number recorded for user 1
user_1_order_numbers = user_1_orders['order_number']
min_order = user_1_order_numbers.min()
max_order = user_1_order_numbers.max()
print('First order:', min_order)
print('Last order:', max_order)

First order: 1
Last order: 11


In [49]:
# Average days between orders; the first order has no prior order (NaN) and
# .mean() skips missing values by default
avg_days_between = user_1_orders['days_since_prior_order'].mean()
print('Average days between orders:', avg_days_between)

Average days between orders: 19.0


In [50]:
# Busiest order hour(s) for user 1.
# .mode() returns a Series because several hours can tie for most frequent,
# so the values are printed as a plain list rather than the raw Series repr
# (which would also print index labels, name and dtype inside the message).
busiest_hour_user_1 = user_1_orders['order_hour_of_day'].mode()
print('Busiest hour for user 1:', busiest_hour_user_1.tolist())

Busiest hour for user 1: 0    7
1    8
Name: order_hour_of_day, dtype: int64


#### Step 11: Exporting wrangled df_ords dataframe

In [51]:
# Write the wrangled orders to "Prepared Data" without the row index
orders_wrangled_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_wrangled_csv, index = False)

#### Step 12: Exporting df_dep_t_new wrangled

In [52]:
# Export the department lookup table to "Prepared Data".
# The department ids are stored in the index of df_dep_t_new, so the index is
# written out and labelled "department_id"; exporting with index = False would
# leave only the department names in the file.
df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'department_wrangled.csv'), index_label = 'department_id')