# Contents

## 01 Set up

## 02 Wrangling the dataset

## 03 Transposing a dataframe

## 04 Assigning a new header - 3 step process

## 05 Creating and Working with a Data Dictionary

## 06 Creating and Working with Subsets

## 07 Changing a data type for a descriptive column currently set as quantitative.

## 08 Finding the busiest hour of the day

## 09 Using the data dictionary

## 10 Creating a subset for breakfast items

## 11 Creating a subset for dinner party items (list of several departments)

## 12 Creating and using a subset for a single customer

## 13 Data Export to .csv

### 01 Set up

In [6]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [7]:
# Root folder of the project; every input and output path in this notebook is built from it with os.path.join.
# NOTE(review): this is an absolute, machine-specific Windows path - change it before running on another computer.
proj = r'C:\Users\bfd_6\Documents\Career Foundry Project\Course Part 2\A4 Python\Instacart Basket Analysis'

In [14]:
# Load the raw orders file from '02 Data/Original Data' (under the project root) into the df_ords data frame
df_ords = pd.read_csv(os.path.join(proj, '02 Data', 'Original Data', 'orders.csv'))

In [9]:
# Load the raw products file from '02 Data/Original Data' (under the project root) into the df_prods data frame
df_prods = pd.read_csv(os.path.join(proj, '02 Data', 'Original Data', 'products.csv'))

### 02 Wrangling the dataset

In [15]:
# Preview only: drop() returns a NEW data frame without the "eval_set" column.
# The result is displayed but not stored, so df_ords itself still contains "eval_set" after this cell.
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


#### Dropping a column

In [16]:
# To actually remove the column, assign the data frame returned by drop() back to df_ords
df_ords = df_ords.drop(columns = ['eval_set'])

#### Checking for missing values

In [17]:
# Looking for missing values: dropna = False makes value_counts() list NaN as its own row,
# so the number of missing entries appears alongside the real values.
# NOTE(review): the NaN rows are presumably each customer's first order (no prior order exists) - confirm.
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

#### Changing a column name

In [19]:
# Renaming 'order_dow' to the more descriptive 'orders_day_of_week'.
# inplace = True modifies df_ords directly, so no reassignment is needed.
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [20]:
# Checking to make sure the column name change worked.
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


#### Changing data type

In [21]:
# Changing order_id to str data type: it is an identifier, not a quantity,
# so it should not be treated as a number in calculations or summary statistics.
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [23]:
# checking the data type for a single column - .dtype, not .dtypes
df_ords['order_id'].dtype

dtype('O')

### 03 Transposing a dataframe

In [26]:
# Importing the raw departments file from '02 Data/Original Data' (under the project root) into df_dep
df_dep = pd.read_csv(os.path.join(proj, '02 Data', 'Original Data', 'departments.csv'))

In [27]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [28]:
# df_dep is stored "wide": a single row with one column per department id.
# Transposing (.T) swaps rows and columns, giving one row per department id instead.
df_dep_t = df_dep.T

In [29]:
# Showing the new layout of the dataframe after the transpose function was used.
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


#### Adding an index to a dataframe

In [30]:
# Preview of df_dep_t with a fresh 0..n numeric index; the old index (the department ids) becomes a column named 'index'.
# reset_index() returns a new data frame and the result is not assigned, so df_dep_t itself is unchanged
# and the steps that follow still work on the original layout.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


### 04 Assigning a new header - 3 step process

#### Create the new header as a variable

In [31]:
# Assigns the first row of df_dep_t (position 0) to the variable new_header.
# That row holds the text 'department', which will become the column title.
new_header = df_dep_t.iloc[0]

In [32]:
# Verifying the variable
new_header

0    department
Name: department_id, dtype: object

#### Eliminating the existing header

In [33]:
# Create new data frame df_dep_t_new containing every row of df_dep_t from the second row onward
# (positions start at 0, so [1:] skips only the first row, which held the header text)
df_dep_t_new = df_dep_t[1:]

In [34]:
# Showing the contents of the new dataframe
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


#### Putting the new header in place

In [35]:
# Assign the contents of new_header as the column titles of df_dep_t_new:
# the single column, previously labelled 0, is now labelled 'department'
df_dep_t_new.columns = new_header

In [36]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### 05 Creating and Working with a Data Dictionary

In [37]:
# Creating a variable, data_dict, to hold the data dictionary built from df_dep_t_new.
# to_dict('index') produces {index value: {column name: cell value}},
# i.e. {'1': {'department': 'frozen'}, ...} - note that the department-id keys are strings.
data_dict = df_dep_t_new.to_dict('index')

In [38]:
# Showing the contents of the data dictionary.
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [39]:
# Showing the first 5 rows of the dataframe.
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [40]:
# Using the data dictionary to find the department name for department_id 19 in df_prods.
# The dictionary keys are strings, so look up '19' (not the integer 19, which would return None).
print(data_dict.get('19'))

{'department': 'snacks'}


### 06 Creating and Working with Subsets

In [42]:
# Creating a subset of df_prods that only contains products from the snacks department (department_id 19).
# The comparison builds a True/False mask per row; indexing with it keeps only the True rows.
df_snacks = df_prods[df_prods['department_id']==19]

In [43]:
# Showing the first 5 rows of the subset.
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [45]:
# Alternate methods of creating subsets
# .loc with a boolean mask selects the same rows as df_prods[mask] above
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]
# NOTE: despite its name, df_snacks_3 is not snacks-only - it holds household (17), babies (18) and snacks (19) items
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([17,18,19])] # isin() matches any id in the list, so several departments can be selected at once

In [46]:
# Showing the first 5 rows of the subset for department 19
df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [47]:
# Showing the first 20 rows of the subset for departments 17, 18, 19.
df_snacks_3.head(20)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
13,14,Fresh Scent Dishwasher Cleaner,74,17,6.5
14,15,Overnight Diapers Size 6,56,18,11.2
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
47,48,"School Glue, Washable, No Run",87,17,6.4
55,56,Healthy Pop Butter Popcorn,23,19,4.4
56,57,Flat Toothpicks,111,17,13.2


### 07 Changing a data type for a descriptive column currently set as quantitative.

In [49]:
# Finding another variable that could be changed from numeric to string
df_ords.dtypes

order_id                   object
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [50]:
# Changing user_id to string: like order_id, it is an identifier rather than a quantity.
# After this cell, filters on user_id must compare against strings (e.g. '1', not 1).
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [51]:
# Preview of renaming order_number to customer_order. Without inplace = True, rename() returns a NEW data frame
# with the column relabelled (it does not duplicate the column); the result is not assigned,
# so df_ords keeps the original 'order_number' name.
df_ords.rename(columns = {'order_number' : 'customer_order'}, inplace = False) # inplace = False is the default, so it could be omitted

Unnamed: 0,order_id,user_id,customer_order,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [52]:
# Showing first 5 rows of the dataframe. 
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### 08 Finding the busiest hour of the day

In [53]:
# Finding the busiest hour for placing orders: value_counts() tallies the orders per hour (0-23)
# and sorts from most to fewest, so the first row is the busiest hour.
# dropna = False would also show a NaN row if any hour were missing.
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

##### The busiest hour of the day is 10:00 a.m. (hour 10, with 288,418 orders), followed closely by 11:00 a.m. (284,728 orders).

### 09 Using the data dictionary

In [54]:
# What does 4 mean in the department_id column of df_prods? Look it up under the string key '4'.
print(data_dict.get('4')) # data_dict was created in section 05 of this notebook

{'department': 'produce'}


### 10 Creating a subset for breakfast items

In [55]:
# Breakfast products only (department_id 14); a single id needs just an equality mask
df_breakfast = df_prods.loc[df_prods['department_id'] == 14]

In [56]:
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


### 11 Creating a subset for dinner party items (list of several departments)

In [57]:
# Subset of dinner party items. Department ids per the data dictionary:
# 5 = alcohol, 20 = deli, 7 = beverages, 12 = meat seafood
df_dinner_party = df_prods.loc[df_prods['department_id'].isin([5,20,7,12])]

In [59]:
df_dinner_party.head(30)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [60]:
df_dinner_party.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7650 entries, 2 to 49688
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     7650 non-null   int64  
 1   product_name   7647 non-null   object 
 2   aisle_id       7650 non-null   int64  
 3   department_id  7650 non-null   int64  
 4   prices         7650 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 358.6+ KB


##### The dinner party df has 7,650 rows (3 of them are missing a product_name).

### 12 Creating and using a subset for a single customer

In [68]:
# Every order placed by user 1. user_id was converted to str earlier, so match the string '1'
df_user_1 = df_ords[df_ords['user_id'] == '1']

In [71]:
# Showing first 5 rows of the subset
df_user_1.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [72]:
# Structure of the user 1 subset: .info() reports the row count, the non-null count and the data type of each column.
# (Descriptive statistics come from .describe(), not .info().)
df_user_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, 0 to 10
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   order_id                11 non-null     object 
 1   user_id                 11 non-null     object 
 2   order_number            11 non-null     int64  
 3   orders_day_of_week      11 non-null     int64  
 4   order_hour_of_day       11 non-null     int64  
 5   days_since_prior_order  10 non-null     float64
dtypes: float64(1), int64(3), object(2)
memory usage: 616.0+ bytes


In [75]:
df_user_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


##### From .info() and .describe() we can determine the number of rows and columns, the data type of each column, and basic descriptive statistics. Note that "order_number" should be changed to a str type.

### 13 Data Export to .csv

In [78]:
# Export the wrangled orders data frame to '02 Data/Prepared Data/orders_wrangled.csv'.
# NOTE(review): to_csv() writes the row index by default, so the file will presumably gain an unnamed first
# column (read back as 'Unnamed: 0'); pass index=False if that column is not wanted downstream.
# NOTE(review): CSV stores no data types, so order_id and user_id (cast to str above) will likely be
# inferred as integers again on re-import - confirm and re-cast if needed.
df_ords.to_csv(os.path.join(proj, '02 Data','Prepared Data', 'orders_wrangled.csv'))

In [79]:
# Export the wrangled departments data frame to '02 Data/Prepared Data/departments_wrangled.csv'.
# The index is kept on purpose here: it holds the department ids.
# NOTE(review): the index itself has no name, so the id column will presumably be written without a header
# (read back as 'Unnamed: 0') - confirm, and consider index_label='department_id'.
df_dep_t_new.to_csv(os.path.join(proj, '02 Data','Prepared Data', 'departments_wrangled.csv'))