# Import data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Assign the project folder path to a variable.
# The raw-string prefix (r'...') keeps the backslashes of the Windows path literal.
path = r'D:\Docs\Career Foundry\II. Data Immersion\4. Python Fundamentals for Data Analysts\Instacart Basket Analysis - Alina Racu'

In [3]:
# Import the wrangled orders file ('4.4 orders_wrangled.csv') from the Prepared Data folder
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', '4.4 orders_wrangled.csv'), index_col = False)

In [4]:
# Import the products.csv file from the Original Data folder
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

# Data consistency checks

## Addressing mixed type data

### Dummy df

In [5]:
# Create an empty dummy dataframe to experiment on
df_test = pd.DataFrame()

In [6]:
# Create a mixed type column: two strings, an integer and a boolean
df_test["mix"] = ["a", "b", 1, True]

In [7]:
# Display the dummy dataframe to check the new column
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Check whether a dataframe contains any mixed-type columns.
# A column is mixed when its values have more than one Python type, which is the
# same condition as "some row's type differs from the first row's type".
# Series.map is used because DataFrame.applymap is deprecated since pandas 2.1;
# counting distinct types also avoids the .iloc[0] lookup, which fails on an
# empty dataframe.
for col in df_test.columns.tolist():
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


In [9]:
# Convert every value in the mixed-type column to string so the column holds a single type
df_test["mix"] = df_test["mix"].astype("str")

In [10]:
# Change data type from string to numeric
# df_test["mix"] = df_test["mix"].astype("int64")

## Addressing missing values

## df_prods

In [11]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [12]:
# Keep only the rows whose product_name is missing
df_nan = df_prods[df_prods["product_name"].isna()]

In [13]:
# Display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [14]:
# Check no. of rows and columns in the original dataframe (baseline for the comparison below)
df_prods.shape

(49693, 5)

In [15]:
# Keep only the rows that do have a product_name
df_prods_clean = df_prods[df_prods["product_name"].notna()]

In [16]:
# Display the subset without the missing product names
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [17]:
# Check no. of rows in the new subset and compare with the original dataframe;
# it should be 16 rows shorter, one for each missing product_name counted above
df_prods_clean.shape

(49677, 5)

In [18]:
# Alternatively, drop the rows with a missing product name
# df_prods_clean = df_prods.dropna(subset = ["product_name"])

## Addressing duplicates

## df_prods_clean

In [19]:
# Collect the fully duplicated rows flagged by .duplicated()
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [20]:
# Display the duplicated rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [21]:
# Check no. of rows in the dataframe before removing the duplicates
df_prods_clean.shape

(49677, 5)

In [22]:
# Keep every row that .duplicated() does not flag as a repeat
df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

In [23]:
# Check no. of rows in the de-duplicated subset and compare with df_prods_clean
df_prods_clean_no_dups.shape

(49672, 5)

# Task

In [24]:
# Exercise 2: Run the df.describe() function on the products data
# (run here on the cleaned, de-duplicated df_prods_clean_no_dups rather than the raw df_prods)
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


Prices range from 1 to 99999. The maximum is implausible for grocery products (75% of the prices are at or below 11.1), so it is most likely an erroneous value that needs to be investigated.

In [25]:
# Exercise 3: Check for mixed-type data in your "df_ords" dataframe.
# A column is mixed when its values have more than one Python type, which is the
# same condition as "some row's type differs from the first row's type".
# Series.map is used because DataFrame.applymap is deprecated since pandas 2.1;
# counting distinct types also avoids the .iloc[0] lookup, which fails on an
# empty dataframe.
for col in df_ords.columns.tolist():
    if df_ords[col].map(type).nunique() > 1:
        print(col)

In [26]:
# Exercise 4: If you find mixed-type data, fix it. The column in question should contain observations of a single data type.

There appears to be no mixed-type data in df_ords: the check above printed no column names, so nothing needs fixing.

In [27]:
# Exercise 5: Count the missing values in each column of df_ords.
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [28]:
# Keep only the orders whose days_since_prior_order is missing
df_ords_missing = df_ords[df_ords["days_since_prior_order"].isna()]

In [29]:
# Display the first rows that lack a days_since_prior_order value
df_ords_missing.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,


There are 206209 missing values in column "days_since_prior_order". 
We can see that the rows with missing values have one thing in common: order_number = 1. Each of these rows is a user's first order, which means that there is no prior order from which to count the days.

In [30]:
# Exercise 6: Address the missing values using an appropriate method.

I would not remove any rows or impute any values, since these rows are each user's first order and the missing value simply means that no prior order exists. In fact, one may add a flag column (e.g. "first_order") to mark these orders.

In [31]:
# Exercise 7: Collect the fully duplicated rows of df_ords flagged by .duplicated().
df_ords_dup = df_ords.loc[df_ords.duplicated()]

In [32]:
# Display the duplicated rows (if any)
df_ords_dup

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


In [33]:
# Check number of duplicated rows
df_ords_dup.shape

(0, 7)

There are no duplicate rows in df_ords.

# Export Data

In [34]:
# Export the cleaned, de-duplicated products dataframe.
# index = False keeps the row index out of the file, so re-importing it does not
# produce an extra "Unnamed: 0" column.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', '4.5 products_checked.csv'), index = False)

In [35]:
# Export the checked orders dataframe.
# index = False stops a further unnamed index column being written on top of the
# "Unnamed: 0" column that df_ords already contains.
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', '4.5 orders_checked.csv'), index = False)