## Importing Libraries 

In [2]:
import numpy as np 
import pandas as pd 
import os 

## Testing

In [34]:
## creating a data frame with mixed data types 

df_test = pd.DataFrame()

df_test['mix'] = ['a','b',1,True]

## to check whether a dataframe contains any mixed type columns 

for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)


mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [30]:
## changing the column's data type 

df_test['mix'] = df_test['mix'].astype('str')

## Consistency Checks on Products Table

In [40]:
## importing products table 

path = r'/Users/aahamoustafa/Desktop/Data Projects/Instacart Basket Analysis 11-2024'

df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'),  index_col = False)


In [156]:
## finding missing observations 

df_prods.isnull().sum()

## viewing those missing values 

df_nan_prods = df_prods[df_prods['product_name'].isnull() == True]

df_nan_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [None]:
## Addressing these missing values 

df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]


We filtered out the rows with product names that are missing and placed that into a new dataframe

In [63]:
## finding duplicates 

df_dups_prods = df_prods_clean[df_prods_clean.duplicated()]

df_dups_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [69]:
## addressing duplicates 

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()


(49672, 5)

dropped the duplicates and placed them into a new dataframe, at the same time we did not change the original products table as best practice

In [152]:
## exporting products table 

df_prods_clean_no_dups.to_csv(os.path.join(path,'02 Data','Prepared Data', 'products_checked.csv'))

## Consistency Checks on Orders Table 

In [112]:
## importing and investigating data 

cols_list = [ 'order_id', 'user_id', 'order_number', 'order_day_of_week', 'order_hour_of_day', 'days_since_prior_order']

df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False, usecols = cols_list)

## to avoid scientific notation with describe
pd.set_option('display.float_format', '{:.2f}'.format)

df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.21,17.15,2.78,13.45,11.11
std,987581.74,59533.72,17.73,2.05,4.23,9.21
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


Nothing in the data looks off by looking at the data here. 

In [None]:
## checking for mixed data type columns in orders table

for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

No mixed type columns 

In [140]:
## investigating missing values in orders table 

df_ords.isnull().sum()


order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

Missing values are 206209 in the days_since_prior_order. That is normal as every customer's first order will be NaN and we have 206209 different customers.

In [144]:
## check for duplicate values in orders table

df_dups_ords = df_ords[df_ords.duplicated()]

df_dups_ords


Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


No duplicates found

In [147]:
## Exporting orders table 

df_ords.to_csv(os.path.join(path,'02 Data','Prepared Data', 'orders_checked.csv'))