# Contents

## 01 Set up

## 02 Working with Mixed Data Types

## 03 Identifying and Handling Missing Values

## 03 Identifying and Eliminating Duplicates

## 04 Working with the describe function

## 05 When missing values are valid

## 06 Checking for duplicates in the dataframe

## 07 Exporting to .csv

### 01 Set up

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Defining the main project path (raw string, so the Windows backslashes are kept literally)
# NOTE(review): this is an absolute, machine-specific path - update it before running on another machine
proj = r'C:\Users\bfd_6\Documents\Career Foundry Project\Course Part 2\A4 Python\Instacart Basket Analysis'

In [3]:
# Loading the original products data set from the project's '02 Data/Original Data' folder
df_prods = pd.read_csv(
    filepath_or_buffer=os.path.join(proj, '02 Data', 'Original Data', 'products.csv'),
)

In [4]:
# Loading the wrangled orders data set from the project's '02 Data/Prepared Data' folder
# index_col=False: no column of the file is used as the data frame index
# NOTE(review): the loaded frame still carries an 'Unnamed: 0' column (visible in the outputs below),
# presumably an index written by an earlier export - confirm and drop it if it is not wanted
df_ords = pd.read_csv(
    filepath_or_buffer=os.path.join(proj, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

### 02 Working with Mixed Data Types

In [5]:
# Creating an empty test data frame to demonstrate the mixed data type check
df_test = pd.DataFrame()

In [6]:
# Creating a mixed type column in the test df (two strings, an integer and a boolean)
df_test['mix'] = ['a', 'b', 1, True]

In [7]:
# Previewing the first rows of the test data frame
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Looking for columns with mixed data types
# The applymap-based code from the exercise raised an error here, so DataFrame.map is used instead.
# A column is printed when any of its values has a different Python type than the value in its first row.
for column_name in df_test.columns.tolist():
    single_column = df_test[[column_name]]
    first_row_type = single_column.iloc[0].apply(type)
    type_mismatch = (single_column.map(type) != first_row_type).any(axis=1)
    if type_mismatch.any():
        print(column_name)

mix


In [9]:
# Changing the data type for column 'mix' in df_test to string, so every value in the column has the same type
df_test['mix'] = df_test['mix'].astype('str')

### 03 Identifying and Handling Missing Values

In [10]:
# Counting the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [11]:
# Creating a subset of df_prods with missing values in product_name
# isna() already returns a boolean mask, so it is used directly (no '== True' comparison needed)
df_nan = df_prods[df_prods['product_name'].isna()]

In [12]:
# Displaying the rows of df_prods that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [13]:
# Finding the number of rows in the data frame prior to acting on missing values
# (shape is a (rows, columns) tuple)
df_prods.shape

(49693, 5)

In [4]:
# Creating a "clean" subset with no missing product names
# notna() is the direct boolean mask for "value present" (replaces the '.isnull() == False' comparison)
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [15]:
# Confirming the row count after removing the 16 rows with a missing product_name
df_prods_clean.shape

(49677, 5)

### 03 Identifying and Eliminating Duplicates

In [5]:
# Collecting the full-row duplicates found in df_prods_clean into their own data frame
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [17]:
# Displaying the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [6]:
# Removing repeated rows from df_prods_clean, keeping the first occurrence of each
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [7]:
# Checking rows and columns for dataframe after dropping duplicates
# (the row count should be lower than df_prods_clean by the number of rows in df_dups)
df_prods_clean_no_dups.shape

(49672, 5)

### 04 Working with the describe function

In [20]:
# Summary statistics for df_ords; every value is formatted to three decimal places to avoid scientific notation
df_ords.describe().map(lambda value: format(value, '0.3f'))

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.208,17.155,2.776,13.452,11.115
std,987581.74,987581.74,59533.718,17.733,2.047,4.226,9.207
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565811.5,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


##### Nothing appears to be off based on the describe() function

In [23]:
# Looking for columns with mixed data types
# The applymap-based code from the exercise raised an error here, so DataFrame.map is used instead.
# A column is printed when any of its values has a different Python type than the value in its first row.
for column_name in df_ords.columns.tolist():
    single_column = df_ords[[column_name]]
    first_row_type = single_column.iloc[0].apply(type)
    type_mismatch = (single_column.map(type) != first_row_type).any(axis=1)
    if type_mismatch.any():
        print(column_name)


##### There were no mixed data type columns found.

In [24]:
# Counting the missing values in each column of df_ords
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

### 05 When missing values are valid

In [25]:
# Creating a subset of df_ords with missing values in days_since_prior_order
# isna() already returns a boolean mask, so it is used directly (no '== True' comparison needed)
df_ords_missing = df_ords[df_ords['days_since_prior_order'].isna()]

In [26]:
# Inspecting the first 30 rows that have no days_since_prior_order value
df_ords_missing.head(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
50,50,2086598,6,1,5,18,
54,54,2565571,7,1,3,9,
75,75,600894,8,1,6,0,
79,79,280530,9,1,1,17,
83,83,1224907,10,1,2,14,


In [27]:
# Inspecting the last 30 rows that have no days_since_prior_order value
df_ords_missing.tail(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
3420548,3420548,566068,206180,1,0,11,
3420567,3420567,351920,206181,1,0,15,
3420582,3420582,485448,206182,1,0,8,
3420593,3420593,31169,206183,1,0,13,
3420608,3420608,180109,206184,1,6,12,
3420613,3420613,2168901,206185,1,5,19,
3420624,3420624,1831589,206186,1,6,14,
3420628,3420628,474057,206187,1,2,23,
3420663,3420663,1944921,206188,1,4,10,
3420671,3420671,2966736,206189,1,4,15,


In [28]:
# Creating a subset of df_ords to test for unintended missing values (order number > 1)
# The filter must be on order_number: a comparison against NaN is always False, so filtering on
# days_since_prior_order would silently drop every missing value (and the valid values 0 and 1),
# making a null count on this subset 0 regardless of what the data contains.
df_ords_repeat_cust = df_ords[df_ords['order_number'] > 1]

In [29]:
# Counting the missing values in each column of df_ords_repeat_cust
df_ords_repeat_cust.isna().sum()

Unnamed: 0                0
order_id                  0
user_id                   0
order_number              0
orders_day_of_week        0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

##### The data dictionary indicates that NaN is an appropriate value for days_since_prior_order when order_number = 1. No changes to the data are necessary. If analysis requires exclusion of the null values, a subset should be used.

### 06 Checking for duplicates in the dataframe

In [31]:
# Collecting the full-row duplicates found in df_ords into their own data frame
df_dups_ords = df_ords.loc[df_ords.duplicated()]

In [32]:
# Displaying the full-row duplicates from df_ords (an empty result means none were found)
df_dups_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [33]:
# Collecting the rows whose order_id value has already appeared in an earlier row
df_dups_ords_id = df_ords[df_ords.duplicated(subset='order_id')]

In [34]:
# Displaying the rows with a repeated order_id (an empty result means none were found)
df_dups_ords_id

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [36]:
# Counting full-row duplicates directly: duplicated() flags each repeated row as True and sum() adds up
# those flags, so the total is returned without creating an additional data frame
df_ords.duplicated().sum()

0

In [37]:
# Counting rows with duplicate data in the order_id column the same way: sum the boolean flags from
# duplicated() restricted to order_id, without creating an additional data frame
df_ords.duplicated(subset = 'order_id').sum()

0

##### I first used the method described in the exercise and created new data frames containing any duplicate rows. I also checked for duplicate values in order_id, which should be unique in this data frame. I then used a second method that returns only the number of duplicate rows or values, by summing the True boolean values returned by duplicated(). Neither method found any duplicate rows or any duplicate values in the order_id column.

### 07 Exporting to .csv

In [38]:
# Writing the products data set (missing product names and duplicate rows removed) to the prepared data folder
# NOTE(review): to_csv writes the data frame index as an extra first column unless index=False is passed;
# confirm whether later notebooks expect that column before changing it
df_prods_clean_no_dups.to_csv(
    path_or_buf=os.path.join(proj, '02 Data', 'Prepared Data', 'products_checked.csv'),
)

In [None]:
# No changes were required for the orders data set in this module, so the orders_wrangled.csv will remain unchanged.