##### InstaCart Project 

## Ex4.5 Data Consistency

### Contents
- 1. Import libraries
- 2. Import orders df, products df and departments df
- 3. Accuracy Checks
- 4. Mixed data types
- 5. Missing Values
- 6. Duplicates
- 7. Exporting dataframe
- 8. Task questions

# 01 Import libraries

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import os

# 02 Import Dataframes

In [2]:
#Create filepath
path = r'//Users/amypalomino/Documents/(03.22) InstaCart Basket Analysis'

In [3]:
#Importing products.csv as df_prods
df_prods = pd.read_csv(
    os.path.join(
        path, '02 Data', 'Original Data', 'products.csv'),
index_col = False)

In [4]:
#Importing wrangled orders.csv as df_ords
df_ords = pd.read_csv(
    os.path.join(
        path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
index_col = False)

# 03 Accuracy checks

In [5]:
#Accuracy check of df_ords
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [8]:
#I have an unnamed column which is concerning.
df_ords.info
#I think it's just the index column that pandas assigned?

<bound method DataFrame.info of          Unnamed: 0  order_id  user_id  order_number  order_day_of_week  \
0                 0   2539329        1             1                  2   
1                 1   2398795        1             2                  3   
2                 2    473747        1             3                  3   
3                 3   2254736        1             4                  4   
4                 4    431534        1             5                  4   
...             ...       ...      ...           ...                ...   
3421078     3421078   2266710   206209            10                  5   
3421079     3421079   1854736   206209            11                  4   
3421080     3421080    626363   206209            12                  1   
3421081     3421081   2977660   206209            13                  1   
3421082     3421082    272231   206209            14                  6   

         order_hour_of_day  days_since_prior_order  
0             

In [6]:
#Accuracy check of df_prods
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


### max price looks like an error. Must be skewing the std. dev. and the mean considerably

# 04 Test dataframe for mixed data tpyes

In [10]:
# Create a dataframe
df_test = pd.DataFrame()

In [11]:
# Create a mixed type column
df_test['mix'] = ['a','b', 1, True]

In [12]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [13]:
# Check for mixed types
for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [14]:
df_test['mix'] = df_test['mix'].astype('str')

# 05 Finding Missing values

In [15]:
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [16]:
# Create subset of missing values to view more detail
df_nan = df_prods[df_prods['product_name'].isnull() == True]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [17]:
df_prods.shape

(49693, 5)

In [18]:
# Removing missing values (16 out of 49693 is a small proportion so won't alter the data)
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [19]:
#Confirming changes have been made and missing records removed 
df_prods_clean.shape

(49677, 5)

# 06 Duplicates

In [20]:
# Looking for duplicates in df_prods_clean
df_dups = df_prods_clean[df_prods_clean.duplicated()]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [21]:
# Removing duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [22]:
df_prods_clean_no_dups.shape

(49672, 5)

# 07 Export new product dataframe

In [23]:
# Export df_prods_clean_no_dups as products_checked.csv
df_prods_clean_no_dups.to_csv(os.path.join
                              (path, '02 Data', 'Prepared Data', 'products_checked.csv'))

# 08 Ex 4.5 Task Questions

# Q2 

#Q2. Answered in output 6 above. Max price is 99999 which must be an error adn will be skweing the mean and std deviation stats. 

In [26]:
# Investigating max price in products
df_prods_clean_no_dups['prices'].value_counts()

2.5        470
5.3        458
6.2        451
2.6        447
5.4        444
          ... 
15.6         1
21.0         1
99999.0      1
14900.0      1
18.3         1
Name: prices, Length: 242, dtype: int64

In [29]:
# Which products in the dataframe have the highest prices?
df_top_prices = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] > 18.3]

In [30]:
df_top_prices

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5
83,84,Lamb Shank,7,12,24.3
174,175,T Bone Steak,122,12,18.6
193,194,Lamb Rib Chops,122,12,18.9
304,305,Wild Forest Raw Neem Honey,29,13,20.0
...,...,...,...,...,...
48966,48962,Hardwood Smoked Sliced Bacon,106,12,23.1
49240,49236,80% Lean Ground Beef,122,12,20.7
49440,49436,Imitation Crab Flakes,15,12,23.5
49655,49651,Beef Brisket,122,12,20.7


In [33]:
# What is the median price?
df_prods_clean_no_dups['prices'].median()

7.1

In [34]:
# Which products in the dataframe have the highest prices?
df_top_prices = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] > 100]

In [35]:
df_top_prices

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


#Q2 next steps
Both records are priced inappropriately for what they are. Using common sense, I would impute the cheese item to 1.49. 
I'd like to look at the other milk items before imputing this second item.

In [49]:
#Looking at milk products to determine correct price
df_milk = []
for products in df_prods_clean_no_dups['product_name']:
    if 'milk' in products:
        df_milk.append(products)


In [50]:
df_milk

['Pancake Mix, Buttermilk',
 'Very Vanilla Soymilk',
 'Vanilla Almond Cashew Almondmilk Cashewmilk Blend',
 'Organic Whole Grassmilk Milk',
 'Sweet Cream Buttermilk Powder',
 'Complete Funfetti Buttermilk Pancake & Waffle Mix',
 'Shake n Pour Buttermilk Pancake Mix',
 'Old Fashioned Buttermilk Ranch Salad Dressing',
 'Buttermilk Biscuits',
 'Plain Soymilk Creamer',
 'Light Buttermilk Ranch Salad Dressing',
 'BoomChocoBoom Gluten Free Ricemilk Crunch Bar',
 'West Soymilk',
 'Almond Cashew Unsweetened Almondmilk Cashewmilk Blend',
 'Pumpkin Spice Latte Iced Coffee Pure Cold Brew Coffee With Almondmilk',
 'Flaxmilk 1200 mg Omega-3 Vanilla',
 'Organic Chocolate Almondmilk Pudding',
 'Toasted Coconut Almondmilk Blend',
 'Dark Chocolate Almondmilk',
 'Very Vanilla Soymilk Singles',
 'Buttermilk Ranch Pretzel Pieces',
 'Almondmilk, Pure, Chocolate Protein',
 'Organic Vanilla Soymilk',
 'Jumbos Buttermilk Biscuits',
 'Turtle Trails Soymilk Frozen Dessert',
 'Vanilla Soymilk',
 'Plain Soymilk',

In [57]:
# Half way there! I'll look up the 'Organic Whole Grassmilk Milk' and see what price it's listed at
print(df_prods_clean_no_dups['prices'].get('Organic Whole Grassmilk Milk'))

None


#Hmmmm, this hasn't gone to plan... I'm also not sure how to overwrite a record yet anyway, so I'll move on for now! 

# Q3

In [58]:
# Checking for mixed data in df_ords
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

In [None]:
#No output was returned, which means there are no mixed data columns in this dataframe

In [59]:
df_ords.dtypes #Double checking data types in df_ords
# if one fo the id columns were containing mixed data types, I would convert them to string - 
# df_ords['order_id'] = df_ords['order_id'] astype ('Str')

Unnamed: 0                  int64
order_id                    int64
user_id                     int64
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

# Q5

In [61]:
# Checking for missing values in df_ords
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

# Q6 
There are 206,209 missing values in the days_since_prior_order. Due to the nature of this record, you'd expect this to be the case as every single customer will place their first order, making the record in this column always null. 
I would create a new variable, acting as a flag to indicate 'new_customer'. You could then look at how many are repeat customers whcih would be an interesting insight

# Q7

In [62]:
#Checking for duplicate records in df_ords
df_ords_dups = df_ords[df_ords.duplicated()] # Checks for 'full duplicates'

In [64]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


In [69]:
# There are no full duplicates in the orders dataframe. If there had been, I'd have carefully considered if they were okay to remove and then used - 
# drop_duplicates() 

# Export dataframes

In [71]:
# Export df_prods_clean_no_dups as products_checked.csv
df_prods_clean_no_dups.to_csv(os.path.join
                              (path, '02 Data', 'Prepared Data', 'products_checked.csv'))

In [72]:
# Export df_ords as orders_checked.csv
df_ords.to_csv(os.path.join
                (path, '02 Data', 'Prepared Data', 'orders_checked.csv'))