<a href="https://colab.research.google.com/github/John-G-Thomas/DS-Unit-2-Applied-Modeling/blob/master/module2-wrangle-ml-datasets/2020_07_21_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Data

In [None]:
import requests
import tarfile
import pandas as pd

# Download data from AWS
def download(url, chunk_size=1024 * 1024, timeout=60):
    """Download `url` into the current working directory.

    The local filename is the last path segment of the URL. The response
    body is streamed to disk in chunks, so the archive is never held in
    memory as a whole.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    chunk_size : int, optional
        Number of bytes requested per chunk while streaming.
    timeout : float, optional
        Seconds passed to `requests.get` as its `timeout` argument.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    filename = url.split('/')[-1]
    print(f'Downloading {url}')
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail loudly instead of saving an HTTP error page as the archive.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    print(f'Downloaded {filename}')

download('https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz')

# Extract Files
# The context manager closes the archive's file handle once extraction is done.
# NOTE(review): on Python 3.12+ consider `extractall(filter='data')` to restrict
# what the archive is allowed to write.
with tarfile.open('instacart_online_grocery_shopping_2017_05_01.tar.gz') as archive:
    archive.extractall()

# Load files into DataFrames (one CSV per table, read in the order listed)
orders, order_products_train, order_products_prior, products = map(
    pd.read_csv,
    [
        'instacart_2017_05_01/orders.csv',
        'instacart_2017_05_01/order_products__train.csv',
        'instacart_2017_05_01/order_products__prior.csv',
        'instacart_2017_05_01/products.csv',
    ],
)

Downloading https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz
Downloaded instacart_online_grocery_shopping_2017_05_01.tar.gz


# EDA

In [None]:
from IPython.display import display

frames = [orders, order_products_prior, order_products_train, products]

# Preview every raw table: its first rows, then its (rows, columns) shape
# followed by a blank spacer line.
for df in frames:
    display(df.head())
    print(df.shape, end='\n\n')

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


(3421083, 7)



Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


(32434489, 4)



Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


(1384617, 4)



Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


(49688, 4)



In [None]:
# Does the number of unique order_ids in `order_products_prior` and
# `order_products_train` match the number of `prior` and `train`
# rows in the `orders` DataFrame?

# Count the rows of `orders` per `eval_set` value.
orders['eval_set'].value_counts()

prior    3214874
train     131209
test       75000
Name: eval_set, dtype: int64

In [None]:
# Distinct orders that have line items in `order_products_prior`;
# compare with the `prior` count from `orders['eval_set']`.
order_products_prior['order_id'].nunique()

3214874

In [None]:
# Distinct orders that have line items in `order_products_train`;
# compare with the `train` count from `orders['eval_set']`.
order_products_train['order_id'].nunique()

131209

In [None]:
# How can we find the names of the products referenced in the `order_products_*` DataFrames?

In [None]:
# Line items identify products only by `product_id`; there is no name column yet.
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [None]:
# `products` maps each `product_id` to its `product_name` (plus aisle and department ids).
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [None]:
# Shape before the merge, kept for comparison with the shape after it.
order_products_train.shape

(1384617, 4)

In [None]:
# Attach product metadata (name, aisle, department) to every train line item.
# The guard keeps this cell idempotent: merging a second time would produce
# `product_name_x` / `product_name_y` columns and break the cells that follow.
if 'product_name' not in order_products_train.columns:
    order_products_train = pd.merge(
        order_products_train,
        products,
        on='product_id',
        how='inner',
        # Fail fast if `products` ever holds duplicate product_ids, which
        # would silently multiply line items.
        validate='many_to_one',
    )

In [None]:
# Shape after the merge: the row count should equal the pre-merge count,
# showing the inner join neither dropped nor duplicated line items.
order_products_train.shape

(1384617, 7)

In [None]:
# Each line item now carries its product name, aisle id and department id.
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,1,49302,1,1,Bulgarian Yogurt,120,16
1,816049,49302,7,1,Bulgarian Yogurt,120,16
2,1242203,49302,1,1,Bulgarian Yogurt,120,16
3,1383349,49302,11,1,Bulgarian Yogurt,120,16
4,1787378,49302,8,0,Bulgarian Yogurt,120,16


In [None]:
# Attach product metadata (name, aisle, department) to every prior line item.
# The guard keeps this cell idempotent: merging a second time would produce
# `product_name_x` / `product_name_y` columns and break the cells that follow.
if 'product_name' not in order_products_prior.columns:
    order_products_prior = pd.merge(
        order_products_prior,
        products,
        on='product_id',
        how='inner',
        # Fail fast if `products` ever holds duplicate product_ids, which
        # would silently multiply line items.
        validate='many_to_one',
    )

## Defining Our Problem

- Will a customer order one particular product?
- Which product should we choose? The most frequently ordered one.

But what is the most frequently ordered product?

In [None]:
# Prior line items, now including `product_name` from the merge with `products`.
order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16


In [None]:
# Five most frequently ordered products across the prior line items
# (`value_counts` sorts by count, descending).
order_products_prior['product_name'].value_counts().head()

Banana                    472565
Bag of Organic Bananas    379450
Organic Strawberries      264683
Organic Baby Spinach      241921
Organic Hass Avocado      213584
Name: product_name, dtype: int64

**Will a customer order `'Banana'`?**

In [None]:
# Step 1: Create a feature matrix from `orders`
# Keep only the `train` orders and arrange them by ascending order_id.
X = orders.loc[orders['eval_set'].eq('train')].sort_values(by='order_id')
print(X.shape)
X.head()

(131209, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1868044,1,112108,train,4,4,10,9.0
1322255,36,79431,train,23,6,18,30.0
709772,38,42756,train,6,6,16,24.0
284948,96,17227,train,7,6,20,30.0
941403,98,56463,train,41,3,8,14.0


In [None]:
# Step 2: Create Targets
# Identify when a `Banana` is ordered in `order_products_train`.
# Preview the line items first: `product_name` is the column to match on.
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,1,49302,1,1,Bulgarian Yogurt,120,16
1,816049,49302,7,1,Bulgarian Yogurt,120,16
2,1242203,49302,1,1,Bulgarian Yogurt,120,16
3,1383349,49302,11,1,Bulgarian Yogurt,120,16
4,1787378,49302,8,0,Bulgarian Yogurt,120,16


In [None]:
# Flag every line item whose product name is exactly 'Banana',
# then preview the flagged rows.
order_products_train['is_banana'] = order_products_train['product_name'].eq('Banana')
order_products_train.loc[order_products_train['is_banana']].head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,is_banana
129688,226,24852,2,0,Banana,24,4,True
129689,473,24852,2,0,Banana,24,4,True
129690,878,24852,2,1,Banana,24,4,True
129691,1042,24852,1,1,Banana,24,4,True
129692,1139,24852,1,1,Banana,24,4,True


In [None]:
# Step 2b: Identify the `order_id`s that contain `'Banana'`

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1868044,1,112108,train,4,4,10,9.0
1322255,36,79431,train,23,6,18,30.0
709772,38,42756,train,6,6,16,24.0
284948,96,17227,train,7,6,20,30.0
941403,98,56463,train,41,3,8,14.0


In [None]:
# order_ids of the train line items flagged as 'Banana'
# (row mask and column selected in a single `.loc` lookup).
ordered_banana = order_products_train.loc[order_products_train['is_banana'], 'order_id']

In [None]:
# Number of distinct train orders that contain a 'Banana'
# (a scalar summary rather than a dump of the whole Series).
ordered_banana.nunique()

18726

In [None]:
# Target flag: True when the order's id appears among the banana order_ids.
X = X.assign(ordered_banana=X['order_id'].isin(ordered_banana))

In [None]:
# Confirm the new `ordered_banana` column is present.
X.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,ordered_banana
1868044,1,112108,train,4,4,10,9.0,False
1322255,36,79431,train,23,6,18,30.0,False
709772,38,42756,train,6,6,16,24.0,False
284948,96,17227,train,7,6,20,30.0,False
941403,98,56463,train,41,3,8,14.0,False


In [None]:
# Pull the target out, then drop it together with the identifier and
# bookkeeping columns so only the order-level features remain in X.
y = X.loc[:, 'ordered_banana']
X = X.drop(columns=['order_id', 'user_id', 'eval_set', 'ordered_banana'])

In [None]:
# The target is a boolean Series that shares X's index.
y.head()

1868044    False
1322255    False
709772     False
284948     False
941403     False
Name: ordered_banana, dtype: bool

## What's our baseline?

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Class proportions of the training target. The majority-class share is the
# accuracy a model would get by always predicting that class.
y_train.value_counts(normalize=True)

False    0.857584
True     0.142416
Name: ordered_banana, dtype: float64