# Predict Future Sale - Feature Engineering
## Final project for "How to win a data science competition" Coursera course
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data  
>Student: Rafael Caneiro de Oliveira  
>Email: rafael.caneiro@gmail.com  
>Date: 07/10/2020

## Load

In [1]:
import numpy as np
import pandas as pd
from itertools import product

from pathlib import Path

PATH = Path.cwd().parent
DATA_PATH = Path(PATH, "./data/raw/") 
MODEL_PATH = Path(PATH, "./models/") 

### Memory Optimization Function

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

### Sales

In [3]:
sales = pd.read_csv(Path(DATA_PATH,"sales_train.csv"))
sales["date"] = pd.to_datetime(sales["date"], format="%d.%m.%Y")
sales["pk"] = sales.shop_id.map(str) + "#" + \
             sales.item_id.map(str) + "#" + \
             sales.date_block_num.map(str)
sales.reset_index(drop=False, inplace=True)
sales = reduce_mem_usage(sales)
print(sales.shape)
sales.head()

Memory usage after optimization is: 84.00 MB
Decreased by 53.1%
(2935849, 8)


Unnamed: 0,index,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,pk
0,0,2013-01-02,0,59,22154,999.0,1.0,59#22154#0
1,1,2013-01-03,0,25,2552,899.0,1.0,25#2552#0
2,2,2013-01-05,0,25,2552,899.0,-1.0,25#2552#0
3,3,2013-01-06,0,25,2554,1709.050049,1.0,25#2554#0
4,4,2013-01-15,0,25,2555,1099.0,1.0,25#2555#0


### Items

In [7]:
items = pd.read_csv(Path(DATA_PATH,"items.csv"))
items = reduce_mem_usage(items)
items.head()

Memory usage after optimization is: 0.23 MB
Decreased by 54.2%


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


### Categories

In [8]:
categories = pd.read_csv(Path(DATA_PATH,"item_categories.csv"))
categories = reduce_mem_usage(categories)
categories.head()

Memory usage after optimization is: 0.00 MB
Decreased by 39.9%


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


### Shops

In [9]:
shops = pd.read_csv(Path(DATA_PATH,"shops.csv"))
shops = reduce_mem_usage(shops)
shops.head()

Memory usage after optimization is: 0.00 MB
Decreased by 38.6%


Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


### Test

In [10]:
test = pd.read_csv(Path(DATA_PATH,"test.csv"))
test["date_block_num"]=34
test = reduce_mem_usage(test)
test.head()

Memory usage after optimization is: 1.63 MB
Decreased by 75.0%


Unnamed: 0,ID,shop_id,item_id,date_block_num
0,0,5,5037,34
1,1,5,5320,34
2,2,5,5233,34
3,3,5,5232,34
4,4,5,5268,34


### Grid

In [11]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

grid["pk"] = grid.shop_id.map(str) + "#" + \
             grid.item_id.map(str) + "#" + \
             grid.date_block_num.map(str)

grid = reduce_mem_usage(grid)
        
grid.head()

Memory usage after optimization is: 124.90 MB
Decreased by 40.0%


Unnamed: 0,shop_id,item_id,date_block_num,pk
0,59,22154,0,59#22154#0
1,59,2552,0,59#2552#0
2,59,2554,0,59#2554#0
3,59,2555,0,59#2555#0
4,59,2564,0,59#2564#0


## FeatureTools

In [12]:
import featuretools as ft

# EntitySet
es = ft.EntitySet(id="sales")

In [13]:
# Adding entities

es = es.entity_from_dataframe(entity_id="grid",
                              dataframe=grid,
                              index="pk")

# Sales
es = es.entity_from_dataframe(entity_id="train_sales",
                              dataframe=sales,
                              index="index",
                              time_index="date")

# Items
es = es.entity_from_dataframe(entity_id="items",
                              dataframe=items,
                              index="item_id")

# Categories
es = es.entity_from_dataframe(entity_id="categories",
                              dataframe=categories,
                              index="item_category_id")

# Shops
es = es.entity_from_dataframe(entity_id="shops",
                              dataframe=shops,
                              index="shop_id")

In [14]:
# Adding Relationships

es = es.add_relationship(ft.Relationship(es["grid"]["pk"], 
                                         es["train_sales"]["pk"]))

es = es.add_relationship(ft.Relationship(es["items"]["item_id"], 
                                         es["train_sales"]["item_id"]))

es = es.add_relationship(ft.Relationship(es["categories"]["item_category_id"],
                                         es["items"]["item_category_id"]))

es = es.add_relationship(ft.Relationship(es["shops"]["shop_id"],
                                         es["train_sales"]["shop_id"]))

In [15]:
es.relationships

[<Relationship: train_sales.pk -> grid.pk>,
 <Relationship: train_sales.item_id -> items.item_id>,
 <Relationship: items.item_category_id -> categories.item_category_id>,
 <Relationship: train_sales.shop_id -> shops.shop_id>]

In [17]:
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity="grid",
                                      max_depth=3,
                                      n_jobs=4)

EntitySet scattered to 4 workers in 69 seconds




CancelledError: calculate_chunk-49857a28823c6e7c26738d4e33dd7686

In [None]:
feature_matrix.head()

In [None]:
feature_matrix.to_csv("feature_matrix.csv")

In [86]:
fm = feature_matrix.sort_index()
fm.reset_index(drop=False, inplace=True)
fm.head


KeyError: 'pk'

In [10]:
#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})

#fix column names
gb.rename({"item_cnt_day":"target"}, inplace=True, axis=1)

#join aggregated data to the grid
train = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

#sort the data
train.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

train = reduce_mem_usage(train)

Memory usage after optimization is: 145.72 MB
Decreased by 36.4%


In [6]:
train.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


In [5]:
train = train.merge(items, on="item_id", how="left")
train = train.merge(categories, on="item_category_id", how="left")
train = train.merge(shops, on="shop_id", how="left")
train = reduce_mem_usage(train)
train.head() 

Memory usage after optimization is: 156.12 MB
Decreased by 58.3%


Unnamed: 0,shop_id,item_id,date_block_num,target,item_category_id
0,0,19,0,0.0,40
1,0,27,0,0.0,19
2,0,28,0,0.0,30
3,0,29,0,0.0,23
4,0,32,0,6.0,40


In [6]:
test = test.merge(items, on="item_id", how="left")
test = test.merge(categories, on="item_category_id", how="left")
test = test.merge(shops, on="shop_id", how="left")
test = reduce_mem_usage(test)
test.head()

Memory usage after optimization is: 3.47 MB
Decreased by 64.6%


Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id
0,0,5,5037,34,19
1,1,5,5320,34,55
2,2,5,5233,34,19
3,3,5,5232,34,23
4,4,5,5268,34,20


## Test / Validation Split

In [7]:
def get_index(start, end, df, col):
    ls = [x for x in range(start , end + 1)]
    idx = df[df[col].isin(ls)].index.ravel()
    return idx

# 2013-jan >> 2013-oct
train_1 = get_index(0, 10, train, "date_block_num")

# 2013-nov
val_1 = get_index(10, 10, train, "date_block_num")

# 2014-jan >> 2014-oct
train_2 = get_index(12, 22, train, "date_block_num")

# 2014-nov
val_2 = get_index(22, 22, train, "date_block_num")

# 2015-jan >> 2015-sep
train_3 = get_index(24, 33, train, "date_block_num")

# 2015-oct
val_3 = get_index(33, 33, train, "date_block_num")

# 2013-jan >> 2015-sep
train_all = get_index(0, 33, train, "date_block_num")

folds = [(train_1, val_1), (train_2, val_2), (train_3, val_3)]

## Features / Target

In [8]:
remove_cols = ["target", "item_name", "item_category_name", "shop_name"]
features = [col for col in train.columns if col not in remove_cols]
target="target"

In [9]:
train.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_category_id
0,0,19,0,0.0,40
1,0,27,0,0.0,19
2,0,28,0,0.0,30
3,0,29,0,0.0,23
4,0,32,0,6.0,40
