## 4. Time Series Classification & Prediction with Deep Learning
You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is the goal.
File descriptions
* sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
* test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.
* sample_submission.csv - a sample submission file in the correct format.
* items.csv - supplemental information about the items/products.
* item_categories.csv - supplemental information about the item categories.
* shops.csv - supplemental information about the shops.

#### Hint:
* A hybrid CNN-LSTM model is recommended
* Dimensionality Reduction
    - Use Several Classifiers/ Ensemble Method
    - Logistic Regression (with different c values)
    - Random Forest (with different estimator values) 
    - SVM (with different kernels)
    - KNN (with k = 1,2,5,10,20)
    - K (3,5,10) Fold Cross Validation
* Performance Comparison
    - Classification Accuracy, Precision, Recall, Sensitivity, Specificity
    - AUC, ROC Curve
    - Confusion Matrix

In [92]:
# Reload edited project modules automatically before each cell runs.
%reload_ext autoreload
%autoreload 2
# NOTE(review): these wildcard imports hide where helpers come from. `merge_data`,
# used further down, is not defined in the visible cells, so it presumably comes
# from one of these modules -- confirm, and prefer explicit imports.
from Helpers import *
from AccuracyMetrics import *
from CustomMods import *

In [93]:
import gc
# Run a full garbage-collection pass; the value shown as the cell output is the
# number of unreachable objects the collector found.
gc.collect()

92

In [94]:
# Suppress warnings
def warn(*args, **kwargs):
    """Accept any positional/keyword arguments, do nothing and return None.

    Serves as a silent stand-in that can be assigned over ``warnings.warn``.
    """
    return None
import warnings
import os as os
import numpy as np
import pandas as pd
import re as regx
import gc as gc
import sys as sys
from datetime import datetime
from time import mktime
import matplotlib
import matplotlib.pyplot as plt

# iPy magic and other cool settings 
%matplotlib inline
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
np.random.seed(42)
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
warnings.warn = warn

from collections import Counter
from scipy.sparse import csr_matrix
from itertools import compress
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [95]:
# Import datasets
# The data folder is named once here so that it only has to be changed in one place.
DATA_DIR = '../data/4'
item_cats = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
sales_train = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))

In [96]:
# Preview the first five rows of the raw daily sales records.
sales_train.head(5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [97]:
# combine items & item categories into one dataset for analysis
# Guarded so that re-running this cell does not merge the category names a second
# time, which would produce `item_category_name_x` / `item_category_name_y`
# columns and break the later lookups of `items['item_category_name']`.
if 'item_category_name' not in items.columns:
    items = pd.merge(items, item_cats, how='left', on=['item_category_id'])

In [98]:
# Preview the items table now that the category name has been joined on.
items.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD
1,!ABBYY FineReader 12 Professional Edition Full...,1,76,Программы - Для дома и офиса (Цифра)
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD
4,***КОРОБКА (СТЕКЛО) D,4,40,Кино - DVD


In [99]:
# (rows, columns) of the merged items table.
items.shape

(22170, 4)

In [100]:
# Vocabulary size shared by the TF-IDF encodings of the text columns.
features = 25
TfdfVect = TfidfVectorizer(max_features=features)
# Sparse TF-IDF matrix of the item names, densified into a DataFrame
# (one row per item, one column per vocabulary term).
name_matrix = TfdfVect.fit_transform(items['item_name'])
item_name = pd.DataFrame(name_matrix.toarray())

In [101]:
# NOTE(review): `merge_data` is not defined in the visible cells (presumably one of
# the star-imported helpers). It appears to attach the TF-IDF columns to `items`
# under the 'item_name' prefix -- confirm against its source.
merge_data(items, item_name, 'item_name')

# Preview of the TF-IDF frame itself.
item_name.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.637898,0.0,0.0,...,0.0,0.403761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483839
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# TF-IDF encode the category names with a fresh vectoriser of the same vocabulary size.
TfdfVect = TfidfVectorizer(max_features=features)
category_matrix = TfdfVect.fit_transform(items['item_category_name'])
item_cat_name = pd.DataFrame(category_matrix.toarray())
# NOTE(review): `merge_data` is an external helper; presumably it adds these columns
# to `items` under the 'item_cat_name' prefix -- confirm.
merge_data(items, item_cat_name, 'item_cat_name')

In [103]:
# TF-IDF encode the shop names. Uses the `features` constant defined with the
# item-name vectoriser: `feature_cnt` is not defined in any visible cell, so the
# original line depended on leftover kernel state or on a star import.
TfdfVect = TfidfVectorizer(max_features=features)
shop_name = pd.DataFrame(TfdfVect.fit_transform(shops['shop_name']).toarray())
# NOTE(review): `merge_data` is an external helper; judging by the `shop_name15` ...
# `shop_name24` columns that later appear in `train`, it adds the TF-IDF columns to
# `shops` in place -- confirm.
merge_data(shops, shop_name, 'shop_name')

In [104]:
# Keep only rows with a positive price and a positive daily count (this removes,
# for example, the -1.0 count rows, which are presumably returns).
# The explicit copy makes `sales_train` an independent frame, so the column
# assignments in later cells do not operate on a slice of the raw data.
has_positive_price = sales_train['item_price'] > 0
has_positive_count = sales_train['item_cnt_day'] > 0
sales_train = sales_train[has_positive_price & has_positive_count].copy()

In [105]:
# Most recent price of every (item, shop) pair; joined onto the test rows later on.
# At this point `date` is still a 'dd.mm.yyyy' string, and sorting such strings
# orders rows by day-of-month first rather than chronologically. Sort on a parsed
# timestamp instead so that `.first()` really picks the latest sale.
# pd.to_datetime also accepts an already-parsed column, so the cell can be re-run.
sale_date = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
item_price_latest = sales_train.assign(_sale_date=sale_date)\
    .sort_values(by='_sale_date', ascending=False)\
    .groupby(['item_id', 'shop_id'], as_index=False)['item_price'].first()

In [106]:
# Check how the dates are stored. Positional access is used because, after the
# row filtering above, index label 0 only exists if the first raw row survived.
type(sales_train['date'].iloc[0])

str

In [107]:
# Parse the 'dd.mm.yyyy' strings in one vectorised call and derive the calendar
# fields with the .dt accessor instead of row-by-row lambdas.
# pd.to_datetime leaves an already-parsed column unchanged, so this cell can be
# re-run (the strptime version failed on the second run).
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
sales_train['year'] = sales_train['date'].dt.year
sales_train['month'] = sales_train['date'].dt.month

In [108]:
# Roll the daily rows up to one row per shop / month block / item.
monthly_keys = ['shop_id', 'date_block_num', 'item_id', 'year', 'month']
monthly_counts = sales_train.groupby(monthly_keys, as_index=False)['item_cnt_day'].sum()
sales_train1 = monthly_counts.rename(columns={'item_cnt_day': 'item_cnt_month'})

In [109]:
# Preview the monthly aggregate.
sales_train1.head()

Unnamed: 0,shop_id,date_block_num,item_id,year,month,item_cnt_month
0,0,0,32,2013,1,6.0
1,0,0,33,2013,1,3.0
2,0,0,35,2013,1,1.0
3,0,0,43,2013,1,1.0
4,0,0,51,2013,1,2.0


In [110]:
# Summary statistics of the monthly aggregate (note the maximum of item_cnt_month).
sales_train1.describe()

Unnamed: 0,shop_id,date_block_num,item_id,year,month,item_cnt_month
count,1608226.0,1608226.0,1608226.0,1608226.0,1608226.0,1608226.0
mean,32.80798,14.66502,10681.48,2013.793,6.154981,2.273154
std,16.53645,9.542384,6238.856,0.7773325,3.455207,8.653236
min,0.0,0.0,0.0,2013.0,1.0,1.0
25%,21.0,6.0,5046.0,2013.0,3.0,1.0
50%,31.0,14.0,10497.0,2014.0,6.0,1.0
75%,47.0,23.0,16060.0,2014.0,9.0,2.0
max,59.0,33.0,22169.0,2015.0,12.0,2253.0


In [111]:
# Column dtypes, non-null counts and memory footprint of the monthly aggregate.
sales_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1608226 entries, 0 to 1608225
Data columns (total 6 columns):
shop_id           1608226 non-null int64
date_block_num    1608226 non-null int64
item_id           1608226 non-null int64
year              1608226 non-null int64
month             1608226 non-null int64
item_cnt_month    1608226 non-null float64
dtypes: float64(1), int64(5)
memory usage: 85.9 MB


In [112]:
# Cap the monthly counts to the range [0, 20].
clipped_counts = sales_train1['item_cnt_month'].clip(lower=0, upper=20)
sales_train1['item_cnt_month'] = clipped_counts

In [113]:
# Order chronologically so that shift(1) inside each (shop, item) group yields the
# pair's previous *recorded* month (not necessarily the preceding calendar month);
# pairs with no earlier record get 0.
sales_train1 = sales_train1.sort_values('date_block_num')
pair_history = sales_train1.groupby(['shop_id', 'item_id'])['item_cnt_month']
sales_train1['item_cnt_prev_month'] = pair_history.shift(1).fillna(0)

In [114]:
# For the test month, the "previous month" count is the most recent monthly count
# recorded in sales_train1 for each (item, shop) pair.
latest_first = sales_train1[['item_id', 'shop_id', 'item_cnt_month', 'date_block_num']]\
    .sort_values('date_block_num', ascending=False)
test_item_cnt_prev_month = latest_first.groupby(['item_id', 'shop_id'], as_index=False).first()
test_item_cnt_prev_month = test_item_cnt_prev_month[['item_id', 'shop_id', 'item_cnt_month']]\
    .rename(columns={'item_cnt_month': 'item_cnt_prev_month'})

In [115]:
# `test_item_cnt_prev_month` is already built by the preceding cell; this cell used
# to repeat that computation verbatim. Instead, check the invariant that the later
# left-merge onto `test` relies on: exactly one row per (item_id, shop_id) pair.
assert not test_item_cnt_prev_month.duplicated(subset=['item_id', 'shop_id']).any()

In [116]:
# Mean monthly count per (item, shop), computed without date_block_num == 33 (the
# month later used as the hold-out set). Note that the mean still includes each
# training row's own month.
item_month_mean = sales_train1[sales_train1['date_block_num']!=33]\
    .groupby(['item_id', 'shop_id'], as_index=False)['item_cnt_month'].mean()\
    .rename(columns={'item_cnt_month': 'item_cnt_month_mean'})
# Drop any column left over from a previous run of this cell before merging.
# Otherwise the merge would create `item_cnt_month_mean_x` / `_y`, which the
# feature filter further down does not exclude.
sales_train1 = sales_train1.drop(columns=['item_cnt_month_mean'], errors='ignore')\
    .merge(item_month_mean, how='left', on=['item_id', 'shop_id'])
sales_train1.head()

Unnamed: 0,shop_id,date_block_num,item_id,year,month,item_cnt_month,item_cnt_prev_month,item_cnt_month_mean
0,0,0,32,2013,1,6.0,0.0,8.0
1,37,0,18636,2013,1,1.0,0.0,1.0
2,37,0,18610,2013,1,1.0,0.0,1.0
3,37,0,18581,2013,1,1.0,0.0,1.0
4,37,0,18580,2013,1,1.0,0.0,1.0


In [117]:
# For the test month every observed month, including date_block_num == 33, can be used.
pair_groups = sales_train1.groupby(['item_id', 'shop_id'], as_index=False)
test_item_month_mean = pair_groups['item_cnt_month'].mean()\
    .rename(columns={'item_cnt_month': 'item_cnt_month_mean'})
test_item_month_mean.head()

Unnamed: 0,item_id,shop_id,item_cnt_month_mean
0,0,54,1.0
1,1,55,1.2
2,2,54,1.0
3,3,54,1.0
4,4,54,1.0


In [118]:
# Average selling price of each item per shop and calendar month.
price_keys = ['item_id', 'shop_id', 'year', 'month']
item_price_avg = sales_train.groupby(price_keys, as_index=False)['item_price'].mean()
item_price_avg.head()

Unnamed: 0,item_id,shop_id,year,month,item_price
0,0,54,2014,9,58.0
1,1,55,2014,4,4490.0
2,1,55,2014,7,4490.0
3,1,55,2014,8,4490.0
4,1,55,2014,9,4490.0


In [119]:
# Enrich the monthly sales step by step: monthly average price, then the item
# attributes, then the shop attributes. Every join is a left join, so each monthly
# sales row is kept.
sales_train2 = sales_train1.merge(item_price_avg, how='left', on=['shop_id', 'item_id', 'year', 'month'])
sales_train3 = sales_train2.merge(items, how='left', on=['item_id'])
sales_train4 = sales_train3.merge(shops, how='left', on=['shop_id'])
train = sales_train4

In [120]:
# Build the test-side frame with the same kind of columns as `train`: latest known
# price, item and shop attributes, last recorded monthly count and the mean monthly
# count. Left joins keep every row of `test`.
pair_keys = ['item_id', 'shop_id']
test2 = test.merge(item_price_latest, how='left', on=['shop_id', 'item_id'])
test3 = test2.merge(items, how='left', on=['item_id'])
test4 = test3.merge(shops, how='left', on=['shop_id'])
prev_month_counts = test_item_cnt_prev_month[['item_id', 'shop_id', 'item_cnt_prev_month']]
test5 = test4.merge(prev_month_counts, how='left', on=pair_keys)
test6 = test5.merge(test_item_month_mean, how='left', on=pair_keys)
df_test = test6

In [121]:
# Store the identifier columns as strings in both the training and the test frame.
id_columns = ['shop_id', 'item_id', 'item_category_id']
for frame in (train, df_test):
    for col in id_columns:
        frame[col] = frame[col].astype(str)

In [122]:
# Show one row from the most recent (year, month) present in the training frame.
train.sort_values(by=['year', 'month'], ascending=False).head(1)

Unnamed: 0,shop_id,date_block_num,item_id,year,month,item_cnt_month,item_cnt_prev_month,item_cnt_month_mean,item_price,item_name,...,shop_name15,shop_name16,shop_name17,shop_name18,shop_name19,shop_name20,shop_name21,shop_name22,shop_name23,shop_name24
1576705,58,33,2252,2015,10,4.0,1.0,2.259259,399.0,"Call Of Duty: Modern Warfare 3 [PC, Jewel]",...,0.0,0.0,0.0,0.0,0.0,0.0,0.322815,0.0,0.689588,0.648274


In [123]:
# Stamp every test row with the forecast period: November 2015, i.e. the month
# block following the last training block (33).
forecast_period = (('year', 2015), ('month', 11), ('date_block_num', 34))
for column, value in forecast_period:
    df_test[column] = value

In [124]:
# Randomly permute the training rows; the fixed random_state keeps the order reproducible.
train = shuffle(train, random_state=42)

In [125]:
# Feature matrix: every column except the raw text fields, the target and the two
# target-derived aggregates. Missing values are replaced by 0.
excluded_columns = ['item_name', 'item_category_name', 'shop_name', 'item_cnt_month',
                    'item_cnt_prev_month', 'item_cnt_month_mean']
feature_columns = [col for col in train.columns.values if col not in excluded_columns]
X = train[feature_columns].fillna(0)

y = train['item_cnt_month'].fillna(0)

# Time-based split: month blocks below 33 are used for fitting, block 33 is held out.
list_training = (X['date_block_num'] < 33).tolist()
list_testing = (X['date_block_num'] == 33).tolist()

X_train2 = X[X['date_block_num'] < 33]
y_train2 = y[list_training].fillna(0)
X_test2 = X[X['date_block_num'] == 33]
y_test2 = y[list_testing].fillna(0)

In [126]:
# Fit an extra-trees regressor on the month blocks below 33.
reg = ExtraTreesRegressor(n_estimators=25, max_depth=15, n_jobs=-1, random_state=42)
reg.fit(X_train2, y_train2)

# Predict both the fitting rows and the held-out block 33.
y_train_pred, y_test_pred = reg.predict(X_train2), reg.predict(X_test2)

# Root-mean-squared error on each part.
rmse_train = np.sqrt(mean_squared_error(y_train2, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test2, y_test_pred))

for label, score in (('rmse_train:', rmse_train), ('rmse_test:', rmse_test)):
    print(label, score)

rmse_train: 1.9816493854980095
rmse_test: 2.2061106360729092


In [127]:
# Random-forest baseline fitted on the same split as the extra-trees model.
# n_jobs=-1 builds the trees on all available cores, matching the extra-trees cell;
# random_state is fixed for reproducibility.
rf_reg2 = RandomForestRegressor(n_estimators=25, max_depth=10, n_jobs=-1, random_state=42)
rf_reg2.fit(X_train2, y_train2)

# Predictions for the fitting rows and for the held-out block 33.
y_train2_pred = rf_reg2.predict(X_train2)
y_test2_pred = rf_reg2.predict(X_test2)

In [128]:
# Root-mean-squared error of the random forest on the fitting rows and on the hold-out.
mse_train2 = mean_squared_error(y_train2, y_train2_pred)
mse_test2 = mean_squared_error(y_test2, y_test2_pred)
rmse_train2 = np.sqrt(mse_train2)
rmse_test2 = np.sqrt(mse_test2)

for label, score in (('rmse_train:', rmse_train2), ('rmse_test:', rmse_test2)):
    print(label, score)

rmse_train: 2.126222232885013
rmse_test: 2.2862270245545933


In [129]:
# Select the test features in the same column order as the training matrix; missing
# values (pairs with no price history, for example) become 0.
model_columns = list(X_train2.columns.values)
df_test2 = df_test[model_columns].fillna(0)

In [130]:
# Predict the test month with the extra-trees model `reg`, which was fitted on the
# rows with date_block_num < 33 only.
test_pred = reg.predict(df_test2)

In [131]:
# Round the predictions to one decimal place.
test_pred = np.around(test_pred, decimals=1)

In [132]:
# Attach the predictions to the original test frame by position.
# NOTE(review): this assumes the left-merges that produced `df_test` kept exactly one
# row per `test` row, in the original order -- TODO confirm.
test['item_cnt_month'] = test_pred

In [133]:
# Write the submission file containing only the ID and the predicted monthly count.
submission = test[['ID', 'item_cnt_month']]
submission.to_csv('final_result.csv', index=False)