In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder


from itertools import product
import gc
import random as python_random


In [2]:
# Seed every random number generator in play (Python, NumPy, TensorFlow)
# so that Keras training produces reproducible results.
python_random.seed(123)
np.random.seed(123)
tf.random.set_seed(1234)


In [3]:
def downcast_dtypes(df):
    '''
        Narrow the 64-bit numeric columns of a dataframe to save memory.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left as they are. The dataframe is
        modified in place and the very same object is returned.
    '''

    # Walk the columns once and sort the 64-bit ones into two buckets
    wide_floats = []
    wide_ints = []
    for name in df:
        kind = df[name].dtype
        if kind == "float64":
            wide_floats.append(name)
        if kind == "int64":
            wide_ints.append(name)

    # Overwrite each bucket with its 32-bit counterpart
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

### Load Data

In [4]:
# Read the competition CSV files.
# DATA_DIR is the single place to edit when the data lives somewhere else.
DATA_DIR = './final_project_data'

train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [5]:
# train.head()

In [6]:
# test.head()

In [7]:
print(f'Shapes of data are:\n train:{train.shape}\n test:{test.shape}\n items:{items.shape}' +
     f'\n item_categories:{item_categories.shape} \n shops:{shops.shape}')

# Check whether the test set is the full shop x item cartesian product.
# The comparison is computed so the message cannot claim equality when it
# does not hold.
n_shops = test.shop_id.nunique()
n_items = test.item_id.nunique()
n_pairs = n_shops * n_items
verdict = 'which is same as' if n_pairs == test.shape[0] else 'which differs from'

print(f'unique shops in test:{n_shops}, unique item in test:{n_items}' +
     f' possible combinations:{n_pairs} ' +
     f'{verdict} the number of rows:{test.shape[0]}')


Shapes of data are:
 train:(2935849, 6)
 test:(214200, 3)
 items:(22170, 3)
 item_categories:(84, 2) 
 shops:(60, 2)
unique shops in test:42, unique item in test:5100 possible combinations:214200which is same as the number of rows:214200


In [8]:
# Column names of the training frame, then of the test frame
for frame in (train, test):
    print(frame.columns)

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day'],
      dtype='object')
Index(['ID', 'shop_id', 'item_id'], dtype='object')


In [9]:
# Work on copies so the frames read from disk stay untouched
sales = train.copy()

# drop_duplicates() returns a new frame; the result must be assigned, otherwise
# the call has no effect. All columns (including 'ID') take part in the
# comparison, so only fully identical rows are removed.
sales_test = test.drop_duplicates().reset_index(drop=True)

# Take the IDs from the de-duplicated frame so they stay aligned with its rows
sales_index = sales_test['ID']
print(f' before duplicate drop:{test.shape}  after duplicate drop:{sales_test.shape}')

 before duplicate drop:(214200, 3)  after duplicate drop:(214200, 3)


In [10]:
# Aggregate month 33 per (shop, item) pair:
# total units sold and mean selling price.
sales_33 = sales.loc[sales['date_block_num'] == 33]
last_month_groups = sales_33.groupby(['shop_id','item_id'])
item_sales_last_month = last_month_groups['item_cnt_day'].sum().reset_index()
item_price_last_month = last_month_groups['item_price'].mean().reset_index()
# sales_last_month.head()

In [11]:
# Shape the test rows like the training rows: label them as month 34 and attach
# the month-33 sales count and mean price of each (shop, item) pair.
# Pairs with no month-33 sales get 0 for both.
pair_keys = ['shop_id', 'item_id']
sales_test['date_block_num'] = 34
for last_month_frame in (item_sales_last_month, item_price_last_month):
    sales_test = sales_test.merge(last_month_frame, on=pair_keys, how='left').fillna(0)

# 'ID' has no counterpart in the training data
sales_test = sales_test.drop(columns='ID')
sales_test.shape

(214200, 5)

In [12]:
# Stack the month-34 test rows underneath the training rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
# The test rows have no 'date' column, so it is NaN for them.
print(sales.shape)
sales = pd.concat([sales, sales_test], ignore_index=True, sort=False)
print(sales.shape)
sales.tail()

(2935849, 6)
(3150049, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
3150044,,34,45,18454,99.0,1.0
3150045,,34,45,16188,0.0,0.0
3150046,,34,45,15757,0.0,0.0
3150047,,34,45,19648,0.0,0.0
3150048,,34,45,969,0.0,0.0


### Get a feature matrix

* itertools.product(*iterables):

It returns the Cartesian product of all the iterables passed as arguments. For example, `product(arr1, arr2, arr3)` yields every tuple that takes one element from each of `arr1`, `arr2` and `arr3`.

In [13]:
# Key columns that identify one row of the feature matrix
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For each month, pair every shop seen that month with every item seen that
# month; the per-month blocks are stacked into one grid afterwards.
monthly_grids = []
for block_num in sales['date_block_num'].unique():
    month_rows = sales.loc[sales['date_block_num'] == block_num]
    cur_shops = month_rows['shop_id'].unique()
    cur_items = month_rows['item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block_num]))
    monthly_grids.append(np.array(combos, dtype='int32'))

# Turn the stacked blocks into a dataframe
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)


In [14]:
# Monthly sales per (shop, item): sum the daily counts and call the result `target`
monthly_pair_sales = sales.groupby(index_cols, as_index=False)['item_cnt_day'].sum()
gb = monthly_pair_sales.rename(columns={'item_cnt_day': 'target'})

# The grid holds every shop/item combination of a month, so it has many more
# rows than `gb`. Combinations without sales come out of the left join as NaN
# and are set to 0.
all_data = grid.merge(gb, how='left', on=index_cols).fillna(0)
# all_data.head()

In [15]:
# Monthly sales per shop (all items together), stored as `target_shop`
shop_month_keys = ['shop_id', 'date_block_num']
monthly_shop_sales = sales.groupby(shop_month_keys, as_index=False)['item_cnt_day'].sum()
gb = monthly_shop_sales.rename(columns={'item_cnt_day': 'target_shop'})

# Every row of a shop/month receives that shop's monthly total
all_data = all_data.merge(gb, how='left', on=shop_month_keys).fillna(0)


In [16]:
# Monthly sales per item (all shops together), stored as `target_item`
item_month_keys = ['item_id', 'date_block_num']
monthly_item_sales = sales.groupby(item_month_keys, as_index=False)['item_cnt_day'].sum()
gb = monthly_item_sales.rename(columns={'item_cnt_day': 'target_item'})

# Every row of an item/month receives that item's monthly total
all_data = all_data.merge(gb, how='left', on=item_month_keys).fillna(0)
#all_data.head()

In [17]:
# # Mean Price for each item
# gb = sales.groupby(index_cols,as_index=False)['item_price'].mean().rename(
#                                                                     columns = {'item_price':'target_price'})

# all_data = pd.merge(all_data, gb, how='left', on=index_cols).fillna(0)


In [18]:
# (rows, columns) of the feature matrix: the three key columns plus the three aggregates
all_data.shape

(11128050, 6)

In [19]:
# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
# The grid and the last aggregate are merged into all_data already; drop them
# and trigger a garbage collection (trailing ';' hides the return value).
del grid, gb 
gc.collect();

### Lag feature

After creating a grid, we can calculate some features. We will use lags of [1, 2, 11, 12] months.

In [20]:
# Columns to build lags from: everything in all_data that is not a key column,
# i.e. the aggregate columns created by the groupby cells above.
cols_to_rename = all_data.columns.difference(index_cols).tolist()
# print(index_cols)
# print(cols_to_rename)

In [21]:
# Show which columns will receive lagged copies
print(cols_to_rename)

['target', 'target_item', 'target_shop']


In [22]:
# Month offsets used for the lag features
shift_range = [1, 2, 11,12]

for month_shift in tqdm(shift_range):
    # New names for the lagged copies, e.g. target -> target_lag_1
    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Relabel every row from month N as month N + month_shift. After joining on
    # the key columns, a row of month K therefore carries the aggregates that
    # were observed in month K - month_shift.
    train_shift = (
        all_data[index_cols + cols_to_rename]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
        .rename(columns=lagged_names)
    )

    # Add this lag to the feature matrix; rows without history get 0
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# all_data.tail(20)

100%|██████████| 4/4 [00:41<00:00, 10.30s/it]


In [23]:
# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features.
# Match the full '_lag_<shift>' suffix instead of only the last character, so
# an unrelated column that merely ends in a digit is never taken for a lag
# feature. The helper has its own name so that the `items` dataframe loaded
# earlier is not overwritten.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

# We will drop these at fitting stage:
# the un-lagged aggregate columns and date_block_num, i.e. everything except
# the lag features and the remaining key columns.
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num']

to_drop_cols

['target_shop', 'target_item', 'target', 'date_block_num']

In [24]:
# to_drop_cols.remove('target_shop')
# to_drop_cols.remove('target_item')

# to_drop_cols

### Add item price column

In [25]:
# # Category for each item
# item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

# all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')

# Mean price for each (shop, item) pair, lagged by one month.
# A price is only observed in months where the pair actually sold, so joining
# the same-month mean price leaks the target into the training and validation
# rows (a non-zero price essentially means a sale happened). The month-34 rows
# never carried that information: their price was filled from month 33.
# Shifting the aggregate by one month gives every row the previous month's
# mean price, which is what the month-34 rows already contain.
gb = sales.groupby(index_cols,as_index=False)['item_price'].mean()
gb['date_block_num'] = gb['date_block_num'] + 1

# Remove a stale copy first so that re-running this cell does not produce
# item_price_x / item_price_y columns.
all_data = all_data.drop(columns=['item_price'], errors='ignore')
all_data = pd.merge(all_data, gb, how='left', on=index_cols).fillna(0)

# gb = sales.groupby(index_cols,as_index=False)['item_price'].max().rename(
#                                                                 columns = {'item_price':'item_price_max'})

# all_data = pd.merge(all_data, gb, how='left', on=index_cols).fillna(0)

# gb = sales.groupby(index_cols,as_index=False)['item_price'].min().rename(
#                                                                 columns = {'item_price':'item_price_min'})

# all_data = pd.merge(all_data, gb, how='left', on=index_cols).fillna(0)

# all_data['revenue'] = all_data['target'] * all_data['item_price']


# Re-apply the 64 -> 32 bit downcast now that another column has been merged in
all_data = downcast_dtypes(all_data)

# The price aggregate is part of all_data now; release it
del gb
gc.collect();

In [26]:
# interactions = all_data['shop_id'].astype('str') + '_' + all_data['item_id'].astype('str')

# label_enc = LabelEncoder()
# all_data = all_data.assign(shop_item=label_enc.fit_transform(interactions))

In [27]:
# all_data.drop('shop_item', axis=1, inplace=True)

At this point we have created the feature matrix. It is stored in the `all_data` variable. Take a look:

In [28]:
# Last five rows of the feature matrix
all_data.tail(5)

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_11,target_item_lag_11,target_shop_lag_11,target_lag_12,target_item_lag_12,target_shop_lag_12,item_price
6639289,45,18454,34,1.0,683.0,2.0,1.0,2.0,702.0,0.0,1.0,654.0,4.0,106.0,1551.0,0.0,0.0,0.0,99.0
6639290,45,16188,34,0.0,683.0,1.0,0.0,1.0,702.0,0.0,3.0,654.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6639291,45,15757,34,0.0,683.0,5.0,0.0,5.0,702.0,0.0,3.0,654.0,0.0,16.0,1551.0,0.0,9.0,1251.0,0.0
6639292,45,19648,34,0.0,683.0,2.0,0.0,2.0,702.0,0.0,3.0,654.0,0.0,11.0,1551.0,0.0,0.0,0.0,0.0
6639293,45,969,34,0.0,683.0,3.0,0.0,3.0,702.0,0.0,5.0,654.0,0.0,7.0,1551.0,0.0,6.0,1251.0,0.0


### train/ validation/test split

Month 34 is the test set. Months 32 and 33 are held out as the validation split, and the remaining months (12 to 31) are used as training data.

In [29]:
# `date_block_num` is not used as a feature, but it decides which split a row belongs to
dates = all_data['date_block_num']

# Row masks: months before 32 train, months 32-33 validate, month 34 is the test set
train_rows = dates < 32
val_rows = (dates == 32) | (dates == 33)
test_rows = dates == 34

# Features: everything except the columns listed in to_drop_cols
X_train = all_data.loc[train_rows].drop(columns=to_drop_cols)
X_val = all_data.loc[val_rows].drop(columns=to_drop_cols)
X_test = all_data.loc[test_rows].drop(columns=to_drop_cols)

# Labels: the un-lagged monthly sales of each shop/item pair
y_train = all_data.loc[train_rows, 'target'].values
y_val = all_data.loc[val_rows, 'target'].values

gc.collect();

In [30]:
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    """
    Root-mean-squared-error loss for a `tf.keras` model.

    Written with TensorFlow ops rather than the standalone `keras` backend:
    the model in this notebook is built from `tensorflow.keras`, and the two
    packages should not be mixed in one graph.

    Args:
        y_true: tensor of ground-truth values.
        y_pred: tensor of model predictions.

    Returns:
        Scalar tensor: the square root of the mean squared difference over
        all elements.
    """
    # Labels may arrive with another dtype than the predictions (e.g. integers)
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))


Using TensorFlow backend.


In [31]:
def build_model(input_dim=None):
    """
    Build and compile the fully connected regression network.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(64, relu) -> Dense(1).
    Compiled with the `root_mean_squared_error` loss, the 'rmsprop' optimizer
    and 'mse' as an additional metric.

    Args:
        input_dim: number of input features. When omitted, the width of the
            notebook-level `X_train` is used, which is what the function
            always did before the parameter existed.

    Returns:
        The compiled `keras.Sequential` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input layer from the training frame
        input_dim = X_train.shape[1]

    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=[input_dim]),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])

    #optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0)
    optimizer = 'rmsprop'

    model.compile(loss=root_mean_squared_error,
                  optimizer=optimizer,
                  metrics=['mse'])
    return model

In [32]:
# Instantiate the network; kept apart from the training cell so training can be re-run on its own
keras_model = build_model()

In [None]:
EPOCHS = 3

# Train in mini-batches of 100 rows and evaluate on the validation months
# after every epoch; the returned History object is plotted below.
history = keras_model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
)

Epoch 1/3

In [None]:
# Training vs. validation loss per epoch.
# The curves are the loss (RMSE), not an accuracy, and the second curve comes
# from the validation split (`val_loss`), not from the test set. The title and
# the legend say so.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Train')
ax.plot(history.history['val_loss'], label='Validation')
ax.set(title='Model loss', xlabel='Epoch', ylabel='loss')
ax.legend(loc='upper left')
# ax.set_ylim(bottom=10, top=12)
# ax.set_xlim(left=1100, right=1200)
ax.grid()
plt.show()

In [None]:
# Predict on the test features; the network ends in Dense(1), so every
# prediction row holds a single value
preds_test = keras_model.predict(X_test)

# numpy.ndarray -> nested Python list
preds_list = preds_test.tolist()

# Unwrap the single value of each row into a flat list
prediction = [row[0] for row in preds_list]

In [None]:
# xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
# xgb_model.fit(X_train, y_train, 
#              early_stopping_rounds=5, 
#              eval_set=[(X_val, y_val)], 
#              verbose=False)

In [None]:
# preds = xgb_model.predict(df_ts_expand)
# prediction = (np.clip(preds, 0, 20)).tolist()

In [None]:
# Clip the predictions to [0, 20]
# NOTE(review): presumably the target range used by the competition metric; confirm.
prediction = np.clip(prediction, 0, 20)

# Pair each prediction with its ID through the (shop_id, item_id) key instead
# of by position. X_test rows follow the order of the month-34 grid, which
# matches the row order of the test file only if the file lists the same item
# sequence for every shop; the join is correct for any row order.
keyed_preds = X_test[['shop_id', 'item_id']].assign(item_cnt_month=prediction)
output = test[['ID', 'shop_id', 'item_id']].merge(
    keyed_preds, on=['shop_id', 'item_id'], how='left', validate='many_to_one'
)[['ID', 'item_cnt_month']]

# Every test row must have received a prediction
assert output['item_cnt_month'].notna().all(), 'some test rows received no prediction'


In [None]:
# output.to_csv('XbgModel.csv', index=False)
# Write the submission file; index=False keeps the dataframe index out of the CSV
output.to_csv('KerasModel.csv', index=False)