# Beware: Works only on Kaggle

Hangs on laptop. Needs more RAM.

In [166]:
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder


from itertools import product
import gc
import random as python_random
import time


In [167]:
# Keras seeding to produce reproduciable results
np.random.seed(123)
python_random.seed(123)
tf.random.set_seed(1234)


In [168]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype in ["int32", "int64"]]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float16)
    df[int_cols]   = df[int_cols].astype(np.int16)
    
    return df

def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in tqdm(lags):
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left').fillna(0)
    return df


# Months choosen for lag
def lag_features(df, lags, ref_col, index_col, col_to_lag):
    '''
    ref_col: Reference column relative to which lag is measured
    lags: List of lag period
    index_col: Columns which will be static
    col_to lag: Columns on which lag has to be generated
    
    The function first makes a dataframe with index column and the columns which are
    to be lagged. Then it shifts the reference column forward to the lag period.
    Other column in the index remains the same. Then only new columns names are generated
    which represents the lag period but the values are same as original.
    
    Thus the reference column is shifted forward, the column names for lag column are suffixed
    with lag period and all other elements remain same. If original value of refernce column is 
    m, lag required is n. Then new reference value is m+n. The rename column is suffixed '_n'.
    The lag column value now shows value of n period back from the refernce perion.
    
    '''
    for month_shift in tqdm(lags):
        # Create a temp df
        lag_shift = df[index_cols + col_to_lag].copy()

        # Shift Month column value
        lag_shift[ref_col] = lag_shift[ref_col] + month_shift

        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in col_to_lag else x
        lag_shift = lag_shift.rename(columns=foo)

        # Iteratively add all the lagging months
        df = pd.merge(df, lag_shift, on=index_cols, how='left').fillna(0)

    
    del lag_shift
    return df


def price_trend(df,col1,col2,col3):
    
    gb = (df.groupby([col1,col2])[col3].sum()
          .reset_index()
          .rename(columns = {col3:'avg_price'}))
    
    gb['shifted_avg_price'] = gb.groupby([col2])['avg_price'].shift(1)
    
    gb.drop('avg_price', axis=1, inplace=True)
    
    df = pd.merge(df,gb,on=[col1,col2])
    df['price_trend'] = df[col3] - df['shifted_avg_price']
    df.drop(['shifted_avg_price'], axis=1, inplace=True)
    
    del gb
    
    return df.fillna(0)


# Measures the delta changes in a column with respect to other columns.
def delta_change(df1, df2, gb_cols, pivot_col, agg_col, agg_func='sum', drop_col=True):
    '''
    gb_cols are the index columns w.r.t which the changes will be calculated
    pivot_col is the column subset of the index column relative to which the
    change is computed
    agg_col is the column which is monitored for change
    agg_func is the function (sum, mean...) which will be used for aggregation
    drop col is the boolean to know whether the aggregator column and the averaging
    column to be retained or not.
    
    The function first aggregates a column on the groupby statement of index columns of the
    training dataframe. Then the aggregrated column average is computed for the pivot column. 
    This mean is then subtracted from the aggregated column to get the delta column for the 
    new dataframe being built.
    '''
    
    aggregated_col = '_'.join(gb_cols) + '_agg'
    averaged_col = '_'.join(gb_cols) + '_avg_agg'
    delta_col = 'delta_' + '_'.join(gb_cols) + '_' + agg_col
    
    gb = (df1.groupby(gb_cols)
                 .agg({agg_col: agg_func})
                 .rename(columns = {agg_col:aggregated_col})
                 .reset_index())

    df2 = pd.merge(df2, gb, how='left', on=gb_cols)

    gb = (gb.groupby(pivot_col)
                 .agg({aggregated_col: 'mean'})
                 .rename(columns = {aggregated_col:averaged_col})
                 .reset_index())

    df2 = pd.merge(df2, gb, how='left', on=[pivot_col])
    df2[delta_col] = ((df2[aggregated_col] - 
                                  df2[averaged_col]) / df2[averaged_col])
    
    if drop_col:
        df2.drop([aggregated_col,averaged_col], axis=1,inplace=True)
    
    del gb
    
    return df2


def me_gb(df1, df2, gb_cols, agg_col, agg_func='sum', rename_specific = False, new_name = ''):
    '''
    gb_cols are the index columns w.r.t which the changes will be calculated
    
    agg_col is the column which is monitored for change
    agg_func is the function (sum, mean...) which will be used for aggregation
    rename_specific is the boolean to know whether the aggregator column  has to be
    renamed
    new_name is the new name that is to be given if changed
    
    The function first aggregates a column on the groupby statement of index columns of the
    training dataframe. Then it is merged to the new dataframe that is being built.
    
    '''
    # Get first 4 characters
    list_derived = []
    for i in gb_cols:
        list_derived.append(i[:4])
    
    if rename_specific:
        aggregated_col = new_name
    else:
        aggregated_col = '_'.join(list_derived) + '_' + agg_col[-3:]
        
    
    
    gb = (df1.groupby(gb_cols)
                 .agg({agg_col: agg_func})
                 .rename(columns = {agg_col:aggregated_col})
                 .reset_index())

    df2 = pd.merge(df2, gb, how='left', on=gb_cols)

    del gb
    
    return (aggregated_col, df2.fillna(0))

gc

<module 'gc' (built-in)>

### Load Data

In [169]:
# Read the data into variable

## For Kaggle

train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
category = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

# For laptop

# train = pd.read_csv('./final_project_data/sales_train.csv')
# items = pd.read_csv('./final_project_data/items.csv')
# category = pd.read_csv('./final_project_data/item_categories.csv')
# shops = pd.read_csv('./final_project_data/shops.csv')
# test = pd.read_csv('./final_project_data/test.csv')

In [170]:
# Copy the original dataset to temp 
sales = train.copy()
sales_test = test.copy()

sales_index = sales_test['ID']
sales_test.drop_duplicates()
print(f' before duplicate drop:{test.shape}  after duplicate drop:{sales_test.shape}')

 before duplicate drop:(214200, 3)  after duplicate drop:(214200, 3)


In [171]:
sales['split'] = sales['date'].str.split('.')
sales['year'] = sales['split'].map(lambda x: int(x[2][-2:]))
sales['year'] = sales['year'].astype(np.int8)

In [172]:
sales.drop(['date','split'], axis=1, inplace=True)

In [173]:
sales_33 = sales.loc[sales['date_block_num'] == 33]
item_sales_last_month = sales_33.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()
item_price_last_month = sales_33.groupby(['shop_id','item_id'])['item_price'].mean().reset_index()
# sales_last_month.head()

In [174]:
sales_test['year'] = 2015
sales_test['date_block_num'] = 34
sales_test = sales_test.merge(item_sales_last_month, on=['shop_id', 'item_id'], how='left').fillna(0)
sales_test = sales_test.merge(item_price_last_month, on=['shop_id', 'item_id'], how='left').fillna(0)

sales_test.drop('ID', axis=1, inplace=True)
# sales_test.head()
sales_test.shape

(214200, 6)

In [175]:
print(sales.shape)
sales = sales.append(sales_test, ignore_index=True, sort=False)
print(sales.shape)


(2935849, 6)
(3150049, 6)


In [176]:
sales['month'] = sales['date_block_num'] % 12
sales.tail()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month
3150044,34,45,18454,99.0,1.0,2015,10
3150045,34,45,16188,0.0,0.0,2015,10
3150046,34,45,15757,0.0,0.0,2015,10
3150047,34,45,19648,0.0,0.0,2015,10
3150048,34,45,969,0.0,0.0,2015,10


In [177]:
month_year = sales[['date_block_num','month','year']].drop_duplicates()
month_year['date_block_num'].astype(np.int8)
month_year['month'].astype(np.int8)
month_year['year'].astype(np.int16)
month_year.sample(5)

Unnamed: 0,date_block_num,month,year
631921,6,6,13
2719169,30,6,15
531518,5,5,13
1267562,12,0,14
2323423,24,0,15


In [178]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
#shops
shop_city = shops[['shop_id','city_code']].drop_duplicates()


In [179]:
category['split'] = category['item_category_name'].str.split('-')
category['type'] = category['split'].map(lambda x: x[0].strip())
category['type_code'] = LabelEncoder().fit_transform(category['type'])

category['subtype'] = category['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
category['subtype_code'] = LabelEncoder().fit_transform(category['subtype'])
category = category[['item_category_id','type_code', 'subtype_code']]

item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()


In [180]:
sales = pd.merge(sales, item_category_mapping, how='left', on='item_id')

sales = pd.merge(sales, category, on=['item_category_id'], how='left')

sales = pd.merge(sales, shop_city, on=['shop_id'], how='left')


In [181]:
sales['revenue'] = sales['item_cnt_day'] * sales['item_price']

sales['revenue'] = sales['revenue'] / sales['revenue'].max()


### Get a feature matrix

* itertools.product(*iterables):

It returns the cartesian product of all the itrable provided as the argument. For example, product(arr1, arr2, arr3).

In [182]:
# Create "grid" with columns
index_cols = ['date_block_num', 'shop_id', 'item_id']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[[block_num], cur_shops, cur_items ])),dtype='int16'))

    # Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int16)


In [183]:
# Join it to the grid
# grid is formed by all combination of unique shop,ite,month hence is many more rows,

all_data = pd.merge(grid, item_category_mapping, how='left', on='item_id')

all_data = pd.merge(all_data, category, on=['item_category_id'], how='left')

all_data = pd.merge(all_data, shop_city, on=['shop_id'], how='left')

all_data = pd.merge(all_data, month_year, on=['date_block_num'], how='left')


del item_category_mapping, category, shop_city, month_year

gc.collect()

35

### Mean Encoding

In [184]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'shop_id', 'item_id'], 
                            'item_cnt_day', agg_func='sum', rename_specific = True, new_name = 'target'))

list_1_lag = [ret_col]

In [185]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'shop_id'], 
                            'item_cnt_day', agg_func='sum', rename_specific = True, new_name = 'target_shop'))

list_2_lag = [ret_col]

In [186]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'item_id'], 
                            'item_cnt_day', agg_func='sum', rename_specific = True, new_name = 'target_item'))

list_2_lag.append(ret_col)

In [187]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'shop_id', 'item_id'], 
                            'item_price', agg_func='mean', rename_specific = True, new_name = 'item_price'))

list_2_lag.append(ret_col)

In [188]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'item_category_id'], 
                            'item_cnt_day', agg_func='mean', rename_specific = True, new_name = 'date_item_cat_cnt'))

list_2_lag.append(ret_col)

In [189]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'shop_id', 'item_category_id'], 'item_cnt_day', 
                            agg_func='mean', rename_specific = True, new_name = 'date_shop_item_cat_cnt'))

list_2_lag.append(ret_col)

In [190]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'type_code'], 'item_cnt_day', 
                           agg_func='mean', rename_specific = True, new_name = 'date_type_item_cnt'))

list_2_lag.append(ret_col)

In [191]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'subtype_code'], 'item_cnt_day', 
                           agg_func='mean', rename_specific = True, new_name = 'date_subtype_item_cnt'))

list_2_lag.append(ret_col)

In [192]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'city_code'], 'item_cnt_day', 
                           agg_func='mean', rename_specific = True, new_name = 'date_city_item_cnt'))

# list_2_lag.append(ret_col)

In [193]:
all_data['revenue'] = all_data['target'] * all_data['item_price']

all_data['revenue'] = all_data['revenue'] / all_data['revenue'].max()


In [194]:
# gb = (sales.groupby(['date_block_num','revenue'])
#                  .agg({'item_cnt_day': 'mean'})
#                  .rename(columns = {'item_cnt_day':'date_revenue_item_cnt'})
#                  .reset_index())
# all_data = pd.merge(all_data, gb, how='left', on=['date_block_num','revenue'])


In [195]:
ret_col, all_data = (me_gb(sales, all_data, ['date_block_num', 'revenue'], 'item_cnt_day', 
                           agg_func='mean', rename_specific = True, new_name = 'date_revenue_item_cnt'))

# list_2_lag.append(ret_col)

In [196]:
all_data = delta_change(sales, all_data, ['date_block_num', 'shop_id'], 'shop_id', 'item_cnt_day')
all_data = delta_change(sales, all_data, ['date_block_num', 'shop_id'], 'shop_id', 'revenue', drop_col=False)
all_data = delta_change(sales, all_data, ['month', 'shop_id'], 'shop_id', 'item_cnt_day', 'mean', False)

In [197]:
# all_data = price_trend(all_data,'date_block_num','shop_id','target')

In [198]:
# all_data = price_trend(all_data,'month','shop_id','revenue')

In [199]:
# Downcast dtypes from 64 to 32 bit to save memory
del grid
del sales, sales_test
gc.collect();

all_data = downcast_dtypes(all_data)


NameError: name 'gb' is not defined

### Lag feature

After creating a grid, we can calculate some features. We will use lags from [1, 2, 3, 4, 5, 12] months ago.

In [None]:
all_data = lag_features(all_data,[1,2, 3, 4], 'date_block_num',index_cols,['target'])

In [None]:
all_data = (lag_features(all_data,[1,2], 'date_block_num',index_cols,
             ['target_shop','target_item','date_item_cat_cnt','item_price',
                  'date_shop_item_cat_cnt', 'date_type_item_cnt', 
                  'date_subtype_item_cnt']))


#### try out

In [None]:
all_data = (lag_features(all_data,[1], 'date_block_num',index_cols, 
                        ['date_revenue_item_cnt']))


In [None]:
all_data.drop(['target_shop','target_item','date_item_cat_cnt','date_shop_item_cat_cnt', 'date_type_item_cnt', 
                'date_subtype_item_cnt','date_revenue_item_cnt',
               'revenue','item_price','month', 'year'], axis=1,inplace=True)

all_data.shape

In [None]:
# all_data = (lag_features(all_data,[1], 'date_block_num',index_cols, ['date_city_item_cnt']))

# all_data.drop('revenue', axis=1,inplace=True)


In [None]:
# # Category for each item


all_data = all_data.fillna(0)
all_data = downcast_dtypes(all_data)
all_data['shop_id'] = all_data['shop_id'].astype(np.int8)
all_data['item_id'] = all_data['item_id'].astype(np.int8)
all_data['city_code'] = all_data['city_code'].astype(np.int8)
all_data['date_block_num'] = all_data['date_block_num'].astype(np.int8)

gc.collect();

In [None]:
# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12] 


to_drop_cols = ['target','item_category_id','date_block_num']

In [None]:
all_data.shape

In [None]:
# all_data.drop('shop_item', axis=1, inplace=True)

To this end, we've created a feature matrix. It is stored in ```all_data variable```. Take a look:

### train/ validation/test split

34th month data is the test set. 32nd and 33rd data will be taken as validation split and rest as training data.

In [None]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = all_data['date_block_num']


X_train = all_data.loc[(dates <  32)].drop(to_drop_cols, axis=1)
X_val = all_data.loc[(dates ==  33) | (dates ==  32)].drop(to_drop_cols, axis=1)
X_test =  all_data.loc[dates == 34].drop(to_drop_cols, axis=1)

y_train = all_data.loc[(dates <  32), 'target'].values
y_val =  all_data.loc[((dates ==  33) | (dates ==  32)), 'target'].values

gc.collect();

In [None]:
X_test.shape

In [None]:
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 


In [None]:
def build_model():
    model = keras.Sequential([
        layers.Dense(128, activation='relu',input_shape=[X_train.shape[1]]),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
       layers.Dense(1)
      ])

    #optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0)
    optimizer='rmsprop'
    
    model.compile(loss=root_mean_squared_error,
                optimizer=optimizer,
                metrics=['mse',])
    return model

In [None]:
# keras_model = build_model()

# EPOCHS = 10

# history = keras_model.fit(
#   X_train, y_train,
#   epochs=EPOCHS,
#     batch_size=100,
#     validation_data=(X_val, y_val),
#     #callbacks=[callbacks],
#   )

In [None]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model accuracy')
# plt.ylabel('loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# # plt.ylim(bottom=10, top=12)
# # plt.xlim(left=1100, right = 1200)
# plt.grid()
# plt.show()

In [None]:
# # Preprocessing of test data, fit model
# preds_test = keras_model.predict(X_test)

# # The prediction is of type numpy.ndarray
# preds_list = preds_test.tolist()

# # Extract the prediction and put it in a list
# prediction = []
# for item in preds_list:
#     prediction.append(item[0])

# prediction = np.clip(prediction, 0, 20)

In [None]:
xgb_model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)


In [None]:
start = time.time()
xgb_model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    verbose=True, 
    early_stopping_rounds = 3)

time.time() -start

In [None]:
preds = xgb_model.predict(X_test).clip(0, 20)

prediction = (np.clip(preds, 0, 20)).tolist()

In [None]:
# Save test predictions to file
output = pd.DataFrame({'ID': sales_index,
                       'item_cnt_month': prediction})


In [None]:
# output.to_csv('XbgModel.csv', index=False)
# output.to_csv('KerasModel.csv', index=False)
output.to_csv('XGBModel.csv', index=False)

In [None]:
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax, importance_type='weight')

plot_features(xgb_model, (10,14))

In [None]:
df_gain = pd.DataFrame(xgb_model.get_booster().get_score(fmap='', importance_type='gain'), index=['gain'])
df_weight = pd.DataFrame(xgb_model.get_booster().get_score(fmap='', importance_type='weight'), index=['weight'])
df_cover = pd.DataFrame(xgb_model.get_booster().get_score(fmap='', importance_type='cover'), index=['cover'])
# df_total_gain = pd.DataFrame(xgb_model.get_booster().get_score(fmap='', importance_type='total_gain'), index=['total_gain'])
# df_total_cover = pd.DataFrame(xgb_model.get_booster().get_score(fmap='', importance_type='total_cover'), index=['total_cover'])

df_importance = df_gain.append(df_weight, ignore_index=False, sort=False)
df_importance = df_importance.append(df_cover, ignore_index=False, sort=False)
# df_importance = df_importance.append(df_total_gain, ignore_index=False, sort=False)
# df_importance = df_importance.append(df_total_cover, ignore_index=True, sort=False)

df_importance.iloc[0] = df_importance.iloc[0]/df_importance.iloc[0].max()
df_importance.iloc[1] = df_importance.iloc[1]/df_importance.iloc[1].max()
df_importance.iloc[2] = df_importance.iloc[2]/df_importance.iloc[2].max()
df_T=df_importance.T
df_T.to_csv('importance.csv')


In [None]:
df_T.plot.bar(figsize=(15,20))

In [None]:
df_T.tail(5)

In [None]:
# file1 = 'KerasModel.csv'
# k1 =pd.read_csv(file1)
# file2 = 'XGBModel.csv'
# k2 =pd.read_csv(file2)
# k2['item_cnt_month'] = (k2['item_cnt_month'] + k1['item_cnt_month'])/2
# k2.to_csv('KerasModel_mean.csv', index=False)