# Loading important libraries

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")


import tensorflow as tf
import tensorflow.keras.backend as K

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, concatenate, Flatten
from tensorflow.keras.optimizers import Adam


## Loading the data

In [2]:
# Loading the train data 

train_subset = pd.read_csv("train_subset.csv",parse_dates = ['date'])

#Loading the item data 

item_details = pd.read_excel("item_details.xlsx")

In [3]:
## Merging items with train data and extracting out the time features. 

train_subset = pd.merge(train_subset,item_details,how = 'left', on = 'item_id')

train_subset['Month'] =pd.DatetimeIndex(train_subset['date']).month.astype('int8')
train_subset['Day'] =pd.DatetimeIndex(train_subset['date']).day.astype('int8')
train_subset['Week'] =pd.DatetimeIndex(train_subset['date']).weekday.astype('int8')

train_subset = train_subset.drop(['Unnamed: 0','date','perishable'],axis = 1)


train_subset.head()

Unnamed: 0,locationId,item_id,unit_sales,onpromotion,category_of_item,class,Month,Day,Week
0,location_25,item_105574,12.0,False,grocery_items,class_1045,1,1,0
1,location_25,item_105575,9.0,False,grocery_items,class_1045,1,1,0
2,location_25,item_105857,3.0,False,grocery_items,class_1092,1,1,0
3,location_25,item_108634,3.0,False,grocery_items,class_1075,1,1,0
4,location_25,item_108701,2.0,True,deli_items,class_2644,1,1,0


In [4]:
### Removing negative values in train data 

train_subset = train_subset[(train_subset['unit_sales']>0)]

In [5]:
### Removing outliers 

Q1 = train_subset.unit_sales.quantile(0.25)
Q3 = train_subset.unit_sales.quantile(0.75)
print(Q1,Q3)


IQR = Q3 - Q1
print(IQR)

lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
print( lower_limit,upper_limit)


train_subset = train_subset[(train_subset.unit_sales < upper_limit)]

2.0 8.0
6.0
-7.0 17.0


In [6]:
## Label Encoding 


from sklearn import preprocessing
def df_lbl_enc(df):
    for c in df.columns:
        if df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            df[c] = lbl.fit_transform(df[c])
            print(c)
    return df


train_subset = df_lbl_enc(train_subset)

locationId
item_id
category_of_item
class


In [7]:
 from sklearn.preprocessing import LabelEncoder
 
 lb = LabelEncoder()

train_subset['onpromotion']= lb.fit_transform(train_subset['onpromotion'])

In [8]:
X_train = train_subset.drop(['unit_sales'], axis = 1)

Y_train = train_subset['unit_sales']

In [9]:
Y_train = np.log1p(Y_train)

In [10]:
X_train

Unnamed: 0,locationId,item_id,onpromotion,category_of_item,class,Month,Day,Week
0,17,120,0,8,29,1,1,0
1,17,121,0,8,29,1,1,0
2,17,133,0,8,58,1,1,0
3,17,194,0,8,46,1,1,0
4,17,197,1,5,165,1,1,0
...,...,...,...,...,...,...,...,...
21679746,48,2399,0,8,21,8,15,2
21679747,48,2400,0,8,21,8,15,2
21679748,48,2402,0,8,21,8,15,2
21679749,48,2403,0,8,37,8,15,2


# Building autoencoders 

In [11]:
# The size of encoded and actual representations
encoding_dim = 4  # Tried with 5,6
actual_dim = X_train.shape[1]

# Input placeholder
input_attrs = Input(shape=(actual_dim,))

# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_attrs)

# "decoded" is the lossy reconstruction of the input
decoded = Dense(actual_dim, activation='relu')(encoded)


# this model maps an input to its reconstruction
autoencoder = Model(input_attrs, decoded)

autoencoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense (Dense)                (None, 4)                 36        
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 40        
Total params: 76
Trainable params: 76
Non-trainable params: 0
_________________________________________________________________


In [12]:
autoencoder.compile(optimizer='Adam', loss='mean_absolute_percentage_error')

In [13]:
autoencoder.fit(X_train.values, X_train.values,batch_size = 1024, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x201b97113a0>

## Extracting deep features 

In [14]:
encoder = Model(input_attrs, encoded)

encoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense (Dense)                (None, 4)                 36        
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________


In [15]:
X_train_nonLinear_features = encoder.predict(X_train)

In [16]:
print(X_train_nonLinear_features.shape)

(19356963, 4)


In [17]:
del encoder 
del autoencoder 
del train_subset

In [18]:
X_train.head()

Unnamed: 0,locationId,item_id,onpromotion,category_of_item,class,Month,Day,Week
0,17,120,0,8,29,1,1,0
1,17,121,0,8,29,1,1,0
2,17,133,0,8,58,1,1,0
3,17,194,0,8,46,1,1,0
4,17,197,1,5,165,1,1,0


In [19]:
X_train.shape

(19356963, 8)

In [20]:
X_train_nonLinear_features = pd.DataFrame(X_train_nonLinear_features)

In [21]:
X_train_nonLinear_features.shape

(19356963, 4)

In [22]:
# Train Data With non linear features 
new_train_data = pd.concat([X_train.reset_index(drop=True),X_train_nonLinear_features], axis = 1)


In [23]:
new_train_data.head()

Unnamed: 0,locationId,item_id,onpromotion,category_of_item,class,Month,Day,Week,0,1,2,3
0,17,120,0,8,29,1,1,0,0.0,29.898108,91.181458,0.0
1,17,121,0,8,29,1,1,0,0.0,30.161291,92.001587,0.0
2,17,133,0,8,58,1,1,0,0.0,32.627441,99.693611,0.0
3,17,194,0,8,46,1,1,0,0.0,48.968056,150.610977,0.0
4,17,197,1,5,165,1,1,0,0.0,48.350201,148.679367,0.0


In [24]:
new_train_data.shape

(19356963, 12)

In [25]:
del X_train_nonLinear_features

In [26]:
## Splitting the data 

from sklearn.model_selection import train_test_split

X = new_train_data

y = Y_train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [27]:
cat_cols = ['locationId','item_id','onpromotion','category_of_item','class','Month','Day','Week']

In [28]:
from lightgbm import LGBMRegressor 


## Light GBM 

LGB_model = LGBMRegressor(boosting_type= 'dart', 
                          num_leaves = 31,
                          objective = 'regression_l1',
                          max_depth = 8,
                          min_data_in_leaf = 50,
                          learning_rate = 0.01,
                          metric = 'l1')

## Fitting the model 

LGB_model.fit(X_train, y_train,eval_set = (X_test,y_test),early_stopping_rounds = 50,categorical_feature = cat_cols,verbose = 0)



LGBMRegressor(boosting_type='dart', learning_rate=0.01, max_depth=8,
              metric='l1', min_data_in_leaf=50, objective='regression_l1')

In [29]:
# Custom fuction for MAPE( Error Metric )

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [30]:
# Predicting on the validation data

LGB_Model_Y_train_pred = LGB_model.predict(X_train)
LGB_Model_Y_test_pred = LGB_model.predict(X_test)


LGB_Model_Y_train_pred =np.expm1(LGB_Model_Y_train_pred)
LGB_Model_Y_test_pred =np.expm1(LGB_Model_Y_test_pred)

y_train = np.expm1(y_train)
y_test = np.expm1(y_test)

Train_score_LGB= mean_absolute_percentage_error(y_train,LGB_Model_Y_train_pred)
Test_score_LGB = mean_absolute_percentage_error(y_test,LGB_Model_Y_test_pred)

print(Train_score_LGB)
print(Test_score_LGB)

56.390422320564646
56.461494761527156
