In [1]:
# Loading important libraries

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
## Loading the data 

# Training sample; the 'date' column is parsed to datetime while reading.
train_subset = pd.read_csv("train_subset.csv",parse_dates = ['date']) 

# Test rows to score for the submission; 'date' is parsed the same way.
test = pd.read_csv("test.csv",parse_dates=['date'])

# Item-level attributes; joined onto train and test via 'item_id' in later cells.
item_details = pd.read_excel("item_details.xlsx")

In [3]:
## Merging train data with item and extracting time features

# Left join keeps every training row and attaches the item attributes.
# NOTE(review): assumes item_id is unique in item_details; duplicates would
# multiply training rows - confirm.
train_subset = pd.merge(train_subset, item_details, on='item_id', how='left')

# Build the DatetimeIndex once and derive all calendar features from it.
train_dates = pd.DatetimeIndex(train_subset['date'])
train_subset['Month'] = train_dates.month.astype('int8')
train_subset['Day'] = train_dates.day.astype('int8')
# 'Week' holds the day of the week (Monday=0), not the week number.
train_subset['Week'] = train_dates.weekday.astype('int8')
del train_dates

# Drop the columns that are not used as model features.
train_subset = train_subset.drop(columns=['Unnamed: 0', 'date', 'perishable'])


train_subset.head()

Unnamed: 0,locationId,item_id,unit_sales,onpromotion,category_of_item,class,Month,Day,Week
0,location_25,item_105574,12.0,False,grocery_items,class_1045,1,1,0
1,location_25,item_105575,9.0,False,grocery_items,class_1045,1,1,0
2,location_25,item_105857,3.0,False,grocery_items,class_1092,1,1,0
3,location_25,item_108634,3.0,False,grocery_items,class_1075,1,1,0
4,location_25,item_108701,2.0,True,deli_items,class_2644,1,1,0


In [4]:
### Removing non-positive values in train data
# The filter keeps only rows with unit_sales > 0, so zero-sales rows are dropped
# together with the negative ones. Zero actuals would also make the MAPE metric
# used later in this notebook divide by zero.

train_subset = train_subset[(train_subset['unit_sales']>0)]

In [5]:
### Removing outliers

# Quartiles of the target and the inter-quartile range.
Q1 = train_subset['unit_sales'].quantile(0.25)
Q3 = train_subset['unit_sales'].quantile(0.75)
print(Q1, Q3)


IQR = Q3 - Q1
print(IQR)

# Tukey fences at 1.5 * IQR beyond the quartiles.
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR
print(lower_limit, upper_limit)


# Only the upper fence is applied; lower_limit is computed and printed but not
# used in the filter.
is_below_upper_fence = train_subset['unit_sales'] < upper_limit
train_subset = train_subset[is_below_upper_fence]
del is_below_upper_fence

2.0 8.0
6.0
-7.0 17.0


In [6]:
### Converting to category 


def cat_converter(df):
    """Cast every non-float64 column of ``df`` to pandas ``category`` dtype.

    The frame is modified in place and is also returned. The name of each
    converted column is printed; float64 columns are left untouched.
    """
    for column_name in df.columns:
        # Guard clause: float64 columns stay numeric.
        if df[column_name].dtype == 'float64':
            continue
        df[column_name] = df[column_name].astype('category')
        print(column_name)
    return df


# Cast every non-float64 column of the training frame to category dtype;
# the float64 target unit_sales stays numeric (see the printed column list).
train_subset = cat_converter(train_subset)

locationId
item_id
onpromotion
category_of_item
class
Month
Day
Week


In [7]:
# Categorical Columns 
# Names of the feature columns handed to LightGBM as `categorical_feature` when
# the model is fit; these are the columns cat_converter cast to category dtype.

cat_cols = ['locationId','item_id','onpromotion','category_of_item','class','Month','Day','Week']

In [8]:
# Separating target variable

# Target: raw unit sales (log-transformed in the next cell).
y_train = train_subset['unit_sales']

# Features: every remaining column.
X_train = train_subset.drop(columns=['unit_sales'])

In [9]:
# Log transforming the target variable 
# log1p(x) = log(1 + x); predictions are mapped back with np.expm1 before scoring.

y_train_log = np.log1p(y_train)

In [10]:
# Custom function for MAPE (Error Metric)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over all elements.

    Parameters
    ----------
    y_true : array-like
        Actual values. Entries equal to 0 divide by zero, so the result is
        ``inf`` or ``nan`` when any are present.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``
        (or a single value, which is compared against every actual).

    Returns
    -------
    float
        The MAPE as a percentage (10.0 means 10 %).

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    # Flatten both inputs: subtracting a (n, 1) column from a (n,) vector would
    # otherwise broadcast to an (n, n) matrix and silently give a wrong score.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size and 1 not in (y_true.size, y_pred.size):
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got "
            f"{y_true.size} and {y_pred.size}"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [11]:
from lightgbm import LGBMRegressor


## Light GBM

# DART-boosted regressor with an L1 (MAE) objective, fit on the log1p target.
# Logging is silenced through the constructor: the `verbose` keyword of fit()
# was removed in LightGBM 4.0, while a constructor-level `verbose` is accepted
# by both 3.x and 4.x.
LGB_model = LGBMRegressor(boosting_type='dart',
                          num_leaves=31,
                          objective='regression_l1',
                          max_depth=8,
                          min_data_in_leaf=50,
                          learning_rate=0.01,
                          metric='l1',
                          verbose=-1)

# cat_cols names the category-dtype columns LightGBM should treat as categorical.
LGB_model.fit(X_train, y_train_log, categorical_feature=cat_cols)



LGBMRegressor(boosting_type='dart', learning_rate=0.01, max_depth=8,
              metric='l1', min_data_in_leaf=50, objective='regression_l1')

In [13]:
# Predicting on the train set

# The model was fit on log1p(unit_sales); expm1 maps its predictions back to
# the original sales scale before scoring.
LGB_Model_Y_train_pred = np.expm1(LGB_model.predict(X_train))

# In-sample MAPE (percent) against the untransformed target.
Train_score_LGB = mean_absolute_percentage_error(y_train, LGB_Model_Y_train_pred)

print(Train_score_LGB)

56.368464434015884


## Predicting on the test data. 

In [14]:
# Column dtypes and memory footprint of the raw test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23517680 entries, 0 to 23517679
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   locationId   object        
 3   item_id      object        
 4   onpromotion  bool          
dtypes: bool(1), datetime64[ns](1), int64(1), object(2)
memory usage: 740.1+ MB


In [15]:
# Preview the first rows of the raw test data.
test.head()

Unnamed: 0,id,date,locationId,item_id,onpromotion
0,0,2019-01-01,location_25,item_99197,False
1,1,2019-01-01,location_25,item_103665,False
2,2,2019-01-01,location_25,item_105574,False
3,3,2019-01-01,location_25,item_105857,False
4,4,2019-01-01,location_25,item_106716,False


In [16]:
# (rows, columns) of the test set.
test.shape

(23517680, 5)

In [17]:
# Keep the submission ids aside: the 'id' column is dropped from `test` during
# feature preparation and is needed again when the submission frame is built.
ID = test['id']

In [18]:
# Same feature preparation as for the training data.
# NOTE(review): the ids saved in `ID` are matched to predictions by row position,
# which assumes this left join keeps exactly one row per test row - confirm
# item_id is unique in item_details.
test = pd.merge(test, item_details, on='item_id', how='left')

# Build the DatetimeIndex once and derive all calendar features from it.
test_dates = pd.DatetimeIndex(test['date'])
test['Month'] = test_dates.month.astype('int8')
test['Day'] = test_dates.day.astype('int8')
# 'Week' holds the day of the week (Monday=0), not the week number.
test['Week'] = test_dates.weekday.astype('int8')
del test_dates

# Drop the columns that are not model features.
test = test.drop(columns=['id', 'date', 'perishable'])


test.head()

Unnamed: 0,locationId,item_id,onpromotion,category_of_item,class,Month,Day,Week
0,location_25,item_99197,False,grocery_items,class_1067,1,1,1
1,location_25,item_103665,False,baked_items / bread_based,class_2712,1,1,1
2,location_25,item_105574,False,grocery_items,class_1045,1,1,1
3,location_25,item_105857,False,grocery_items,class_1092,1,1,1
4,location_25,item_106716,False,grocery_items,class_1032,1,1,1


In [19]:
### Converting to category 


# Reuse `cat_converter` from the earlier "Converting to category" cell rather
# than redefining an identical copy here: a second definition silently shadows
# the first and lets the train and test conversions drift apart.
# It casts every non-float64 column of `test` to category dtype and prints the
# name of each converted column.
test = cat_converter(test)

locationId
item_id
onpromotion
category_of_item
class
Month
Day
Week


In [20]:
# Predictions come out on the log1p scale the model was trained on.
Test_pred = LGB_model.predict(test)

In [21]:
# Undo the log1p transform so predictions are in unit_sales units.
Test_Pred_best = np.expm1(Test_pred)

In [22]:
# Display the back-transformed predictions.
Test_Pred_best

array([1.19800902, 1.21733827, 1.42688314, ..., 1.48623582, 1.33112006,
       1.319369  ])

In [23]:
# Assemble the submission frame: saved ids next to the predicted unit sales.
ID = pd.DataFrame(ID)

# Wrap the predictions in a frame and name its first column 'unit_sales'.
res = pd.DataFrame(Test_Pred_best)
first_column = res.columns[0]
res = res.rename(columns={first_column: 'unit_sales'})

# Column-wise concat aligns the two frames on their index.
gb = pd.concat([ID, res], axis='columns')

# Round predictions to two decimals for the submission file.
gb = gb.assign(unit_sales=gb['unit_sales'].round(2))

In [24]:
# Release the large frames; only `gb` is needed to write the submission.
del test, train_subset

In [25]:
# Write the id / unit_sales submission file, without the DataFrame index.
gb.to_csv("Final_Submission.csv",index= False)