In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns


In [2]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
submission=pd.read_csv("submission.csv")


In [3]:
train['total_price'].fillna(train['base_price'],inplace=True)

In [4]:
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
train['store_id']= label_encoder.fit_transform(train['store_id']) 
test['store_id']= label_encoder.fit_transform(test['store_id']) 
train['sku_id']= label_encoder.fit_transform(train['sku_id']) 
test['sku_id']= label_encoder.fit_transform(test['sku_id'])

In [5]:
train.loc[train['base_price']-train['total_price'] == 0, 'is_same_price'] = 'Same'
train.loc[train['base_price']-train['total_price'] > 0, 'is_same_price'] = 'Lesser'
train.loc[train['base_price']-train['total_price'] < 0, 'is_same_price'] = 'Greater'
price_col=pd.get_dummies(train['is_same_price'],drop_first=True)
train['diff']=train['total_price']-train['base_price']
train=pd.concat([train,price_col],axis=1)
train.drop(['is_same_price'],axis=1,inplace=True)

In [6]:
test.loc[test['base_price']-test['total_price'] == 0, 'is_same_price'] = 'Same'
test.loc[test['base_price']-test['total_price'] > 0, 'is_same_price'] = 'Lesser'
test.loc[test['base_price']-test['total_price'] < 0, 'is_same_price'] = 'Greater'
price_col=pd.get_dummies(test['is_same_price'],drop_first=True)
test['diff']=test['total_price']-test['base_price']
test=pd.concat([test,price_col],axis=1)
test.drop(['is_same_price'],axis=1,inplace=True)

In [7]:
train['week'] = pd.to_datetime(train['week'])
test['week'] = pd.to_datetime(test['week'])

In [8]:
train['train_or_test'] = 'train'
test['train_or_test'] = 'test'

In [9]:
total_data = train.append(test).reset_index(drop=True)[train.columns] 

In [10]:
total_data.drop(['diff'],axis=1,inplace=True)

In [11]:
total_data['discount_on_base'] = (total_data['base_price'] - total_data['total_price']) / total_data['base_price']

In [12]:
total_data["discount_ratio"] = total_data["base_price"] / total_data["total_price"]

In [13]:
total_data = total_data.sort_values(['store_id', 'sku_id', 'week']).reset_index(drop=True)

In [14]:
total_data

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,Lesser,Same,train_or_test,discount_on_base,discount_ratio
0,45883,2011-01-08,0,0,133.9500,133.9500,0,0,119.0,0,1,train,0.000000,1.000000
1,99,2011-01-17,0,0,134.6625,134.6625,0,0,114.0,0,1,train,0.000000,1.000000
2,1739,2011-01-24,0,0,133.9500,133.9500,0,0,87.0,0,1,train,0.000000,1.000000
3,3375,2011-01-31,0,0,133.9500,133.9500,0,0,135.0,0,1,train,0.000000,1.000000
4,24612,2011-02-05,0,0,133.9500,133.9500,0,0,98.0,0,1,train,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164005,230645,2013-09-24,75,27,234.4125,234.4125,0,0,,0,1,test,0.000000,1.000000
164006,227367,2013-10-09,75,27,191.6625,234.4125,0,0,,1,0,test,0.182371,1.223048
164007,206086,2013-11-06,75,27,234.4125,234.4125,0,1,7.0,0,1,train,0.000000,1.000000
164008,178250,2013-12-02,75,27,177.4125,191.6625,0,1,35.0,1,0,train,0.074349,1.080321


In [15]:
trn = total_data.loc[total_data['train_or_test']=='train', :]
tst = total_data.loc[total_data['train_or_test']=='test', :]

In [16]:
trn

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,Lesser,Same,train_or_test,discount_on_base,discount_ratio
0,45883,2011-01-08,0,0,133.9500,133.9500,0,0,119.0,0,1,train,0.000000,1.000000
1,99,2011-01-17,0,0,134.6625,134.6625,0,0,114.0,0,1,train,0.000000,1.000000
2,1739,2011-01-24,0,0,133.9500,133.9500,0,0,87.0,0,1,train,0.000000,1.000000
3,3375,2011-01-31,0,0,133.9500,133.9500,0,0,135.0,0,1,train,0.000000,1.000000
4,24612,2011-02-05,0,0,133.9500,133.9500,0,0,98.0,0,1,train,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164002,191354,2013-09-04,75,27,213.0375,213.0375,0,0,9.0,0,1,train,0.000000,1.000000
164003,212644,2013-09-07,75,27,234.4125,234.4125,0,0,15.0,0,1,train,0.000000,1.000000
164007,206086,2013-11-06,75,27,234.4125,234.4125,0,1,7.0,0,1,train,0.000000,1.000000
164008,178250,2013-12-02,75,27,177.4125,191.6625,0,1,35.0,1,0,train,0.074349,1.080321


In [23]:
test.columns

Index(['record_ID', 'week', 'store_id', 'sku_id', 'total_price', 'base_price',
       'is_featured_sku', 'is_display_sku', 'units_sold', 'Lesser', 'Same',
       'train_or_test', 'discount_on_base', 'discount_ratio'],
      dtype='object')

In [18]:
params={
    "learning_rate" : [0.05,0.10,0.15,0.20,0.25,0.30],
    "max_depth" : [3,4,5,6,8,10,12,15],
    "min_child_weight" : [1,3,5,7],
    "gamma" : [0.0,0.1,0.2,0.3,0.4],
    "colsample_bytree" : [0.3,0.4,0.5,0.7],
    "n_estimator" : [100,300,500,700,900,1000,1200,1500,1700,2000]
}



In [19]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In [44]:
regressor=lgb.LGBMRegressor()

In [45]:
random_search=RandomizedSearchCV(regressor,param_distributions=params,n_iter=5,n_jobs=-1,cv=5,verbose=3)

In [46]:
features1=['record_ID','store_id', 'sku_id', 'total_price', 'base_price',
       'is_featured_sku', 'is_display_sku', 'Lesser', 'Same','discount_on_base', 'discount_ratio']

In [47]:
X=trn[features1]
y=trn['units_sold']
random_search.fit(X,y)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    5.9s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    6.6s finished


RandomizedSearchCV(cv=5, estimator=LGBMRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,
                                                             0.7],
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.25, 0.3],
                                        'max_depth': [3, 4, 5, 6, 8, 10, 12,
                                                      15],
                                        'min_child_weight': [1, 3, 5, 7],
                                        'n_estimator': [100, 300, 500, 700, 900,
                                                        1000, 1200, 1500, 1700,
                                                        2000]},
                   verbose=3)

In [48]:
random_search.best_estimator_

LGBMRegressor(colsample_bytree=0.4, gamma=0.2, max_depth=10, min_child_weight=1,
              n_estimator=300)

In [49]:
random_search.best_params_

{'n_estimator': 300,
 'min_child_weight': 1,
 'max_depth': 10,
 'learning_rate': 0.1,
 'gamma': 0.2,
 'colsample_bytree': 0.4}

In [50]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(regressor,X,y,cv=10)

In [51]:
score

array([0.42951886, 0.50885652, 0.61051938, 0.54441152, 0.55366506,
       0.55955641, 0.49071242, 0.59176865, 0.46079222, 0.51978612])

In [52]:
score.mean()

0.5269587168435492

In [53]:
regressor=lgb.LGBMRegressor(colsample_bytree=0.4, gamma=0.2, max_depth=10, min_child_weight=1,
              n_estimator=300)

In [54]:
regressor.fit(X,y)

LGBMRegressor(colsample_bytree=0.4, gamma=0.2, max_depth=10, min_child_weight=1,
              n_estimator=300)

In [55]:
test=test[features1]

In [56]:
df_submission = pd.read_csv('submission.csv')


In [57]:
df_submission

Unnamed: 0,record_ID,units_sold
0,212645,0
1,212646,0
2,212647,0
3,212648,0
4,212649,0
...,...,...
13855,232281,0
13856,232282,0
13857,232285,0
13858,232286,0


In [58]:
predict_test = regressor.predict(test)
df_submission.units_sold = predict_test
df_submission.to_csv('solution.csv',index = False)