In [None]:
import os
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'


The files below must be downloaded from Kaggle and placed in the notebook's working directory:

In [None]:
# Load the raw tables from the current working directory (paths are relative).
# Note: run_pipeline() re-reads these same files itself, so these module-level
# frames are only needed for interactive inspection.
train = pd.read_csv('train.csv',index_col = False)
test = pd.read_csv('test.csv')
holidays = pd.read_csv('holidays_events.csv')
oil = pd.read_csv('oil.csv')
# NOTE(review): sample_sub is not referenced anywhere else in the visible notebook.
sample_sub = pd.read_csv('sample_submission.csv')
stores = pd.read_csv('stores.csv')
transactions = pd.read_csv('transactions.csv')

In [None]:
def merge_stores(df,stores):
    """Attach the store metadata to every row of ``df``.

    Performs a left join on the shared ``store_nbr`` column, so every row of
    ``df`` is kept; rows whose store is missing from ``stores`` get NaN in the
    added columns.

    Returns a new DataFrame; neither input is modified.
    """
    return df.merge(stores, on='store_nbr', how='left')

In [None]:
def merge_oil(df,oil):
    """Attach a gap-free oil price (``dcoilwtico``) to every row of ``df``.

    The oil table has no entry for some dates that occur in ``df`` and contains
    missing prices. A per-date price series is therefore built over the dates
    present in ``df``, gaps are filled by linear interpolation, and the filled
    series is left-joined back onto ``df`` by ``date``.

    Parameters
    ----------
    df : DataFrame with a ``date`` column.
    oil : DataFrame with ``date`` and ``dcoilwtico`` columns.

    Returns
    -------
    A new DataFrame with the columns of ``df`` plus the oil columns.
    """
    # Interpolation works by row position, so the dates must be in order.
    dates = pd.DataFrame({'date': df['date'].unique()}).sort_values('date', ignore_index=True)

    oil_df = pd.merge(dates, oil, how='left', on='date')
    # limit_direction='both' also fills leading gaps (with the first known
    # price), which replaces the former hand-written "copy row 1 into row 0".
    oil_df['dcoilwtico'] = oil_df['dcoilwtico'].interpolate(method='linear', limit_direction='both')

    # Merge the *filled* series; merging the raw `oil` table here would throw
    # the interpolation away and leave NaN prices in the result.
    return pd.merge(df, oil_df, how='left', on='date')

In [None]:
def _flag_place_holidays(df, holidays, locale, place_col, flag_col):
    """Add a 0/1 column ``flag_col`` marking rows of ``df`` whose
    (``date``, ``place_col``) pair matches a holiday of the given ``locale``."""
    flagged = (
        holidays.loc[holidays['locale'] == locale, ['date', 'locale_name']]
        # Several events can share one date and place; keep a single row per
        # pair so the left merge below cannot multiply rows of `df`.
        .drop_duplicates()
        .assign(**{flag_col: 1})
    )
    df = pd.merge(df, flagged, how='left',
                  left_on=['date', place_col], right_on=['date', 'locale_name'])
    df = df.drop(columns='locale_name')
    df[flag_col] = df[flag_col].fillna(0)
    return df


def create_holidays(df,holidays):
    """Add ``is_national``, ``is_regional`` and ``is_local`` holiday flags.

    Rows of ``holidays`` with ``transferred`` set are ignored. The remaining
    events are matched as follows:

    * ``locale == 'National'``: by ``date`` only
    * ``locale == 'Regional'``: by ``date`` and ``df['state']``
    * ``locale == 'Local'``:    by ``date`` and ``df['city']``

    Parameters
    ----------
    df : DataFrame with ``date``, ``state`` and ``city`` columns.
    holidays : DataFrame with ``date``, ``locale``, ``locale_name`` and
        ``transferred`` columns.

    Returns
    -------
    A new DataFrame with the same rows as ``df`` (fresh RangeIndex) and the
    three flag columns appended; unmatched rows are 0.
    """
    holidays = holidays[holidays['transferred'] == False]

    #national
    national = (
        holidays.loc[holidays['locale'] == 'National', ['date']]
        .drop_duplicates()
        .assign(is_national=1)
    )
    df = pd.merge(df, national, how='left', on='date')
    df['is_national'] = df['is_national'].fillna(0)

    #regional
    df = _flag_place_holidays(df, holidays, 'Regional', 'state', 'is_regional')

    #local
    # De-duplicating on (date, locale_name) replaces the former hard-coded
    # `drop([265])`, which only worked for one specific holidays file and
    # raised KeyError when that index label was absent.
    df = _flag_place_holidays(df, holidays, 'Local', 'city', 'is_local')

    return df


In [None]:
def add_transactions(df,transactions,is_test = False):
    """Join daily transaction counts onto ``df`` and add an ``is_closed`` flag.

    ``transactions`` is left-joined on (``date``, ``store_nbr``).

    * Training data (``is_test`` false): a row without a transaction record is
      flagged ``is_closed = 1`` and its ``transactions`` value is set to 0.
    * Test data (``is_test`` true): ``is_closed`` is 0 for every row and
      missing ``transactions`` values are left as NaN.

    Returns a new DataFrame; the inputs are not modified.
    """
    df = pd.merge(df, transactions, how='left', on=['date', 'store_nbr'])

    if is_test:
        df['is_closed'] = 0
    else:
        df['is_closed'] = df['transactions'].isna().astype('int64')
        # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
        # temporary under pandas copy-on-write and silently changes nothing.
        df['transactions'] = df['transactions'].fillna(0)

    return df


In [None]:
def get_previous_month(df):
    """Add ``previous_month_sales``: the family's total sales in the prior month.

    Sales are summed per (``year``, ``month``, ``family``). Within each family
    the monthly totals are shifted down by one row, so every month receives the
    total of the preceding month *present in the data* for that family (the
    first month of each family gets NaN). The result is left-joined back onto
    ``df``.

    Parameters
    ----------
    df : DataFrame with ``year``, ``month``, ``family`` and ``sales`` columns.

    Returns
    -------
    A new DataFrame with the rows of ``df`` plus ``previous_month_sales``.
    """
    # groupby sorts its keys, so each family's rows are in chronological order.
    monthly = df.groupby(['year', 'month', 'family'])['sales'].sum().reset_index()

    # One grouped shift replaces the per-family loop built on DataFrame.append,
    # which was removed in pandas 2.0.
    monthly['previous_month_sales'] = monthly.groupby('family')['sales'].shift(1)
    monthly = monthly.drop(columns='sales')

    return pd.merge(df, monthly, how='left', on=['year', 'month', 'family'])

In [None]:
def run_pipeline():
    """Build the modelling dataset and write one CSV per product family.

    Steps:

    1. Read the raw CSVs from the current working directory.
    2. Enrich train and test with store metadata, oil price, holiday flags and
       transaction counts.
    3. Stack train and test (test rows get ``sales = NaN``) and derive calendar
       features and ``previous_month_sales``.
    4. Transform ``sales`` and ``previous_month_sales`` with ``log1p(x + 1)``;
       the inverse is ``expm1(y) - 1``.
    5. For every family, one-hot encode the remaining text columns, strip
       ``_ / , `` and spaces from the column names, and write
       ``data/<family>.csv``.

    Returns None; the output is the set of files under ``data/``.
    """
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    holidays = pd.read_csv('holidays_events.csv')
    oil = pd.read_csv('oil.csv')
    stores = pd.read_csv('stores.csv')
    transactions = pd.read_csv('transactions.csv')

    train_temp = add_transactions(
        create_holidays(merge_oil(merge_stores(train, stores), oil), holidays),
        transactions,
    )
    # is_test=True: the test period has no transaction records, so without it
    # every test row would be flagged as a closed store with 0 transactions.
    test_temp = add_transactions(
        create_holidays(merge_oil(merge_stores(test, stores), oil), holidays),
        transactions,
        is_test=True,
    )
    test_temp['sales'] = np.nan

    # Train rows first, test rows last.
    full_data = pd.concat([train_temp, test_temp])

    # Family names become file names, so remove spaces and slashes.
    full_data['family'] = (
        full_data['family']
        .str.replace(' ', '', regex=False)
        .str.replace('/', '', regex=False)
    )
    list_of_items = full_data['family'].unique()

    full_data['date'] = pd.to_datetime(full_data['date'])
    full_data['dayofweek'] = full_data['date'].dt.day_of_week
    full_data['dayofyear'] = full_data['date'].dt.day_of_year
    full_data['dayofmonth'] = full_data['date'].dt.day
    # Cast to str so get_dummies one-hot encodes these identifiers.
    full_data['store_nbr'] = full_data['store_nbr'].astype('str')
    full_data['cluster'] = full_data['cluster'].astype('str')
    full_data['year'] = full_data['date'].dt.year
    full_data['month'] = full_data['date'].dt.month

    full_data = get_previous_month(full_data)
    # year/month were only needed as join keys for get_previous_month.
    full_data = full_data.drop(columns=['year', 'month'])

    # log1p(x + 1) == log(x + 2); invert with expm1(y) - 1.
    full_data['sales'] = np.log1p(full_data['sales'] + 1)
    full_data['previous_month_sales'] = np.log1p(full_data['previous_month_sales'] + 1)

    # Write with explicit paths instead of os.chdir, so a failure part-way
    # through cannot leave the kernel in a different working directory.
    os.makedirs('data', exist_ok=True)

    for item in list_of_items:
        item_df = full_data[full_data['family'] == item].drop(columns=['family'])
        item_df = pd.get_dummies(item_df)
        item_df.columns = (
            item_df.columns
            .str.replace('_', '', regex=False)
            .str.replace('/', '', regex=False)
            .str.replace(' ', '', regex=False)
            .str.replace(',', '', regex=False)
        )

        item_df.to_csv(os.path.join('data', str(item) + '.csv'), index=False)

    


   
    

In [None]:
# Build the per-family feature files under data/ from the raw CSVs in the working directory.
run_pipeline()

## Running GBM over all data

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMRegressor

# Shared scaler instance: validate_lgbrm() and run_lgbrm() both re-fit it on
# their own training split every time they are called.
scaler = MinMaxScaler()
# Disabled leftover (scaling is done inside the functions instead):
#train[train.columns] = scaler.fit_transform(train[train.columns])

## Creating one model for each Item

Creating the LGB model:

In [None]:
# Single shared regressor: validate_lgbrm() and run_lgbrm() call .fit() on this
# same object once per family file, replacing the previous fit each time.
# NOTE(review): the hyperparameter values are fixed here; no tuning code is
# visible in this notebook, so their origin should be confirmed.
lgbmr_model = LGBMRegressor(
        colsample_bytree=0.7,
        learning_rate=0.055,
        min_child_samples=10,
        num_leaves=19,
        objective='regression',
        n_estimators=1000,
        n_jobs=4,
        random_state=337)  # fixed seed so repeated runs are comparable


Function for validation

In [None]:
def validate_lgbrm(df, holdout_rows=864):
    """Hold out the last labelled rows of one family file and score the model.

    The trailing ``holdout_rows`` rows of ``df`` are discarded first
    (presumably the unlabelled test-period rows that run_pipeline appends
    after the training rows; verify that 864 matches the data). Of the
    remaining rows, the last ``holdout_rows`` form the validation set and
    everything before them is used for training.

    The ``sales`` column is expected on the ``log1p(sales + 1)`` scale written
    by run_pipeline. Targets and predictions are mapped back to raw sales with
    ``expm1(y) - 1`` before scoring, so the result is the RMSLE on raw sales.

    Side effects: re-fits the module-level ``scaler`` and ``lgbmr_model``.

    Parameters
    ----------
    df : DataFrame with ``id``, ``date``, ``sales`` and numeric feature columns.
    holdout_rows : positive int, size of both the discarded tail and the
        validation window. Defaults to 864.

    Returns
    -------
    float : root mean squared logarithmic error on the validation window.
    """
    df = df.set_index('id', drop=True)
    labelled = df[:-holdout_rows]

    train_part = labelled[:-holdout_rows]
    val_part = labelled[-holdout_rows:]

    # training
    y_train = train_part['sales']
    x_train = train_part.drop(columns=['sales', 'date'])
    x_train[x_train.columns] = scaler.fit_transform(x_train[x_train.columns])

    # validation (scaled with the statistics learned on the training split)
    y_val = val_part['sales']
    x_val = val_part.drop(columns=['sales', 'date'])
    x_val[x_val.columns] = scaler.transform(x_val[x_val.columns])

    lgbmr_model.fit(x_train, y_train)
    predictions = lgbmr_model.predict(x_val)

    # Undo log1p(x + 1). Clip at 0 because sales cannot be negative and
    # mean_squared_log_error rejects negative inputs (this also absorbs tiny
    # negative values caused by floating-point round-trip error).
    actual_sales = np.clip(np.expm1(y_val) - 1, 0, None)
    predicted_sales = np.clip(np.expm1(predictions) - 1, 0, None)

    # mean_squared_log_error returns the *mean squared* error; take the square
    # root to obtain the RMSLE that this function reports.
    rmsle = float(np.sqrt(mean_squared_log_error(actual_sales, predicted_sales)))

    return rmsle

    


This code runs the validation function on every per-family file in `data/` and prints the sum of the per-file scores:

In [None]:
# Validate every per-family file and accumulate the scores.
# Files are addressed by explicit path instead of os.chdir, so an error inside
# the loop cannot leave the kernel in a different working directory.
rmsle = 0  # running SUM of the per-file scores, not an average
for file in sorted(os.listdir('data')):
    # Skip anything that is not a feature file (e.g. .ipynb_checkpoints).
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join('data', file))
    rmsle = rmsle + validate_lgbrm(df)

print(rmsle)

Function for training and testing:

In [None]:
def run_lgbrm(df, test_start='2017-08-16'):
    """Train on the rows before ``test_start`` and predict the rows from it on.

    The ``previousmonthsales`` column is dropped before training. Rows are
    split by comparing ``df['date']`` with ``test_start``; for dates stored as
    ``YYYY-MM-DD`` strings this comparison is lexicographic, which coincides
    with chronological order.

    Side effects: re-fits the module-level ``scaler`` and ``lgbmr_model``.

    Parameters
    ----------
    df : DataFrame with ``id``, ``date``, ``sales``, ``previousmonthsales`` and
        numeric feature columns.
    test_start : first date of the prediction period. Defaults to
        '2017-08-16'.

    Returns
    -------
    DataFrame with columns ``id`` and ``sales`` holding the raw model output
    (same scale as the ``sales`` column of ``df``) for the prediction period.
    """
    df = df.set_index('id', drop=True).drop(columns='previousmonthsales')

    train_rows = df[df['date'] < test_start]
    test_rows = df[df['date'] >= test_start]

    # training
    y_train = train_rows['sales']
    x_train = train_rows.drop(columns=['sales', 'date'])
    x_train[x_train.columns] = scaler.fit_transform(x_train[x_train.columns])

    # testing (scaled with the statistics learned on the training rows)
    x_test = test_rows.drop(columns=['sales', 'date'])
    x_test[x_test.columns] = scaler.transform(x_test[x_test.columns])

    # Fitting model to training data and predicting on the testing set
    lgbmr_model.fit(x_train, y_train)
    x_test['sales'] = lgbmr_model.predict(x_test)

    return x_test.reset_index()[['id', 'sales']]


    





This code runs training and prediction on every per-family file and collects the predictions into a single DataFrame:

In [None]:
# Predict every per-family file and stack the results.
# Parts are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
prediction_parts = []
for file in sorted(os.listdir('data')):
    # Skip anything that is not a feature file (e.g. .ipynb_checkpoints).
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join('data', file))
    prediction_parts.append(run_lgbrm(df))

if prediction_parts:
    predicted = pd.concat(prediction_parts, ignore_index=True)
else:
    # No feature files found: keep an empty, correctly typed frame.
    predicted = pd.DataFrame({'id': pd.Series(dtype='int64'),
                              'sales': pd.Series(dtype='float64')})

# Undo the log1p(sales + 1) transform applied in run_pipeline.
predicted['sales'] = np.expm1(predicted['sales']) - 1


Lastly, we set negative predictions to zero and sort the predictions by `id` before saving:

In [None]:
# Sales cannot be negative: floor the predictions at zero (vectorised,
# NaN values are left untouched).
predicted['sales'] = predicted['sales'].clip(lower=0)

# Displayed for inspection only; `predicted` itself is not reordered here.
predicted.sort_values(by = 'id')


In [None]:
# Write the predictions ordered by id, without the DataFrame index column.
(
    predicted
    .sort_values(by='id')
    .to_csv('predictions.csv', index=False)
)