In [118]:
import pandas as pd
import numpy as np
import lightgbm as lgb  

In [7]:
def _iso_week_of_year(dates):
  """Return the ISO week number of every timestamp in the datetime Series *dates*."""
  accessor = dates.dt
  if hasattr(accessor, 'isocalendar'):
    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # yields the same ISO week numbers (as UInt32, hence the cast).
    return accessor.isocalendar().week.astype('int64')
  # Older pandas releases without `isocalendar`.
  return accessor.weekofyear


def final_fun_1(X):
  """Forecast sales for the rows of *X* with a LightGBM model.

  Steps:
    1. Read the history from 'train.csv' (needs 'date', 'store', 'item' and
       'sales' columns).
    2. Give every row of *X* a provisional sales value: item/day-of-week mean
       x relative month level x relative store level x a quadratic yearly
       trend evaluated at 2018.
    3. Stack history and *X*, derive calendar, expanding-mean, lag,
       rolling-mean, exponentially-weighted and one-hot features.
    4. Train LightGBM (L1 objective) on log1p(sales) and predict the rows of
       *X*.

  Args:
    X: DataFrame indexed by a DatetimeIndex with at least 'store' and 'item'
      columns. An 'id' column, if present, is not used as a feature. *X* is
      not modified.

  Returns:
    numpy.ndarray with one rounded sales prediction per row of *X*, in the
    row order of *X*.

  Notes:
    - The lag and rolling features get unseeded Gaussian noise added, so two
      calls do not return identical numbers.
    - The model is fitted on all rows, including the rows of *X* with their
      provisional sales values as labels.
  """
  store_data = pd.read_csv('train.csv', parse_dates=['date'], index_col=['date'])
  # Work on a copy so the helper columns added below do not leak into the caller's frame.
  test_data = X.copy()
  n_train = len(store_data)

  for frame in (store_data, test_data):
    frame['day'] = frame.index.day
    frame['month'] = frame.index.month
    frame['year'] = frame.index.year
    frame['dayofweek'] = frame.index.dayofweek

  # Overall average sale value, used to turn group means into relative levels.
  average = store_data.sales.mean()

  # Relative sales level per year.
  year_pivot_table = pd.pivot_table(store_data, index='year', values='sales', aggfunc='mean') / average

  # Weighted quadratic fit of the yearly level; recent years weigh more.
  # The fit assumes the history covers exactly the years 2013-2017.
  store_years = np.arange(2013, 2019)
  weight = np.exp((store_years - 2018) / 10)[:-1]
  store_annual_growth = np.poly1d(np.polyfit(store_years[:-1], year_pivot_table.values.squeeze(), 2, w=weight))

  # Mean sales per (day of week, item), in absolute units.
  dayofweek_pivot_table = pd.pivot_table(store_data, index='dayofweek', columns='item', values='sales', aggfunc='mean')

  # Relative sales level per month.
  month_pivot_table = pd.pivot_table(store_data, index='month', values='sales', aggfunc='mean') / average

  # Relative sales level per store.
  store_pivot_table = pd.pivot_table(store_data, index='store', values='sales', aggfunc='mean') / average

  # Provisional 2018 sales: day-of-week/item mean scaled by the month level,
  # the store level and the 2018 growth factor (evaluated once, not per row).
  growth_2018 = store_annual_growth(2018)
  sales_pred_2018 = [
      int(np.round(
          dayofweek_pivot_table.at[stamp.dayofweek, item_id]
          * month_pivot_table.at[stamp.month, 'sales']
          * store_pivot_table.at[store_id, 'sales']
          * growth_2018, 0))
      for stamp, item_id, store_id in zip(test_data.index, test_data['item'], test_data['store'])
  ]

  test_data['sales'] = sales_pred_2018
  store = pd.concat([store_data, test_data], sort=False)
  # Make sure reset_index produces a 'date' column even if X's index is unnamed.
  store.index.name = 'date'
  # After this, row labels 0..n_train-1 are history and n_train.. are the rows of X, in X's order.
  store.reset_index(inplace=True)

  store['dayofyear'] = store.date.dt.dayofyear
  store['weekofyear'] = _iso_week_of_year(store.date)
  # NOTE(review): weekday // 4 is 1 for Friday, Saturday and Sunday - confirm Friday is meant to count.
  store['weekend_yes'] = store.date.dt.weekday // 4
  store['month_start_yes'] = store.date.dt.is_month_start.astype(int)
  store['month_end_yes'] = store.date.dt.is_month_end.astype(int)
  store['quarter'] = store.date.dt.quarter
  store['weekofmonth'] = store['weekofyear'].values // 4.35
  for day_number, day_name in enumerate(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']):
    store[day_name + '_yes'] = np.where(store['dayofweek'] == day_number, 1, 0)

  # Expanding mean of sales within each (store, item, calendar bucket).
  exp_time_features = ['dayofweek', 'weekofmonth', 'weekofyear', 'month', 'quarter', 'weekend_yes']
  for exp_item in exp_time_features:
    expanding_mean = store.groupby(['store', 'item', exp_item])['sales'].expanding().mean().bfill()
    # Drop the three group levels so only the original row label remains, then
    # let pandas align on it. A positional write-back would put values on the
    # wrong rows because the frame is "history then X", not sorted by series.
    store['exp_' + exp_item] = expanding_mean.reset_index(level=[0, 1, 2], drop=True)

  store.sort_values(by=['item', 'store', 'date'], axis=0, inplace=True)

  lag_windows = [8, 15, 22, 29, 30, 31, 38, 61, 67, 73, 91, 98, 105, 112, 180, 270, 365, 546, 728]
  sales_by_series = store.groupby(["item", "store"])['sales']

  # Lagged sales per (item, store) series, with a little noise.
  for lag in lag_windows:
    store['l_' + str(lag)] = sales_by_series.shift(lag) + np.random.normal(scale=0.01, size=(len(store),))

  # Triangular-window rolling means of the previous days' sales, with a little noise.
  for window in lag_windows:
    store['r_' + str(window)] = sales_by_series.transform(
        lambda y, window=window: y.shift(1).rolling(window=window, min_periods=8, win_type="triang").mean()
    ) + np.random.normal(scale=0.01, size=(len(store),))

  # Exponentially weighted means of lagged sales for every (alpha, lag) pair.
  ewm_alphas = [0.95, 0.9, 0.8, 0.7, 0.5, .4, .3, .2, .1]
  for alpha in ewm_alphas:
    for lag in lag_windows:
      store['ewm_a_' + str(alpha) + "_l_" + str(lag)] = sales_by_series.transform(
          lambda y, alpha=alpha, lag=lag: y.shift(lag).ewm(alpha=alpha).mean()
      )

  # One-hot encodings are added next to (not instead of) the raw columns.
  store_encoding = pd.get_dummies(store[['store', 'item', 'dayofweek', 'month']], columns=['store', 'item', 'dayofweek', 'month'], dummy_na=True)
  store_final = pd.concat([store, store_encoding], axis=1)

  # The model is trained on log1p(sales); predictions are mapped back with expm1.
  store_final['sales'] = np.log1p(store_final["sales"].values)

  store_lgbm_columns = [column for column in store_final.columns if column not in ['date', 'id', 'sales', 'year']]

  iteration = 15000

  store_lgbm_parms = {
      'nthread': -1,
      'metric': 'mae',
      'boosting_type': 'gbdt',
      'max_depth': 7,
      'num_leaves': 28,
      'task': 'train',
      'objective': 'regression_l1',
      'learning_rate': 0.05,
      'feature_fraction': 0.9,
      'bagging_fraction': 0.8,
      'bagging_freq': 5,
      'lambda_l1': 0.06,
      'lambda_l2': 0.05,
      'verbose': -1,
  }

  train_features = store_final[store_lgbm_columns]
  train_labels = store_final['sales']

  # Rows of X carry the labels n_train and above; sorting on the label undoes
  # the (item, store, date) sort so predictions line up with X row by row.
  test_rows = store_final.loc[store_final.index >= n_train].sort_index()
  test = test_rows[store_lgbm_columns]

  store_lgbm_dataset = lgb.Dataset(data=train_features, label=train_labels, feature_name=store_lgbm_columns)
  store_lgbm_model = lgb.train(store_lgbm_parms, store_lgbm_dataset, num_boost_round=iteration)

  # Predict, then leave log scale and round to whole units.
  store_lgbm_preds = store_lgbm_model.predict(test, num_iteration=iteration)
  return np.round(np.expm1(store_lgbm_preds), 0)

In [119]:
def final_fun_2(X,Y):
  """Return the SMAPE (in percent) of the final_fun_1 forecast for *X* against *Y*.

  Args:
    X: Frame to forecast; passed unchanged to final_fun_1.
    Y: Actual sales, one value per row of X (Series, array, list or
      single-column DataFrame).

  Returns:
    200 * sum(|pred - actual| / (|pred| + |actual|)) / n, where n is the total
    number of rows. Rows where prediction and actual are both 0 contribute 0
    to the sum but still count towards n.

  Raises:
    ValueError: if the number of actual values differs from the number of
      predictions.

  NOTE(review): predictions and Y are compared position by position, so Y
  must be in the same row order as the array final_fun_1 returns - confirm
  against the caller.
  """
  # The forecasting pipeline is the same as final_fun_1, so reuse it rather
  # than keeping a second copy of it in sync by hand.
  predicted_sales = np.asarray(final_fun_1(X), dtype=float)
  # Flatten to a plain array so the comparison is purely positional.
  actual_sales = np.asarray(Y, dtype=float).ravel()
  if predicted_sales.shape != actual_sales.shape:
    raise ValueError(
        'Y has %d values but %d predictions were produced'
        % (actual_sales.size, predicted_sales.size))

  pred_length = len(predicted_sales)
  pred_smape_num = np.abs(predicted_sales - actual_sales)
  pred_smape_den = np.abs(predicted_sales) + np.abs(actual_sales)
  # The denominator is 0 exactly when prediction and actual are both 0; skip
  # those terms to avoid 0/0.
  scored = pred_smape_den != 0
  pred_smape = (200 * np.sum(pred_smape_num[scored] / pred_smape_den[scored])) / pred_length
  return pred_smape