# Evaluation Notebook

This notebook computes the WRMSSE validation scores for our pretrained models on the last 28 days of the training set.
The WRMSSE scores are calculated as in the Kaggle competition, across all the hierarchical levels.

In [2]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
import lightgbm as lgb
# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')
import wandb
from wandb.lightgbm import wandb_callback
import ipdb
from scipy.sparse import csr_matrix

## WRMSSE score

In [3]:
# Path prefix for the pickled WRMSSE artefacts ('' means the working directory).
file_pass = ''

# S, W and SW vectors used by the WRMSSE calculation.
# NOTE(review): pd.read_pickle unpickles arbitrary objects - only load files you produced yourself.
sw_df = pd.read_pickle(file_pass + 'sw_df.pkl')
S, W, SW = sw_df['s'].values, sw_df['w'].values, sw_df['sw'].values

# Roll-up matrix used to compute the aggregates. Only its index and a sparse
# (CSR) copy are kept; the dense frame is released right after conversion.
roll_mat_df = pd.read_pickle(file_pass + 'roll_mat_df.pkl')
roll_mat_csr = csr_matrix(roll_mat_df.values)
roll_index = roll_mat_df.index
del roll_mat_df

In [4]:
# Quick roll-up of bottom-level values through the sparse roll-up matrix:
def rollup(v):
    '''
    Multiply the roll-up matrix by v.

    v - np.array with one row per bottom-level series and one column per day
        (30490 rows according to the original notes)
    returns - roll_mat_csr times v: one row per row of roll_mat_csr
        (42840 according to the original notes) and one column per day
    '''
    rolled_up = roll_mat_csr.dot(v)   # equivalent to (v.T*roll_mat_csr.T).T
    return rolled_up


# WRMSSE over all hierarchical levels:
def wrmsse(preds, y_true, score_only=False, s = S, w = W, sw=SW):
    '''
    Score predictions against true values after rolling both up with rollup().

    preds - Predictions: pd.DataFrame of size (30490 rows, N day columns)
    y_true - True values: pd.DataFrame of the same size and row order as preds
    score_only - if True, return only the scalar score
    s - per-aggregate divisor of the squared error: np.array (defaults to S)
    w - per-aggregate weight: np.array (defaults to W)
    sw - per-aggregate factor multiplied with the RMSE in the score_only path:
         np.array (defaults to SW); presumably w / sqrt(s) - confirm where sw_df is built

    Returns the score when score_only is True, otherwise (score, score_matrix),
    where score_matrix holds squared error * w**2 / s per aggregate (rows) and
    day (columns).
    '''
    # Squared error of every aggregate for every day.
    squared_error = np.square(rollup(preds.values - y_true.values))

    if not score_only:
        score_matrix = (squared_error * np.square(w)[:, None]) / s[:, None]
        # 12 is presumably the number of hierarchical levels being averaged - confirm.
        score = np.sum(np.sqrt(np.mean(score_matrix, axis=1))) / 12
        return score, score_matrix

    rmse_per_aggregate = np.sqrt(np.mean(squared_error, axis=1))
    return np.sum(rmse_per_aggregate * sw) / 12

## Making Predictions

In [16]:
# Run configuration. This cell only sets constants and reads the list of stores.
N_CORES = psutil.cpu_count()     # Available CPU cores
ver = 'finalRun'                 # Version tag used in model and submission file names

#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1941               # End day of our train set
END_VALID   = 1969               # End day of our validation set
P_HORIZON   = 28                 # Prediction horizon

weight_features = ['Weight', 'ScalingFactor', 'CombinedWeight']
# FEATURES to remove: these lead to overfit or hold values not present in the test set
remove_features = ['id', 'store_id', 'state_id',
                   'date', 'wm_yr_wk', 'd', TARGET] + weight_features
mean_features   = ['enc_cat_id_mean', 'enc_cat_id_std',
                   'enc_dept_id_mean', 'enc_dept_id_std',
                   'enc_item_id_mean', 'enc_item_id_std']

#PATHS for Features
#ORIGINAL = '../input/m5-forecasting-accuracy/'
BASE     = 'grid_part_1.pkl'
PRICE    = 'grid_part_2.pkl'
CALENDAR = 'grid_part_3.pkl'
LAGS     = 'lags_df_28.pkl'
MEAN_ENC = 'mean_encoding_df.pkl'


#STORES ids
# Only the store_id column is needed, so avoid parsing every daily sales column.
STORES_IDS = pd.read_csv('sales_train_evaluation.csv', usecols=['store_id'])['store_id']
STORES_IDS = list(STORES_IDS.unique())

#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY + N_LAGS))
# Every [i, j] pair with i in (1, 7, 14) and j in (7, 14, 30, 60), i varying slowest.
ROLS_SPLIT = [[i, j] for i in [1, 7, 14] for j in [7, 14, 30, 60]]

In [6]:
def get_data_by_store(store):
    '''
    Build the feature grid of one store from the pickled feature files.

    store - value matched against the 'store_id' column
    returns - (grid, features): the grid restricted to `store` and to rows with
        d >= START_TRAIN (index reset), and the list of columns that are not in
        remove_features
    '''
    # Base grid plus price and calendar features. The first two columns of the
    # price/calendar files are skipped (presumably keys already in BASE - confirm).
    store_grid = pd.concat(
        [pd.read_pickle(BASE),
         pd.read_pickle(PRICE).iloc[:, 2:],
         pd.read_pickle(CALENDAR).iloc[:, 2:]],
        axis=1,
    )

    # Keep only the rows of the requested store.
    store_grid = store_grid[store_grid['store_id'] == store]

    # Because of memory limits the mean-encoding and lag features are read
    # separately and trimmed to the store rows before being attached.
    # The feature grids share an index, so the index selects the rows,
    # and concat is used because it needs less memory than merge.
    encodings = pd.read_pickle(MEAN_ENC)[mean_features]
    encodings = encodings[encodings.index.isin(store_grid.index)]

    lag_features = pd.read_pickle(LAGS).iloc[:, 3:]
    lag_features = lag_features[lag_features.index.isin(store_grid.index)]

    store_grid = pd.concat([store_grid, encodings], axis=1)
    del encodings      # release early to stay under the memory limit

    store_grid = pd.concat([store_grid, lag_features], axis=1)
    del lag_features   # release early to stay under the memory limit

    # Model features are all columns that are not explicitly excluded.
    features = [name for name in store_grid.columns if name not in remove_features]
    key_columns = ['id', 'd', 'store_id', 'state_id', TARGET]
    store_grid = store_grid[key_columns + features]

    # Skip the first rows (days before START_TRAIN).
    from_start = store_grid['d'] >= START_TRAIN
    store_grid = store_grid[from_start].reset_index(drop=True)

    return store_grid, features

In [10]:
index_columns = ['id', 'd']

# Last day of the window that is scored. The truth used for scoring is the last
# 28 day columns of sales_train_validation.csv (d_1886..d_1913), so the window is
# pinned here instead of reusing END_TRAIN: with END_TRAIN = 1941 this cell would
# predict days 1914-1941 and the score would compare mismatched days.
SCORE_END = 1913

score_store_preds = []   # one (id x day) prediction frame per store
for store_id in STORES_IDS:
    print('Predict', store_id)

    grid_df, features_columns = get_data_by_store(store_id)

    # Rows of the P_HORIZON days ending at SCORE_END. These days are presumably
    # part of what the saved models were trained on, so this is not a true
    # hold-out validation - confirm against the training notebook.
    valid_mask = (grid_df['d'] > (SCORE_END - P_HORIZON)) & (grid_df['d'] <= SCORE_END)
    valid_grid = grid_df[valid_mask]

    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    model_path = 'lgb_model_'+store_id+'_v'+str(ver)+'.bin'
    with open(model_path, 'rb') as model_file:   # close the handle deterministically
        estimator = pickle.load(model_file)
    predictions = estimator.predict(valid_grid[features_columns])

    # Reshape long (id, d, preds) rows to one row per id and one column per day.
    index_df = valid_grid[index_columns].copy()   # copy: avoid writing into a slice
    index_df['preds'] = predictions
    score_store_preds.append(index_df.pivot(index='id', columns='d', values='preds'))

    # "Keep" the model's feature list
    MODEL_FEATURES = features_columns

# Single concat instead of DataFrame.append in the loop (append was removed in
# pandas 2.0 and re-copies the accumulated frame on every iteration).
preds_df = pd.concat(score_store_preds) if score_store_preds else pd.DataFrame()

Predict CA_1
Predict CA_2
Predict CA_3
Predict CA_4
Predict TX_1
Predict TX_2
Predict TX_3
Predict WI_1
Predict WI_2
Predict WI_3


### Create complete prediction set

In [17]:
# Rename '...evaluation' ids to '...validation' so they can be joined on id
# with the rows of sales_train_validation.csv.
def _to_validation_id(series_id):
    return series_id.replace('evaluation', 'validation')

preds_df.index = preds_df.index.map(_to_validation_id)
preds_df

d,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,...,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_validation,0.933637,0.834698,0.803652,0.875106,1.299386,1.324749,1.251613,1.028397,0.880317,0.907936,...,1.103275,1.360815,1.118590,0.977497,0.907244,0.863040,0.870472,0.950124,1.280733,1.135942
HOBBIES_1_002_CA_1_validation,0.205836,0.183706,0.187472,0.217172,0.243382,0.266216,0.344617,0.242593,0.190199,0.151782,...,0.191390,0.244915,0.234575,0.188412,0.198671,0.201376,0.211660,0.247425,0.265416,0.261552
HOBBIES_1_003_CA_1_validation,0.345288,0.269009,0.300798,0.308848,0.439553,0.585777,0.439859,0.320762,0.300572,0.305685,...,0.559790,0.773314,0.805557,0.407383,0.380388,0.396725,0.392558,0.596037,0.817954,0.777380
HOBBIES_1_004_CA_1_validation,1.617516,1.366384,1.218277,1.497055,2.052653,2.602316,3.248852,1.381824,1.335874,1.168665,...,1.701580,2.527587,3.032344,1.719650,1.397898,1.322182,1.529032,2.028407,3.336247,3.066222
HOBBIES_1_005_CA_1_validation,1.007678,0.980819,1.272441,1.184356,1.189447,1.533110,1.602870,0.954977,0.995106,0.904044,...,1.255361,1.531180,1.463198,1.124632,1.014596,1.026030,1.184397,1.351634,1.613385,1.584914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_validation,0.047456,0.056838,0.031055,0.030128,0.047298,0.050914,0.053528,0.034646,0.051444,0.039489,...,0.765872,0.620966,0.566474,0.324107,0.314715,0.301354,0.287086,0.436920,0.512525,0.559227
FOODS_3_824_WI_3_validation,0.006307,0.006953,0.009711,0.009804,0.022581,0.191300,1.136018,0.858709,0.471716,0.434291,...,0.345314,0.318431,0.309276,0.314683,0.302972,0.211897,0.188241,0.196051,0.230022,0.314092
FOODS_3_825_WI_3_validation,0.582982,0.505524,0.533812,0.541872,0.811374,1.399320,1.620090,1.010603,1.142836,1.437333,...,1.316829,1.118649,1.228334,0.812298,0.572926,0.565880,0.497168,0.625351,0.683890,0.775790
FOODS_3_826_WI_3_validation,0.999857,0.940076,0.846130,0.827423,0.874417,1.201116,1.379055,0.965682,0.933798,1.110384,...,1.045666,1.143711,1.065144,0.915787,0.915553,0.803919,0.777088,0.830864,1.166261,1.302178


In [18]:
# True sales: the last 28 day columns of the validation file, with id as a column.
true_df = (
    pd.read_csv('sales_train_validation.csv')
    .set_index('id')
    .iloc[:, -28:]
    .reset_index()
)
true_df

Unnamed: 0,id,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,1,0,0,0,0,0,1,0,4,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,1,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,3,1,2,1,3,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,1,0,4,4,0,1,4,0,1,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,0,0,0,0,0,1,1,2,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,1,0,0,0,0,1,1,2,1,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,3,0,0,0,0,0,1,1,2,...,0,0,1,0,0,1,0,3,1,3


In [19]:
# Put the predictions in the row order of true_df (inner join on id).
# Guarded so the cell can be re-run: once 'id' is already a column, calling
# reset_index() again would add an 'index' column that would then be scored
# as if it were an extra day.
preds_by_id = preds_df if 'id' in preds_df.columns else preds_df.reset_index()
preds_df = pd.merge(true_df['id'], preds_by_id, on='id')
preds_df

Unnamed: 0,id,1886,1887,1888,1889,1890,1891,1892,1893,1894,...,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913
0,HOBBIES_1_001_CA_1_validation,0.933637,0.834698,0.803652,0.875106,1.299386,1.324749,1.251613,1.028397,0.880317,...,1.103275,1.360815,1.118590,0.977497,0.907244,0.863040,0.870472,0.950124,1.280733,1.135942
1,HOBBIES_1_002_CA_1_validation,0.205836,0.183706,0.187472,0.217172,0.243382,0.266216,0.344617,0.242593,0.190199,...,0.191390,0.244915,0.234575,0.188412,0.198671,0.201376,0.211660,0.247425,0.265416,0.261552
2,HOBBIES_1_003_CA_1_validation,0.345288,0.269009,0.300798,0.308848,0.439553,0.585777,0.439859,0.320762,0.300572,...,0.559790,0.773314,0.805557,0.407383,0.380388,0.396725,0.392558,0.596037,0.817954,0.777380
3,HOBBIES_1_004_CA_1_validation,1.617516,1.366384,1.218277,1.497055,2.052653,2.602316,3.248852,1.381824,1.335874,...,1.701580,2.527587,3.032344,1.719650,1.397898,1.322182,1.529032,2.028407,3.336247,3.066222
4,HOBBIES_1_005_CA_1_validation,1.007678,0.980819,1.272441,1.184356,1.189447,1.533110,1.602870,0.954977,0.995106,...,1.255361,1.531180,1.463198,1.124632,1.014596,1.026030,1.184397,1.351634,1.613385,1.584914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0.047456,0.056838,0.031055,0.030128,0.047298,0.050914,0.053528,0.034646,0.051444,...,0.765872,0.620966,0.566474,0.324107,0.314715,0.301354,0.287086,0.436920,0.512525,0.559227
30486,FOODS_3_824_WI_3_validation,0.006307,0.006953,0.009711,0.009804,0.022581,0.191300,1.136018,0.858709,0.471716,...,0.345314,0.318431,0.309276,0.314683,0.302972,0.211897,0.188241,0.196051,0.230022,0.314092
30487,FOODS_3_825_WI_3_validation,0.582982,0.505524,0.533812,0.541872,0.811374,1.399320,1.620090,1.010603,1.142836,...,1.316829,1.118649,1.228334,0.812298,0.572926,0.565880,0.497168,0.625351,0.683890,0.775790
30488,FOODS_3_826_WI_3_validation,0.999857,0.940076,0.846130,0.827423,0.874417,1.201116,1.379055,0.965682,0.933798,...,1.045666,1.143711,1.065144,0.915787,0.915553,0.803919,0.777088,0.830864,1.166261,1.302178


## Computing WRMSSE

In [20]:
# Score the day columns (everything after the leading 'id' column) of predictions vs. truth.
wrmsse(preds_df.iloc[:, 1:], true_df.iloc[:, 1:])

(0.3627410929735014,
 array([[1.33673432e-02, 7.72408371e-04, 3.85012430e-02, ...,
         1.31095742e-03, 1.46801284e-02, 9.55349966e-03],
        [1.65529079e-02, 4.27192594e-03, 4.10132399e-03, ...,
         5.89206754e-04, 3.28257010e-03, 6.49017819e-04],
        [1.19673326e-05, 9.51462137e-03, 1.81523123e-03, ...,
         1.73443881e-09, 1.84193991e-03, 9.06754000e-03],
        ...,
        [1.58175795e-13, 1.16137338e-13, 1.39427315e-13, ...,
         1.88136753e-13, 6.75157290e-13, 6.40337959e-13],
        [8.68205665e-14, 1.10509178e-13, 1.08735606e-13, ...,
         1.30228190e-13, 1.72683919e-13, 1.22930991e-13],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]))

# Prediction

## Predicting validation set

In [17]:
index_columns = ['id', 'd']
valid_store_preds = []   # one (id x day) prediction frame per store
for store_id in STORES_IDS:
    print('Predict', store_id)

    grid_df, features_columns = get_data_by_store(store_id)

    # Rows of the last P_HORIZON days up to and including END_TRAIN.
    train_mask = grid_df['d'] <= END_TRAIN
    valid_mask = train_mask & (grid_df['d'] > (END_TRAIN - P_HORIZON))
    valid_grid = grid_df[valid_mask]

    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    model_path = 'lgb_model_'+store_id+'_v'+str(ver)+'.bin'
    with open(model_path, 'rb') as model_file:   # close the handle deterministically
        estimator = pickle.load(model_file)
    predictions = estimator.predict(valid_grid[features_columns])

    # Reshape long (id, d, preds) rows to one row per id and one column per day.
    index_df = valid_grid[index_columns].copy()   # copy: avoid writing into a slice
    index_df['preds'] = predictions
    valid_store_preds.append(index_df.pivot(index='id', columns='d', values='preds'))

    # "Keep" the model's feature list
    MODEL_FEATURES = features_columns

# Single concat instead of DataFrame.append in the loop (append was removed in
# pandas 2.0 and re-copies the accumulated frame on every iteration).
preds_df_valid = pd.concat(valid_store_preds) if valid_store_preds else pd.DataFrame()
# Rename '...evaluation' ids to '...validation'.
preds_df_valid.index = preds_df_valid.index.map(lambda x: x.replace('evaluation','validation'))

Predict CA_1
Predict CA_2
Predict CA_3
Predict CA_4
Predict TX_1
Predict TX_2
Predict TX_3
Predict WI_1
Predict WI_2
Predict WI_3


In [18]:
# Display the validation predictions (index: id, columns: day number d).
preds_df_valid

d,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923,...,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_validation,0.968052,0.846389,0.818432,0.854639,1.052062,1.341891,1.473242,0.921521,0.939520,0.842853,...,0.964595,1.410356,1.173984,0.911776,0.876774,0.841805,1.052188,1.231348,1.300107,1.145099
HOBBIES_1_002_CA_1_validation,0.204419,0.193445,0.151741,0.167550,0.202885,0.251356,0.248374,0.209721,0.207850,0.197859,...,0.274754,0.342914,0.302466,0.206141,0.184683,0.174162,0.192442,0.173567,0.238722,0.264245
HOBBIES_1_003_CA_1_validation,0.435802,0.411070,0.474542,0.466519,0.649205,0.987985,0.673831,0.482883,0.526515,0.490893,...,0.655281,0.763601,0.842098,0.551697,0.498877,0.506402,0.548348,0.771645,0.767158,0.721094
HOBBIES_1_004_CA_1_validation,1.495372,1.274237,1.389909,1.638269,2.270237,2.732851,2.968191,1.569027,1.213425,1.108753,...,1.744539,2.416827,3.078948,1.792574,1.378132,1.424342,1.457471,2.068065,3.387903,3.642703
HOBBIES_1_005_CA_1_validation,0.957936,0.884289,0.840177,0.998208,1.150126,1.337973,1.703712,1.053154,1.112821,1.025056,...,1.191730,1.748037,1.645003,1.012995,0.927642,0.979827,0.818266,1.085105,1.494029,1.443908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_validation,0.368243,0.388043,0.396559,0.387130,0.430372,0.524693,0.565906,0.496340,0.533340,0.408410,...,0.548221,0.724289,0.959957,0.525476,0.539004,0.490176,0.424678,0.469285,0.582856,0.692565
FOODS_3_824_WI_3_validation,0.276211,0.286433,0.267327,0.234123,0.232092,0.280713,0.321620,0.363986,0.393525,0.296339,...,0.170683,0.232569,0.292231,0.160536,0.166017,0.177527,0.166107,0.223462,0.320445,0.320686
FOODS_3_825_WI_3_validation,0.578205,0.522450,0.462635,0.451609,0.644276,0.782376,1.012051,0.970055,1.046544,0.693705,...,0.942234,1.138429,1.428346,0.966177,0.615458,0.615606,0.471726,0.642382,0.699849,0.833299
FOODS_3_826_WI_3_validation,1.206118,1.126163,0.878187,0.933736,1.050721,1.406829,1.229470,1.371375,1.263210,0.970169,...,1.073419,1.426492,1.506390,1.137581,1.213962,0.878476,0.874977,1.084926,1.048256,1.330711


## Predicting evaluation set

In [19]:
index_columns = ['id', 'd']
eval_store_preds = []   # one (id x day) prediction frame per store
for store_id in STORES_IDS:
    print('Predict', store_id)

    grid_df, features_columns = get_data_by_store(store_id)

    # Rows of the last P_HORIZON days up to and including END_VALID.
    not_eval_mask = grid_df['d'] <= END_VALID
    eval_mask = not_eval_mask & (grid_df['d'] > (END_VALID - P_HORIZON))
    eval_grid = grid_df[eval_mask]

    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    model_path = 'lgb_model_'+store_id+'_v'+str(ver)+'.bin'
    with open(model_path, 'rb') as model_file:   # close the handle deterministically
        estimator = pickle.load(model_file)
    predictions = estimator.predict(eval_grid[features_columns])

    # Reshape long (id, d, preds) rows to one row per id and one column per day.
    index_df = eval_grid[index_columns].copy()   # copy: avoid writing into a slice
    index_df['preds'] = predictions
    eval_store_preds.append(index_df.pivot(index='id', columns='d', values='preds'))

    # "Keep" the model's feature list
    MODEL_FEATURES = features_columns

# Single concat instead of DataFrame.append in the loop (append was removed in
# pandas 2.0 and re-copies the accumulated frame on every iteration).
preds_df_eval = pd.concat(eval_store_preds) if eval_store_preds else pd.DataFrame()

Predict CA_1
Predict CA_2
Predict CA_3
Predict CA_4
Predict TX_1
Predict TX_2
Predict TX_3
Predict WI_1
Predict WI_2
Predict WI_3


In [20]:
# Display the evaluation predictions (index: id, columns: day number d).
preds_df_eval

d,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,...,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_evaluation,0.956285,0.508918,0.500288,0.496227,0.573487,0.708126,0.695915,0.543169,0.422720,0.475428,...,0.509005,0.482422,0.490297,0.409241,0.397437,0.354469,0.374885,0.477338,0.500414,0.535085
HOBBIES_1_002_CA_1_evaluation,0.220830,0.316807,0.329848,0.333703,0.470354,0.526234,0.542835,0.329995,0.269354,0.312017,...,0.428008,0.442981,0.463392,0.307291,0.283765,0.301074,0.312100,0.402797,0.432115,0.434524
HOBBIES_1_003_CA_1_evaluation,0.549056,0.517010,0.486076,0.463068,0.668072,0.837130,0.738531,0.395407,0.400538,0.430554,...,0.506007,0.527769,0.525306,0.334904,0.333112,0.330644,0.363792,0.444063,0.461447,0.481740
HOBBIES_1_004_CA_1_evaluation,1.377182,0.842073,0.867012,0.889068,1.167183,1.197305,1.786262,1.041366,0.782573,0.714175,...,1.099494,0.903148,1.148229,0.903375,0.866047,0.863782,0.918084,1.141884,0.953543,1.134601
HOBBIES_1_005_CA_1_evaluation,0.990363,0.698564,0.598577,0.673380,0.921072,1.075520,1.114566,0.672423,0.603880,0.603136,...,0.772798,0.719584,0.837130,0.631342,0.614201,0.636361,0.695782,0.855223,0.846728,0.805124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.474207,0.635851,0.625845,0.629134,0.702754,0.804479,0.864042,0.351876,0.400226,0.442012,...,0.542415,0.596141,0.604872,0.365289,0.432198,0.514568,0.405698,0.480141,0.411126,0.378893
FOODS_3_824_WI_3_evaluation,0.244012,0.293010,0.305919,0.254819,0.293989,0.333191,0.356492,0.199890,0.206377,0.235224,...,0.304568,0.366428,0.361507,0.241971,0.303168,0.325131,0.238259,0.270650,0.257506,0.242109
FOODS_3_825_WI_3_evaluation,0.584675,0.495688,0.475791,0.457810,0.585183,0.603009,0.662225,0.355504,0.352453,0.403563,...,0.549319,0.580132,0.556582,0.425919,0.510824,0.472447,0.380331,0.461627,0.347011,0.350521
FOODS_3_826_WI_3_evaluation,0.972861,0.535943,0.467384,0.484817,0.616078,0.611548,0.595126,0.554166,0.511890,0.414800,...,0.608030,0.674792,0.650059,0.538438,0.654728,0.654766,0.541818,0.632016,0.602407,0.496478


## Combining valid and eval sets

In [22]:
# Move the id index into a regular column. Guarded so that re-running this cell
# is harmless: resetting an already-reset frame would add a spurious 'index'
# column and break the later assignment of ['id'] + F1..F28 column names.
if not isinstance(preds_df_eval.index, pd.RangeIndex):
    preds_df_eval = preds_df_eval.reset_index()
if not isinstance(preds_df_valid.index, pd.RangeIndex):
    preds_df_valid = preds_df_valid.reset_index()

In [28]:
# Submission day columns F1..F28, applied to both prediction frames after 'id'.
columnsnames = ['F{}'.format(day) for day in range(1, 29)]
print(columnsnames)
for frame in (preds_df_eval, preds_df_valid):
    frame.columns = ['id'] + columnsnames

['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28']


In [29]:
# Stack the validation rows on top of the evaluation rows (row labels are kept as-is).
preds_df_comb = pd.concat([preds_df_valid, preds_df_eval], axis=0, ignore_index=False)
preds_df_comb

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.968052,0.846389,0.818432,0.854639,1.052062,1.341891,1.473242,0.921521,0.939520,...,0.964595,1.410356,1.173984,0.911776,0.876774,0.841805,1.052188,1.231348,1.300107,1.145099
1,HOBBIES_1_002_CA_1_validation,0.204419,0.193445,0.151741,0.167550,0.202885,0.251356,0.248374,0.209721,0.207850,...,0.274754,0.342914,0.302466,0.206141,0.184683,0.174162,0.192442,0.173567,0.238722,0.264245
2,HOBBIES_1_003_CA_1_validation,0.435802,0.411070,0.474542,0.466519,0.649205,0.987985,0.673831,0.482883,0.526515,...,0.655281,0.763601,0.842098,0.551697,0.498877,0.506402,0.548348,0.771645,0.767158,0.721094
3,HOBBIES_1_004_CA_1_validation,1.495372,1.274237,1.389909,1.638269,2.270237,2.732851,2.968191,1.569027,1.213425,...,1.744539,2.416827,3.078948,1.792574,1.378132,1.424342,1.457471,2.068065,3.387903,3.642703
4,HOBBIES_1_005_CA_1_validation,0.957936,0.884289,0.840177,0.998208,1.150126,1.337973,1.703712,1.053154,1.112821,...,1.191730,1.748037,1.645003,1.012995,0.927642,0.979827,0.818266,1.085105,1.494029,1.443908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.474207,0.635851,0.625845,0.629134,0.702754,0.804479,0.864042,0.351876,0.400226,...,0.542415,0.596141,0.604872,0.365289,0.432198,0.514568,0.405698,0.480141,0.411126,0.378893
30486,FOODS_3_824_WI_3_evaluation,0.244012,0.293010,0.305919,0.254819,0.293989,0.333191,0.356492,0.199890,0.206377,...,0.304568,0.366428,0.361507,0.241971,0.303168,0.325131,0.238259,0.270650,0.257506,0.242109
30487,FOODS_3_825_WI_3_evaluation,0.584675,0.495688,0.475791,0.457810,0.585183,0.603009,0.662225,0.355504,0.352453,...,0.549319,0.580132,0.556582,0.425919,0.510824,0.472447,0.380331,0.461627,0.347011,0.350521
30488,FOODS_3_826_WI_3_evaluation,0.972861,0.535943,0.467384,0.484817,0.616078,0.611548,0.595126,0.554166,0.511890,...,0.608030,0.674792,0.650059,0.538438,0.654728,0.654766,0.541818,0.632016,0.602407,0.496478


### Create Final Submission File

In [31]:
ORIGINAL = ''   # Path prefix of the competition files ('' means the working directory)

# Read the ids of the competition sample submission and attach our predictions.
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(preds_df_comb, on=['id'], how='left')

# Ids without a prediction are still filled with 0, but say so instead of
# hiding it: an unexpected count here usually means an id mismatch upstream.
n_missing = int(submission.isna().any(axis=1).sum())
if n_missing > 0:
    print('WARNING:', n_missing, 'submission rows have missing predictions and are filled with 0')

submission = submission.fillna(0)
submission.to_csv('submission_v'+str(ver)+'.csv', index=False)