In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
import os

# Work from the competition data directory so the relative CSV paths used below resolve.
# Set the M5_DATA_DIR environment variable to point somewhere else; the original
# author's local path is kept as the default.
DATA_DIR = os.environ.get('M5_DATA_DIR', 'D:\\kaggle_competitions\\M5 Competition')
os.chdir(DATA_DIR)

In [2]:
# dtypes handed to pd.read_csv for calendar.csv; the "category" columns are
# integer-encoded afterwards in create_dt.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    "wm_yr_wk": "int16",
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    "snap_TX": "float32",
    "snap_WI": "float32",
}

# dtypes handed to pd.read_csv for sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [3]:
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

In [4]:
# Presumably the forecast horizon in days (the prediction cell forecasts 28 days, F1..F28).
h = 28
# Days of history kept before a forecast day when the test features are rebuilt.
max_lags = 57
# Number of the last d_* column read from sales_train_evaluation.csv.
tr_last = 1941
# First calendar date that is forecast.
fday = datetime(year=2016, month=5, day=23)


In [5]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Load the M5 CSV files and return one long table with a row per (series, day).

    Reads ``sell_prices.csv``, ``calendar.csv`` and ``sales_train_evaluation.csv``
    from the current working directory, integer-encodes every categorical
    column, melts the wide ``d_*`` sales columns into ``d`` / ``sales`` and
    joins the calendar and price columns.

    Args:
        is_train: When True, load days ``first_day`` .. ``tr_last``. When False,
            load only the days from ``tr_last - max_lags`` (or ``first_day`` if
            that is later) and append ``h`` future days whose ``sales`` are NaN
            placeholders to be filled with predictions.
        nrows: Optional number of rows (series) to read from the sales file.
        first_day: Earliest ``d_`` number to load.

    Returns:
        pandas.DataFrame holding the id / category columns, ``d``, ``sales``,
        the calendar columns and ``sell_price``. Both merges use pandas'
        default inner join, so rows without a matching calendar or price entry
        are dropped.
    """
    prices = pd.read_csv("sell_prices.csv", dtype = PRICE_DTYPES)
    # Remember the price file's category labels: the sales file must be encoded
    # with exactly the same label -> code mapping, because store_id / item_id
    # are used as integer merge keys further down.
    price_categories = {}
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            price_categories[col] = prices[col].cat.categories
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()

    cal = pd.read_csv("calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            # Missing values get code -1; shift so that the smallest code is 0.
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("sales_train_evaluation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    for col in catcols:
        if col == "id":
            continue
        if col in price_categories:
            # Merge key: reuse the price file's categories. Encoding from the
            # labels present in this (possibly nrows-truncated) file would give
            # different codes and join the wrong prices.
            dt[col] = dt[col].cat.set_categories(price_categories[col])
            dt[col] = dt[col].cat.codes.astype("int16")
        else:
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the h days to forecast.
        for day in range(tr_last+1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [6]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    New columns:
        * ``lag_7`` / ``lag_28``: ``sales`` shifted by 7 / 28 rows within each ``id``.
        * ``rmean_{lag}_{win}``: rolling mean over ``win`` rows (7 and 28) of
          each lag column, again within each ``id``.
        * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
          int16 when the column already exists, otherwise derived from ``date``.

    Args:
        dt: DataFrame with at least ``id``, ``sales`` and a datetime ``date``
            column. NOTE(review): the shifts are row-based, so rows are assumed
            to be ordered by date within each id - confirm against the caller.

    Returns:
        None. ``dt`` is modified in place.
    """
    lags = [7,28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7,28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    # Feature name -> attribute of the ``.dt`` accessor used when the column is absent.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # ``Series.dt.weekofyear`` no longer exists in pandas 2.x;
            # ``isocalendar().week`` yields the same ISO week number.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [7]:
# First d_* day passed to create_dt. 1 loads the whole history, which carries a
# high risk of running out of memory. The original note also says "set to 50";
# presumably a larger value was used to load less data - TODO confirm.
FIRST_DAY = 1

In [8]:
%%time

# Build the long-format training table, starting at day FIRST_DAY.
df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

Wall time: 39 s


(46881677, 22)

In [9]:
%%time

# Adds the lag, rolling-mean and date feature columns to df in place.
create_fea(df)
df.shape

Wall time: 2min 53s


(46881677, 31)

In [10]:
# Preview the first rows; the lag / rolling columns are NaN for the earliest days of a series.
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_1,12.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,4,1,29
1,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_2,15.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,4,1,30
2,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_3,0.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,5,1,31
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_4,0.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,0.46,,,,,,,5,1,1
4,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_5,0.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,0.46,,,,,,,5,1,2


In [11]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype covering their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched. Integer columns become the smallest of int8/int16/int32/int64
    whose limits contain the column's min and max (limits inclusive). Float
    columns become float16 if their range fits, else float32, else float64.

    Note: the float choice looks only at the value range, not at precision, so
    float16 columns keep roughly three significant decimal digits.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When True, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (also reached when min/max are NaN).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [12]:
# Downcast the numeric columns of df to smaller dtypes to reduce memory use.
df=reduce_mem_usage(df)

Mem. usage decreased to 3308.53 Mb (32.7% reduction)


In [13]:
# Drop every row that has a NaN in any column (e.g. lag / rolling features
# without enough history), then report the remaining shape.
df.dropna(inplace = True)
print(df.shape)

(45204727, 31)


In [14]:
# Keep only rows whose day label is d_150 or later; the day number is parsed
# from the 'd' column without storing a temporary column on the frame.
df = df[df['d'].str.replace('d_','').astype(int) >= 150]

In [15]:
# Preview the first rows that remain after the day filter.
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
2052822,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_150,1.0,2011-06-27,11122,1,3,6,2011,0,0,0,0,0.0,0.0,0.0,4.339844,0.0,2.0,0.714355,1.0,0.893066,1.107422,26,2,27
2052823,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_151,0.0,2011-06-28,11122,5,4,6,2011,0,0,0,0,0.0,0.0,0.0,4.339844,1.0,0.0,0.856934,1.0,0.928711,1.107422,26,2,28
2052824,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_152,0.0,2011-06-29,11122,6,5,6,2011,0,0,0,0,0.0,0.0,0.0,4.339844,0.0,0.0,0.856934,1.0,0.928711,1.107422,26,2,29
2052825,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_153,1.0,2011-06-30,11122,4,6,6,2011,0,0,0,0,0.0,0.0,0.0,4.339844,2.0,0.0,1.142578,0.714355,0.928711,1.071289,26,2,30
2052826,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_154,3.0,2011-07-01,11122,0,7,7,2011,0,0,0,0,1.0,1.0,0.0,4.339844,0.0,2.0,0.856934,0.856934,0.893066,1.142578,26,3,1


In [16]:
# Columns handed to LightGBM as categorical features: the series descriptors
# plus the calendar event columns.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# 'd' is not excluded at this point: the next cell reads X_train['d'] to pick
# the validation rows.
useless_cols = ["id", "date", "sales", "wm_yr_wk", "weekday"]
keep_mask = ~df.columns.isin(useless_cols)
train_cols = df.columns[keep_mask]
X_train, y_train = df[train_cols], df["sales"]

In [17]:
# Hold out the rows of days d_1914 .. d_1941 (28 days) as the validation set.
# An earlier variant held out d_1885 .. d_1913 instead.
# Exact label matching replaces the original 28-alternative regex substring
# search and its stray ", True" index, and selects the same rows.
valid_days = [f"d_{day}" for day in range(1914, 1942)]
fake_valid_inds = X_train.index.values[X_train['d'].isin(valid_days).values]



In [18]:
# Rebuild the feature matrix, this time also excluding the day label 'd'.
useless_cols = ["id", "date", "sales",'d', "wm_yr_wk", "weekday"]
is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]
X_train, y_train = df[train_cols], df["sales"]

In [19]:
%%time

np.random.seed(786)


def _as_lgb_dataset(row_labels):
    """Wrap the X_train / y_train rows selected by index label in an lgb.Dataset."""
    return lgb.Dataset(X_train.loc[row_labels], label = y_train.loc[row_labels],
                       categorical_feature=cat_feats, free_raw_data=False)


# Every row that is not held out for validation is used for fitting.
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = _as_lgb_dataset(train_inds)
# The validation rows are the ones listed in fake_valid_inds, which were picked
# by day label rather than sampled at random.
fake_valid_data = _as_lgb_dataset(fake_valid_inds)

Wall time: 20.8 s


In [21]:
# Number of rows held out for validation.
len(fake_valid_inds)

853720

In [22]:
# Drop the names of the large intermediates and run a garbage-collection pass;
# train_data and fake_valid_data stay defined.
del df, X_train, y_train, fake_valid_inds, train_inds
gc.collect()

2760

In [23]:
# Show the columns that are passed to LightGBM as categorical features.
cat_feats

['item_id',
 'dept_id',
 'store_id',
 'cat_id',
 'state_id',
 'event_name_1',
 'event_name_2',
 'event_type_1',
 'event_type_2']

In [None]:
# Reference hyper-parameters kept as an inert string literal (nothing here is
# executed). The original cell closed the string with four quotes, which is a
# SyntaxError.
'''params={
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.0412,#riginal=0.0409 ###this parameter result in 0.494  ###this 0.410 reult in 0.494
                    'num_leaves': 2**14-1,
                    'min_data_in_leaf': 2**15-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100, ###100
                    'n_estimators': 1800,
                    'boost_from_average': False,
                    'verbose': 1,
                } '''


In [25]:
#m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=200) 

In [None]:
#import matplotlib.pyplot as plt


In [None]:
#lgb.plot_importance(m_lgb)

#plt.show()

In [None]:
#m_lgb.save_model("model_poisson.lgb")

In [29]:
def _tweedie_params(learning_rate):
    """Return a fresh LightGBM parameter dict that differs only in its learning rate."""
    return {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': learning_rate,
        'num_leaves': 2**14-1,
        'min_data_in_leaf': 2**15-1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': 1800,
        'boost_from_average': False,
        'verbose': 1,
    }


# Four learning rates, 0.0404 .. 0.0410 in steps of 0.0002. np.arange with a
# float step has a rounding-dependent length (the recorded run happened to
# include the stop value), so the grid is pinned with an explicit point count.
only_lr = np.linspace(0.0404, 0.0410, num=4).round(4).tolist()
print(only_lr)
params_list = [_tweedie_params(round(lr, 4)) for lr in only_lr]

[0.0404, 0.0406, 0.040799999999999996, 0.040999999999999995]


In [30]:
%%time
# Train one booster per parameter set; each is kept in memory and saved to disk.
lgb_champ = []
for i, model_params in enumerate(params_list):
    print(i)
    m_lgb = lgb.train(
        model_params,
        train_data,
        valid_sets=[fake_valid_data],
        verbose_eval=200,
        early_stopping_rounds=200,
    )
    lgb_champ.append(m_lgb)
    m_lgb.save_model(f"lgb_lr_final_ensemble_evaluation_{i}.lgb")

0




Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 2.05614
[400]	valid_0's rmse: 2.04152
[600]	valid_0's rmse: 2.0363
[800]	valid_0's rmse: 2.03315
[1000]	valid_0's rmse: 2.03023
[1200]	valid_0's rmse: 2.02791
[1400]	valid_0's rmse: 2.02704
[1600]	valid_0's rmse: 2.02622
[1800]	valid_0's rmse: 2.02558
Did not meet early stopping. Best iteration is:
[1798]	valid_0's rmse: 2.02556
1
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 2.05457
[400]	valid_0's rmse: 2.04005
[600]	valid_0's rmse: 2.03489
[800]	valid_0's rmse: 2.03164
[1000]	valid_0's rmse: 2.02849
[1200]	valid_0's rmse: 2.02603
[1400]	valid_0's rmse: 2.0247
[1600]	valid_0's rmse: 2.02388
[1800]	valid_0's rmse: 2.02342
Did not meet early stopping. Best iteration is:
[1797]	valid_0's rmse: 2.0234
2
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 2.0549
[400]	valid_0's rmse: 2.04158
[600]	valid_0's rmse: 2.03595
[800]	valid_0's 

In [None]:
%%time
# Forecast 28 days recursively with every trained model and write one
# submission file per model (submission_final_evaluation_{j}.csv).
alphas=[1]
weights = [1/len(alphas)]*len(alphas)
sub = 0.
cols = [f"F{i}" for i in range(1,29)]
for j, model in enumerate(lgb_champ):
    for icount, (alpha,weight) in enumerate(zip(alphas,weights)):
        # A fresh test frame per run: the loop below overwrites its "sales" column.
        te=create_dt(False)
        for tdelta in range(0, 28):
            day = fday + timedelta(days=tdelta)
            # Only the last max_lags days are needed to rebuild the features for `day`.
            tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
            create_fea(tst)
            tst = tst.loc[tst.date == day , train_cols]
            # Store the prediction so that later days can use it as lagged sales.
            te.loc[te.date == day, "sales"] = alpha*model.predict(tst) # magic multiplier by kyakovlev

        # Reshape the forecasts to one row per id with columns F1..F28.
        te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
        te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
        te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
        te_sub.fillna(0., inplace = True)
        te_sub.sort_values("id", inplace = True)
        te_sub.reset_index(drop=True, inplace = True)
        te_sub.to_csv(f"submission_{icount}.csv",index=False)
        if icount == 0 :
            sub = te_sub
            sub[cols] *= weight
        else:
            sub[cols] += te_sub[cols]*weight
        print(icount, alpha, weight)
    # Duplicate the rows under the "_validation" ids. "evaluation$" is a regular
    # expression, so regex=True is required: pandas >= 2.0 defaults to a literal
    # match, which would leave the ids unchanged.
    sub2 = sub.copy()
    sub2["id"] = sub2["id"].str.replace("evaluation$", "validation", regex=True)
    sub = pd.concat([sub, sub2], axis=0, sort=False)
    print(j)
    sub.to_csv(f"submission_final_evaluation_{j}.csv",index=False)


0 1 1.0
0
0 1 1.0
1
0 1 1.0
2


In [None]:
#%%time

# Disabled single-model variant of the prediction cell, kept as an inert string
# literal (nothing below is executed). It predicts with m_lgb, copies the rows
# with ids renamed from "validation" to "evaluation", and writes ensemble_1.csv.
'''alphas=[1]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev



    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    #te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("ensemble_1.csv",index=False)'''

In [None]:
# Preview the first rows of the last submission frame built by the prediction cell.
sub.head()