# Librerías

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os
import seaborn as sns

%matplotlib inline

# Datos

In [2]:
# Load the preprocessed train and test splits from the shared data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/prepro_train.csv', '../data/prepro_test.csv')
)

In [3]:
# Preview the first rows of the training frame to check the loaded columns.
train.head()

Unnamed: 0,date_number,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position_max,position_min,std_position,category_id,price,sales
0,0,310130,1726,1383,34.811328,679611953,533441312,7,1,80.0,3.0,37.194534,3,12.95,33
1,0,1178388,592,60,2.160247,732697347,691762817,4,1,19.0,19.0,0.0,1,49.95,0
2,0,1561460,1625,2373,55.438769,396066037,520569701,5,1,38.0,3.0,14.808299,3,29.95,21
3,0,1874414,1135,1686,20.463906,744793598,811402796,6,1,190.0,12.0,59.873199,6,25.95,24
4,0,2436420,779,245,23.377339,768025921,665805124,5,1,,,,0,25.95,0


In [28]:
# Preview the first rows of the test frame; in the rows shown, `sales` holds
# the placeholder value -1.
test.head()

Unnamed: 0,date_number,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position_max,position_min,std_position,category_id,price,sales
0,71,151926,1969,636,33.103206,396066037,335531561,5,1,22.0,5.0,8.959787,2,25.95,-1
1,71,213413,1648,1190,36.225788,552529755,11509337,7,1,139.0,57.0,42.547711,2,19.95,-1
2,71,310130,1726,442,52.80918,679611953,533441312,7,1,46.0,46.0,0.0,1,12.95,-1
3,71,455200,1400,86,6.831301,998145072,490222156,3,1,53.0,36.0,9.311283,2,29.95,-1
4,71,571044,1098,416,42.178063,831347344,750943270,4,2,205.0,154.0,26.462601,2,15.95,-1


In [38]:
# Interaction feature: element-wise product of the size and colour codes,
# added to both splits under the same column name.
for frame in (train, test):
    frame['size_p_color'] = frame['size_id'].mul(frame['color_id'])

In [39]:
# Express std_position as a fraction of the largest value found in either
# split, so train and test share one scale. Rows with a missing std_position
# stay NaN.
std_position_scale = max(train['std_position'].max(), test['std_position'].max())
train['ratio_std_pos'] = train['std_position'] / std_position_scale
test['ratio_std_pos'] = test['std_position'] / std_position_scale

In [41]:
# Express std_stock as a fraction of the largest value found in either split,
# so train and test share one scale.
std_stock_scale = max(train['std_stock'].max(), test['std_stock'].max())
train['ratio_std_stock'] = train['std_stock'] / std_stock_scale
test['ratio_std_stock'] = test['std_stock'] / std_stock_scale

In [78]:
# Lookup table mapping every date_number to its position in a 7-day cycle.
# date_number 0 is assigned day 0 and the cycle wraps every 7 dates.
# NOTE(review): which calendar weekday "day 0" corresponds to is not shown
# here -- confirm against the raw dates if it matters.
N_DATES = 111      # date_number values covered by the table: 0 .. 110
CYCLE_LENGTH = 7   # length of the repeating cycle (one week)

d_n = list(range(N_DATES))
day = [date_n % CYCLE_LENGTH for date_n in d_n]

df_day = pd.DataFrame({'date_number': d_n, 'day': day})
df_day.head(14)

Unnamed: 0,date_number,day
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6
7,7,0
8,8,1
9,9,2


In [69]:
# Attach the cycle position (`day`) to every row by looking up its date_number
# in df_day. Any existing `day` column is dropped first so the cell can be
# re-run safely: joining twice would otherwise fail on the overlapping column.
# A date_number missing from df_day yields NaN because the join is a left join.
day_lookup = df_day.set_index('date_number')
train = train.drop(columns='day', errors='ignore').join(day_lookup, on='date_number', how='left')
test = test.drop(columns='day', errors='ignore').join(day_lookup, on='date_number', how='left')

In [70]:
# Preview the training frame again to confirm the engineered columns
# (size_p_color, ratio_std_pos, ratio_std_stock, day) were added.
train.head()

Unnamed: 0,date_number,product_id,block_id,stock,std_stock,family_id,subfamily_id,size_id,color_id,position_max,position_min,std_position,category_id,price,sales,size_p_color,ratio_std_pos,ratio_std_stock,day
0,0,310130,1726,1383,34.811328,679611953,533441312,7,1,80.0,3.0,37.194534,3,12.95,33,7,0.094515,0.053357,0
1,0,1178388,592,60,2.160247,732697347,691762817,4,1,19.0,19.0,0.0,1,49.95,0,4,0.0,0.003311,0
2,0,1561460,1625,2373,55.438769,396066037,520569701,5,1,38.0,3.0,14.808299,3,29.95,21,5,0.037629,0.084974,0
3,0,1874414,1135,1686,20.463906,744793598,811402796,6,1,190.0,12.0,59.873199,6,25.95,24,6,0.152144,0.031366,0
4,0,2436420,779,245,23.377339,768025921,665805124,5,1,,,,0,25.95,0,5,,0.035832,0


# Prueba de modelo

In [71]:
# Columns that must not be used as model features:
#   - 'sales' is the target;
#   - 'date_number', 'product_id' and 'block_id' are identifier-like columns;
#   - 'Unnamed: 0' is the row index that was saved into the CSV. It restarts
#     at 0 in the test file, so as a feature it would carry a different meaning
#     in train and test.
NON_FEATURE_COLS = ['sales', 'date_number', 'product_id', 'block_id', 'Unnamed: 0']

# errors='ignore' keeps the cell working if one of these columns is absent
# (e.g. the CSV was read with the index column already consumed).
X = train.drop(columns=NON_FEATURE_COLS, errors='ignore')

# Target kept as a one-column DataFrame, as the downstream cells expect.
y = train[['sales']]

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
# Random 80/20 hold-out split; the fixed seed makes it reproducible.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [74]:
from sklearn.model_selection import StratifiedKFold
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back to the vendored
# copy only on legacy installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [75]:
model_name = 'lgbm'

# LightGBM hyper-parameters for an L2-objective regressor evaluated with MAE.
# n_estimators is a deliberately high ceiling: the training cell passes
# early_stopping_rounds, so the real number of trees is decided there.
params = dict(
    max_depth=10,
    metric='mae',
    max_delta_step=0.2,
    n_estimators=50000,
    learning_rate=0.1,
    colsample_bytree=0.8,
    objective='regression',
    n_jobs=16,
    seed=42,
    lambda_l1=0,
    lambda_l2=0,
    max_bin=64,
)

lgb_model = lgb.LGBMRegressor(**params)

In [76]:
# Cross-validation setup: k shuffled folds with a fixed seed.
# NOTE(review): StratifiedKFold stratifies on the distinct values of the
# target; for a regression target such as sales a plain KFold may be the
# more natural choice -- confirm this is intentional.
k = 5
skf = StratifiedKFold(shuffle=True, n_splits=k, random_state=42)

# Index labels of the training rows; only used to drive skf.split below.
train_ids = X_train.index

In [77]:
# K-fold cross-validation of the LightGBM regressor on a log1p-transformed
# target. All reported errors are in sales units: predictions are mapped back
# with expm1 *before* the MAE is computed. Applying expm1 to an MAE that was
# measured in log space does not give an error in sales units.
be = 0  # running sum of the per-fold validation MAE (sales units)
for counter, (train_index, test_index) in enumerate(skf.split(train_ids, y_train), start=1):
    print('Fold k {}\n'.format(counter))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit and early-stop in log space; y_fit / y_val stay in sales units so
    # they can be compared with back-transformed predictions afterwards.
    # NOTE(review): recent LightGBM releases (>= 4.0) no longer accept
    # `verbose` / `early_stopping_rounds` in fit() and expect callbacks
    # instead -- adapt this call if the environment is upgraded.
    lgb_model.fit(X_fit,
                  np.log1p(y_fit),
                  eval_set=[(X_val, np.log1p(y_val))],
                  verbose=500,
                  early_stopping_rounds=20)

    # Validation error for this fold, measured on the original sales scale.
    val_pred = np.expm1(lgb_model.predict(X_val))
    be += mean_absolute_error(y_val, val_pred)

    # Hold-out error, also on the original sales scale.
    test_pred = np.expm1(lgb_model.predict(X_test))
    print('Score en el test:', mean_absolute_error(y_test, test_pred), 'ventas')


print('\n\nBEST SCORE MEAN:', be / k, 'SALES :)')

Fold k 1

Training until validation scores don't improve for 20 rounds.


KeyboardInterrupt: 