# Import the pickled train and test sets produced by preprocessing and build a LightGBM model. A gradient boosting tree algorithm is used to predict the probability that a product ordered in the past will be reordered. A prediction is made for each ('order_id', 'product_id') pair.

In [1]:
import gc
import os

import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import GridSearchCV # not used here
from sklearn.model_selection import train_test_split


In [2]:
# Directory holding the preprocessed CSV files. Set the INSTACART_DATA_DIR
# environment variable to point at your own copy of the data; the original
# location is kept as the fallback so existing runs are unaffected.
# os.path.join(..., '') guarantees a trailing separator, which is required
# because file names are built by plain concatenation (path + 'my_train.csv').
path = os.path.join(
    os.environ.get('INSTACART_DATA_DIR', '/home/crescio/Documents/kaggle/intacart/data/'),
    '',
)

In [3]:
print('loading files ...')
train = pd.read_csv(path + 'my_train.csv')
test = pd.read_csv(path + 'my_test.csv')

# On a slow CPU and/or with little RAM, set this to the fraction of training
# rows to keep (e.g. 0.4). None trains on the full training set.
train_sample_frac = None
if train_sample_frac is not None:
    # Only announce sampling when it actually happens.
    print('sampling train data ...')
    # Fixed seed so the subsample is the same on every run.
    train = train.sample(frac=train_sample_frac, random_state=7)

# Hold out 33% of the rows as a validation set (fixed seed for reproducibility).
# The features are every column except the 'reordered' target;
# columns.difference() returns them in sorted order.
X_train, X_eval, y_train, y_eval = train_test_split(
    train[train.columns.difference(['reordered'])],
    train['reordered'],
    test_size=0.33,
    random_state=7,
)

# memory cleaning: the full frame is no longer needed once it has been split
del train
gc.collect()

loading files ...
sampling train data ...


48

In [4]:
# These parameters have not been tuned; a better configuration can likely be found.

print('formatting to LightGBM format ...')

# Wrap both splits in LightGBM datasets; the validation dataset is created
# with the training dataset as its reference.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_eval, label=y_eval, reference=lgb_train)

# Training configuration.
# NOTE(review): LightGBM documents 'num_iterations' as an alias of the
# num_boost_round argument of lgb.train(); the value here presumably takes
# precedence over the one passed in the training cell - confirm and keep one.
params = dict(
    task='train',
    boosting_type='gbdt',                  # Gradient boosting tree algorithm
    objective='binary',
    metric={'binary_logloss', 'auc'},
    num_iterations=1000,
    max_bin=100,                           # Controls overfit
    num_leaves=512,                        # higher number of leaves
    feature_fraction=0.9,                  # Controls overfit
    bagging_fraction=0.95,
    bagging_freq=5,
    min_data_in_leaf=200,                  # Controls overfit
    learning_rate=0.1,
    verbose=0,
)


formatting to LightGBM format ...


In [5]:
print('training LightGBM model ...')
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword of
# lgb.train() was removed in LightGBM 4.0, while the callback form is accepted
# by older releases as well.
# NOTE(review): `params` also sets 'num_iterations', which LightGBM documents as
# an alias of num_boost_round; it presumably overrides the 200 given below -
# confirm and keep only one of the two.
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=200,
    valid_sets=lgb_eval,                   # validation set monitored during training
    # stop boosting once the evaluation metrics have not improved for 10 rounds
    callbacks=[lgb.early_stopping(10)],
)

# free the training data; later cells only use the fitted booster and `test`
del lgb_train, X_train, y_train
gc.collect()

# Result recorded from an earlier run:
#[77]	valid_0's binary_logloss: 0.245244	valid_0's auc: 0.834128

training LightGBM model ...
[1]	valid_0's auc: 0.825606	valid_0's binary_logloss: 0.625211
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's auc: 0.827529	valid_0's binary_logloss: 0.569708
[3]	valid_0's auc: 0.82811	valid_0's binary_logloss: 0.523835
[4]	valid_0's auc: 0.828574	valid_0's binary_logloss: 0.485162
[5]	valid_0's auc: 0.829058	valid_0's binary_logloss: 0.452406
[6]	valid_0's auc: 0.829297	valid_0's binary_logloss: 0.424538
[7]	valid_0's auc: 0.829654	valid_0's binary_logloss: 0.400667
[8]	valid_0's auc: 0.829895	valid_0's binary_logloss: 0.380132
[9]	valid_0's auc: 0.830126	valid_0's binary_logloss: 0.362417
[10]	valid_0's auc: 0.830294	valid_0's binary_logloss: 0.347132
[11]	valid_0's auc: 0.830453	valid_0's binary_logloss: 0.33387
[12]	valid_0's auc: 0.830668	valid_0's binary_logloss: 0.322337
[13]	valid_0's auc: 0.830808	valid_0's binary_logloss: 0.312332
[14]	valid_0's auc: 0.83093	valid_0's binary_logloss: 0.303635
[15]	valid_0's auc: 0.831071	valid

[128]	valid_0's auc: 0.836418	valid_0's binary_logloss: 0.243839
[129]	valid_0's auc: 0.836423	valid_0's binary_logloss: 0.243838
[130]	valid_0's auc: 0.836416	valid_0's binary_logloss: 0.243843
[131]	valid_0's auc: 0.836419	valid_0's binary_logloss: 0.243842
[132]	valid_0's auc: 0.836417	valid_0's binary_logloss: 0.243844
[133]	valid_0's auc: 0.836416	valid_0's binary_logloss: 0.243845
[134]	valid_0's auc: 0.836454	valid_0's binary_logloss: 0.243816
[135]	valid_0's auc: 0.836454	valid_0's binary_logloss: 0.243817
[136]	valid_0's auc: 0.836473	valid_0's binary_logloss: 0.243804
[137]	valid_0's auc: 0.836474	valid_0's binary_logloss: 0.243804
[138]	valid_0's auc: 0.836474	valid_0's binary_logloss: 0.243806
[139]	valid_0's auc: 0.836469	valid_0's binary_logloss: 0.243809
[140]	valid_0's auc: 0.836467	valid_0's binary_logloss: 0.243811
[141]	valid_0's auc: 0.836466	valid_0's binary_logloss: 0.243814
[142]	valid_0's auc: 0.836502	valid_0's binary_logloss: 0.24379
[143]	valid_0's auc: 0.836

39

In [6]:
print('Save model...')
# Write the trained booster to 'model.txt' (relative to the working directory)
lgb_model.save_model('model.txt')

# To reuse the saved model later, uncomment the following lines.
# NOTE(review): X_test is not defined in the cells shown here - substitute the
# feature columns of `test` before running.
# print('Load model to predict')
# bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
# y_pred = bst.predict(X_test)

Save model...


In [7]:
print('applying model to test data ...')
# Feature columns are everything except the identifiers and the prediction
# column itself. Excluding 'reordered' keeps this cell re-runnable: after the
# first run `test` already contains 'reordered', and it must not be fed back
# to the model as an extra feature.
feature_columns = test.columns.difference(['order_id', 'product_id', 'reordered'])
# Predict with the iteration selected by early stopping.
test['reordered'] = lgb_model.predict(test[feature_columns], num_iteration=lgb_model.best_iteration)

applying model to test data ...


In [8]:
# Preview the first rows only instead of rendering the whole test frame
test.head(10)


Unnamed: 0,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_times,prod_reorder_ratio,user_orders,...,user_total_products,user_reorder_ratio,user_distinct_products,user_average_basket,order_id,days_since_prior_order,up_order_rate,up_orders_since_last_order,up_order_rate_since_first_order,reordered
0,196,5,15,22,2.200000,35791,0.582500,4.473875,0.776480,22,...,72,0.867647,13,3.272727,2161313,7.0,0.227273,0,0.625000,0.562956
1,12427,10,1,20,2.100000,6476,0.529482,3.857058,0.740735,22,...,72,0.867647,13,3.272727,2161313,7.0,0.454545,2,0.454545,0.306690
2,1747,4,8,19,3.500000,1448,0.393238,2.576512,0.611878,22,...,72,0.867647,13,3.272727,2161313,7.0,0.181818,3,0.266667,0.166407
3,10441,8,1,22,2.375000,2909,0.536332,3.355248,0.701959,22,...,72,0.867647,13,3.272727,2161313,7.0,0.363636,0,0.363636,0.382258
4,11266,10,1,19,1.600000,4081,0.596670,3.775208,0.735114,22,...,72,0.867647,13,3.272727,2161313,7.0,0.454545,3,0.454545,0.215481
5,14715,11,1,20,2.454545,8848,0.451520,2.831360,0.646813,22,...,72,0.867647,13,3.272727,2161313,7.0,0.500000,2,0.500000,0.324668
6,27839,8,2,20,3.125000,1597,0.499127,2.787086,0.641202,22,...,72,0.867647,13,3.272727,2161313,7.0,0.363636,2,0.380952,0.258136
7,30292,1,5,5,3.000000,186,0.352459,1.524590,0.344086,22,...,72,0.867647,13,3.272727,2161313,7.0,0.045455,17,0.055556,0.010050
8,37059,1,6,6,3.000000,562,0.223587,1.380835,0.275801,22,...,72,0.867647,13,3.272727,2161313,7.0,0.045455,16,0.058824,0.016471
9,37710,9,2,19,2.111111,12068,0.608762,4.597333,0.782483,22,...,72,0.867647,13,3.272727,2161313,7.0,0.409091,3,0.428571,0.258303


In [9]:
# Names of the features the booster was trained on
feature_names = lgb_model.feature_name()
print('Feature names:', feature_names)

print('Calculate feature importances...')
# Importance scores, listed in the same order as the feature names above
importance_values = list(lgb_model.feature_importance())
print('Feature importances:', importance_values)

Feature names: ['days_since_prior_order', 'prod_orders', 'prod_reorder_probability', 'prod_reorder_ratio', 'prod_reorder_times', 'up_average_cart_position', 'up_first_order', 'up_last_order', 'up_order_rate', 'up_order_rate_since_first_order', 'up_orders', 'up_orders_since_last_order', 'user_average_basket', 'user_distinct_products', 'user_mean_days_since_prior', 'user_orders', 'user_period', 'user_reorder_ratio', 'user_total_products']
Calculate feature importances...
Feature importances: [6263, 6266, 5007, 5132, 602, 4594, 2894, 2700, 3437, 3538, 1102, 3942, 6344, 5883, 6300, 1813, 5879, 8365, 4254]


In [10]:
print('formatting and writing a submission file ...')

# Probability cutoff above which a product is labelled as reordered
# (can be tweaked with cross validation).
reorder_cutoff = 0.21

# Collect the predicted products of each order. Appending to a list and joining
# once avoids repeated string concatenation, and setdefault() replaces the bare
# `except:` that could silently hide unrelated errors.
products_by_order = {}
for row in test.itertuples():
    if row.reordered > reorder_cutoff:
        products_by_order.setdefault(row.order_id, []).append(str(row.product_id))

# order_id -> space-separated product ids, in order of first appearance
prd_bag = {order_id: ' '.join(products) for order_id, products in products_by_order.items()}

# Orders with no product above the cutoff are submitted as the literal 'None'
for order_id in test.order_id:
    prd_bag.setdefault(order_id, 'None')

# Build the two-column frame directly so the column names exist even when
# there are no orders at all.
submit = pd.DataFrame({
    'order_id': list(prd_bag.keys()),
    'products': list(prd_bag.values()),
})
submit.to_csv('Submit.csv', index=False)

formatting and writing to submission file ...


In [15]:
# Show a small sample of the order -> products mapping instead of dumping the whole dict
dict(list(prd_bag.items())[:20])


{2224210: '17330 17859 45511',
 993166: '17330 31154 30633 16826',
 2794524: '17330 11869 41131 6137 24053 47031 44799 22409 16275 38218 40795 20479',
 552080: '17330 31778',
 1831994: '17330 12013 8859 44042 46572 11344 20831',
 9391: '17330 38444 46822 17415',
 809032: '17330 10673 44799 5460 43183 16419 40218 13249 22129 26308 2034',
 124001: '17330 46088 4965 13914',
 195917: '17330 26209 47144 47890 5818 16237 5248',
 2446064: '17330 45432 14385 8475',
 2151485: '35419 30921',
 795800: '35419 27652 2966 41808 10524 44267 38529',
 84768: '35419 4447 27652 39646 48119 27979 21265 16566',
 1482092: '35419 46088 38444 47877 23763 15587',
 2469973: '35419',
 926692: '35419 2120 1160 20998 4605 37766 40348 34050 39921 2838 26689 18432 12473 35556 49226',
 3266544: '35419 23540',
 1809965: '35419 277',
 2111485: '35419 10673 8859 41808 25869',
 3138670: '35419 9076 6137 130 20463',
 3143404: '35419 1160 26209 21137 29487 36307 13176 44987 21333 1263 37766 41273 148 23574 7806 30151 36606

In [1]:
import pickle

In [None]:
# Save the submission frame as a pickle (it is read back in the next cell)
submit.to_pickle('prediction_lgbm.pkl')

In [3]:

# Read the pickled submission frame back from disk.
# NOTE: only unpickle files you created yourself - pickle can run arbitrary code.
with open('prediction_lgbm.pkl', mode='rb') as pickle_handle:
    submission = pickle.load(pickle_handle)