In [67]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
from datetime import datetime


In [68]:
def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    assert len(preds) == len(labels)
    labels = labels.tolist()
    preds = preds.tolist()
    terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1))
                    ** 2.0 for i,pred in enumerate(labels)]
    return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5

In [69]:
train_joined = pd.read_csv('../input/train_1000000_alld_totals.csv')
test_joined = pd.read_csv('../input/test_1000000_alld_totals.csv')

In [70]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
Unnamed: 0           100 non-null int64
Demanda_uni_equil    100 non-null int64
d1                   100 non-null float64
d2                   100 non-null float64
d3                   100 non-null float64
d4                   100 non-null float64
d5                   100 non-null float64
d6                   100 non-null float64
Cliente_ID           100 non-null int64
Producto_ID          100 non-null int64
p_total_demand       100 non-null int64
total_demand         100 non-null int64
dtypes: float64(6), int64(6)
memory usage: 9.4 KB


In [71]:
test.head()

Unnamed: 0.1,Unnamed: 0,Cliente_ID,Producto_ID,d1,d2,d3,d4,d5,d6,id,p_total_demand,total_demand
0,26217,4501148,32940,-999.0,2.0,2.0,0.0,0.0,2.0,6535198,1003938,1941
1,101342,568294,35484,0.0,0.0,0.0,0.0,0.0,0.0,5639274,138984,240
2,134331,797017,36610,40.0,40.0,40.0,40.0,40.0,40.0,862981,12680243,1445
3,186575,3406482,1232,-999.0,4.0,4.0,2.0,4.0,3.0,1837500,4310659,920
4,235358,1494636,35727,-999.0,4.0,4.0,4.0,2.0,4.0,6938729,707664,5623


In [72]:
train_joined = train_joined.drop(['Unnamed: 0'],axis = 1)
test_joined = test_joined.drop(['Unnamed: 0'],axis = 1)

train_joined[['Cliente_ID', 'Producto_ID', 'Demanda_uni_equil', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6']] =train_joined[['Cliente_ID', 'Producto_ID', 'Demanda_uni_equil', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6']].astype('int32')
test_joined[['Cliente_ID', 'Producto_ID', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6']]=test_joined[['Cliente_ID', 'Producto_ID', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6']].astype('int32')



In [73]:
ids = test_joined['id']
test_joined = test_joined.drop(['id'],axis = 1)

In [74]:
y = train_joined['Demanda_uni_equil']
X = train_joined[test_joined.columns.values]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)

print ('Division_Set_Shapes:', X.shape, y.shape)
print ('Validation_Set_Shapes:', X_train.shape, X_test.shape)

('Division_Set_Shapes:', (1000000, 10), (1000000,))
('Validation_Set_Shapes:', (800000, 10), (200000, 10))


In [75]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 470026 to 1677
Data columns (total 10 columns):
Cliente_ID        800000 non-null int32
Producto_ID       800000 non-null int32
d1                800000 non-null int32
d2                800000 non-null int32
d3                800000 non-null int32
d4                800000 non-null int32
d5                800000 non-null int32
d6                800000 non-null int32
p_total_demand    800000 non-null int64
total_demand      800000 non-null int64
dtypes: int32(8), int64(2)
memory usage: 42.7 MB


In [76]:
params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

In [77]:
test_preds = np.zeros(test_joined.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train, missing = -999)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train_joined')]
num_rounds = 200

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror,
                         early_stopping_rounds= 20, verbose_eval = 10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print ('RMSLE Score:', rmsle(y_test, preds))

Will train until train_joined error hasn't decreased in 20 rounds.
[0]	train_joined-error:1.352126
[10]	train_joined-error:0.878447
[20]	train_joined-error:0.721096
[30]	train_joined-error:0.661288
[40]	train_joined-error:0.644000
[50]	train_joined-error:0.638841
[60]	train_joined-error:0.640212
Stopping. Best iteration:
[47]	train_joined-error:0.638090



('RMSLE Score:', 0.64997530979127027)


In [78]:
fxg_test = xgb.DMatrix(test_joined)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1)
test_preds += fold_preds

In [79]:
submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})

In [80]:
submission.Demanda_uni_equil = submission.Demanda_uni_equil.round()

In [81]:
submission[["id","Demanda_uni_equil"]].to_csv('../submissions/ALL' +
                                              datetime.now().strftime('%Y-%m-%d-%H-%M-%S') +'.csv', index=False)

print ('done')

done
