In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
from datetime import datetime


In [2]:
def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    assert len(preds) == len(labels)
    labels = labels.tolist()
    preds = preds.tolist()
    terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1))
                    ** 2.0 for i,pred in enumerate(labels)]
    return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5

In [3]:
train_joined = pd.read_csv('../input/train_alld_prodnames10.csv')
test_joined = pd.read_csv('../input/test_alld_prodnames10.csv')

In [4]:
train_joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74013023 entries, 0 to 74013022
Data columns (total 23 columns):
Demanda_uni_equil    int64
d1                   float64
d2                   float64
d3                   float64
d4                   float64
d5                   float64
d6                   float64
Cliente_ID           int64
Producto_ID          int64
p_total_demand       int64
total_demand         int64
weight               float64
pieces               float64
canelit              float64
delici chochit       float64
delici vainill       float64
gansit               float64
medi noch            float64
pan blanc            float64
pan integral         float64
pinguin              float64
tortillin            float64
wond                 float64
dtypes: float64(18), int64(5)
memory usage: 12.7 GB


In [5]:
train_joined.head()

Unnamed: 0,Demanda_uni_equil,d1,d2,d3,d4,d5,d6,Cliente_ID,Producto_ID,p_total_demand,...,canelit,delici chochit,delici vainill,gansit,medi noch,pan blanc,pan integral,pinguin,tortillin,wond
0,1,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,100000,43274,1639486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.0,-999.0,-999.0,-999.0,-999.0,-999.0,100000,43274,1639486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2.0,1.0,-999.0,-999.0,-999.0,-999.0,100000,43274,1639486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0.0,0.0,2.0,2.0,1.0,-999.0,100000,43274,1639486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,1000032,43274,1639486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
test_joined.head()

Unnamed: 0,Cliente_ID,Producto_ID,d1,d2,d3,d4,d5,d6,id,p_total_demand,...,canelit,delici chochit,delici vainill,gansit,medi noch,pan blanc,pan integral,pinguin,tortillin,wond
0,4549769,32940,2.0,2.0,0.0,0.0,0.0,0.0,2,1003938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1942590,32940,4.0,3.0,3.0,2.0,0.0,0.0,233,1003938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,460716,32940,0.0,0.0,0.0,0.0,0.0,0.0,616,1003938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2266767,32940,0.0,0.0,4.0,0.0,0.0,2.0,895,1003938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2266767,32940,-999.0,0.0,0.0,4.0,0.0,0.0,2551064,1003938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
train_joined = train_joined.drop(['Cliente_ID'],axis = 1).drop(['Producto_ID'],axis = 1)
test_joined = test_joined.drop(['Cliente_ID'],axis = 1).drop(['Producto_ID'],axis = 1)

train_joined[['Demanda_uni_equil', 'p_total_demand', 'total_demand']] =train_joined[['Demanda_uni_equil', 'p_total_demand', 'total_demand']].astype('float64')
test_joined[['p_total_demand', 'total_demand']]=test_joined[['p_total_demand', 'total_demand']].astype('float64')



In [8]:
ids = test_joined['id']
test_joined = test_joined.drop(['id'],axis = 1)

In [9]:
y = train_joined['Demanda_uni_equil']
X = train_joined[test_joined.columns.values]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)

print ('Division_Set_Shapes:', X.shape, y.shape)
print ('Validation_Set_Shapes:', X_train.shape, X_test.shape)

('Division_Set_Shapes:', (74013023, 20), (74013023,))
('Validation_Set_Shapes:', (59210418, 20), (14802605, 20))


In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59210418 entries, 28817307 to 38733043
Data columns (total 20 columns):
d1                float64
d2                float64
d3                float64
d4                float64
d5                float64
d6                float64
p_total_demand    float64
total_demand      float64
weight            float64
pieces            float64
canelit           float64
delici chochit    float64
delici vainill    float64
gansit            float64
medi noch         float64
pan blanc         float64
pan integral      float64
pinguin           float64
tortillin         float64
wond              float64
dtypes: float64(20)
memory usage: 9.3 GB


In [11]:
params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

In [12]:
test_preds = np.zeros(test_joined.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train, missing = -999)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train_joined')]
num_rounds = 200

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror,
                         early_stopping_rounds= 20, verbose_eval = 10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print ('RMSLE Score:', rmsle(y_test, preds))

[0]	train_joined-error:1.35077
Will train until train_joined-error hasn't improved in 20 rounds.
[10]	train_joined-error:0.872309
[20]	train_joined-error:0.704571
[30]	train_joined-error:0.641633
[40]	train_joined-error:0.612959
[50]	train_joined-error:0.600205
[60]	train_joined-error:0.598027
[70]	train_joined-error:0.59968
Stopping. Best iteration:
[57]	train_joined-error:0.597979

('RMSLE Score:', nan)


In [13]:
fxg_test = xgb.DMatrix(test_joined)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1)
test_preds += fold_preds

In [14]:
submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})

In [15]:
submission.Demanda_uni_equil = submission.Demanda_uni_equil.round()

In [17]:
submission[["id","Demanda_uni_equil"]].to_csv('../submissions/ALL' +
                                              datetime.now().strftime('%Y-%m-%d-%H-%M-%S') +'.csv', index=False)

print ('done')

done
