In [304]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
import scipy.sparse as sps
from scipy.sparse import coo_matrix, hstack, vstack, csr_matrix
from scipy import io
from datetime import datetime
import gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [305]:
def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    assert len(preds) == len(labels)
    labels = labels.tolist()
    preds = preds.tolist()
    terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1))
                    ** 2.0 for i,pred in enumerate(labels)]
    return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5

In [306]:
train = pd.read_csv('../input/train.csv', usecols=['Semana', 'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'])
test = pd.read_csv('../input/test.csv', usecols=['id', 'Semana', 'Cliente_ID', 'Producto_ID'])
train.tail()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil
999995,8,550065,43307,30
999996,8,2171180,43069,3
999997,8,2343367,43307,20
999998,8,39008,43316,2
999999,8,41116,43064,3


In [307]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
Semana               1000000 non-null int64
Cliente_ID           1000000 non-null int64
Producto_ID          1000000 non-null int64
Demanda_uni_equil    1000000 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [308]:
test.head()

Unnamed: 0,id,Semana,Cliente_ID,Producto_ID
0,2,10,4549769,32940
1,7,10,4414012,35305
2,29,11,1485041,41938
3,42,11,594640,43285
4,45,11,4257075,1238


In [309]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
id             1000000 non-null int64
Semana         1000000 non-null int64
Cliente_ID     1000000 non-null int64
Producto_ID    1000000 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [310]:
grouping = ['Semana', 'Cliente_ID', 'Producto_ID']
train_group = train.groupby(grouping,as_index=False)['Demanda_uni_equil'].sum()

In [311]:
train_group.tail()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil
998843,8,9788785,40447,16
998844,8,9808903,32393,14
998845,8,9887764,6469,6
998846,8,9892002,3270,5
998847,8,9991105,1700,47


In [312]:
train_group.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 998848 entries, 0 to 998847
Data columns (total 4 columns):
Semana               998848 non-null int64
Cliente_ID           998848 non-null int64
Producto_ID          998848 non-null int64
Demanda_uni_equil    998848 non-null int64
dtypes: int64(4)
memory usage: 38.1 MB


In [313]:
train_group.head()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil
0,3,26,4767,42
1,3,107,43064,0
2,3,107,48996,56
3,3,906,4085,6
4,3,2010,1109,1


In [314]:
train_cliprod = train_group
train_cliprod["cliprod"] = train_cliprod["Cliente_ID"].map(str) + ' ' + train_cliprod["Producto_ID"].map(str)

In [315]:
test_cliprod = test
test_cliprod["cliprod"] = test_cliprod["Cliente_ID"].map(str) + ' ' + test_cliprod["Producto_ID"].map(str)

In [316]:
train_cliprod.head()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil,cliprod
0,3,26,4767,42,26 4767
1,3,107,43064,0,107 43064
2,3,107,48996,56,107 48996
3,3,906,4085,6,906 4085
4,3,2010,1109,1,2010 1109


In [317]:
test_cliprod.head()

Unnamed: 0,id,Semana,Cliente_ID,Producto_ID,cliprod
0,2,10,4549769,32940,4549769 32940
1,7,10,4414012,35305,4414012 35305
2,29,11,1485041,41938,1485041 41938
3,42,11,594640,43285,594640 43285
4,45,11,4257075,1238,4257075 1238


In [318]:
grouping = ['Semana', 'cliprod']
train_cliprod = train_cliprod.groupby(grouping,as_index=False)['Demanda_uni_equil'].sum()


In [319]:
train_cliprod.head()

Unnamed: 0,Semana,cliprod,Demanda_uni_equil
0,3,1000006 1230,3
1,3,1000006 4767,2
2,3,1000015 43285,4
3,3,1000025 43069,2
4,3,100003 36748,2


In [320]:
train_cliprod.tail()

Unnamed: 0,Semana,cliprod,Demanda_uni_equil
998843,8,99992 31471,5
998844,8,99992 43147,7
998845,8,999927 1182,3
998846,8,999927 32819,3
998847,8,99996 41938,2


In [321]:
s3 = train_cliprod[train_cliprod.Semana == 3].drop('Semana', axis=1).set_index('cliprod')
s4 = train_cliprod[train_cliprod.Semana == 4].drop('Semana', axis=1).set_index('cliprod')
s5 = train_cliprod[train_cliprod.Semana == 5].drop('Semana', axis=1).set_index('cliprod')
s6 = train_cliprod[train_cliprod.Semana == 6].drop('Semana', axis=1).set_index('cliprod')
s7 = train_cliprod[train_cliprod.Semana == 7].drop('Semana', axis=1).set_index('cliprod')
s8 = train_cliprod[train_cliprod.Semana == 8].drop('Semana', axis=1).set_index('cliprod')
s9 = train_cliprod[train_cliprod.Semana == 9].drop('Semana', axis=1).set_index('cliprod')

t10 = test_cliprod[test_cliprod.Semana == 10].drop('Semana', axis=1).set_index('cliprod')
t11 = test_cliprod[test_cliprod.Semana == 11].drop('Semana', axis=1).set_index('cliprod')

In [322]:
s3.head()

Unnamed: 0_level_0,Demanda_uni_equil
cliprod,Unnamed: 1_level_1
1000006 1230,3
1000006 4767,2
1000015 43285,4
1000025 43069,2
100003 36748,2


In [323]:
t10.head()

Unnamed: 0_level_0,id,Cliente_ID,Producto_ID
cliprod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4549769 32940,2,4549769,32940
4414012 35305,7,4414012,35305
1695684 1220,109,1695684,1220
114659 1064,114,114659,1064
421275 41938,118,421275,41938


In [324]:
s6["d2"]=s4['Demanda_uni_equil']
s7["d2"]=s5['Demanda_uni_equil']
s8["d2"]=s6['Demanda_uni_equil']
s9["d2"]=s7['Demanda_uni_equil']

s6["d3"]=s3['Demanda_uni_equil']
s7["d3"]=s4['Demanda_uni_equil']
s8["d3"]=s5['Demanda_uni_equil']
s9["d3"]=s6['Demanda_uni_equil']

t10["d2"]=s8['Demanda_uni_equil']
t11["d2"]=s9['Demanda_uni_equil']

t10["d3"]=s7['Demanda_uni_equil']
t11["d3"]=s8['Demanda_uni_equil']

In [325]:
s6.head()

Unnamed: 0_level_0,Demanda_uni_equil,d2,d3
cliprod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000006 37005,25,,
1000042 1120,2,3.0,
1000052 1278,24,,
1000058 1278,5,,
100015 1240,7,,


In [326]:
t10.head()

Unnamed: 0_level_0,id,Cliente_ID,Producto_ID,d2,d3
cliprod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4549769 32940,2,4549769,32940,,
4414012 35305,7,4414012,35305,,
1695684 1220,109,1695684,1220,,
114659 1064,114,114659,1064,,
421275 41938,118,421275,41938,,


In [327]:
s6 = s6.reset_index()
s6.head()

Unnamed: 0,cliprod,Demanda_uni_equil,d2,d3
0,1000006 37005,25,,
1,1000042 1120,2,3.0,
2,1000052 1278,24,,
3,1000058 1278,5,,
4,100015 1240,7,,


In [328]:
s6['Cliente_ID'], s6['Producto_ID'] = zip(*s6['cliprod'].apply(lambda x: x.split(' ', 1)))

In [329]:
s6.head()

Unnamed: 0,cliprod,Demanda_uni_equil,d2,d3,Cliente_ID,Producto_ID
0,1000006 37005,25,,,1000006,37005
1,1000042 1120,2,3.0,,1000042,1120
2,1000052 1278,24,,,1000052,1278
3,1000058 1278,5,,,1000058,1278
4,100015 1240,7,,,100015,1240


In [330]:
s6 = s6.drop(['cliprod'], axis=1)
s6.head()

Unnamed: 0,Demanda_uni_equil,d2,d3,Cliente_ID,Producto_ID
0,25,,,1000006,37005
1,2,3.0,,1000042,1120
2,24,,,1000052,1278
3,5,,,1000058,1278
4,7,,,100015,1240


In [331]:
s7 = s7.reset_index()
s7['Cliente_ID'], s7['Producto_ID'] = zip(*s7['cliprod'].apply(lambda x: x.split(' ', 1)))
s7 = s7.drop(['cliprod'], axis=1)

s8 = s8.reset_index()
s8['Cliente_ID'], s8['Producto_ID'] = zip(*s8['cliprod'].apply(lambda x: x.split(' ', 1)))
s8 = s8.drop(['cliprod'], axis=1)

s9 = s9.reset_index()
s9['Cliente_ID'], s9['Producto_ID'] = zip(*s9['cliprod'].apply(lambda x: x.split(' ', 1)))
s9 = s9.drop(['cliprod'], axis=1)


t10 = t10.reset_index()
t10['Cliente_ID'], t10['Producto_ID'] = zip(*t10['cliprod'].apply(lambda x: x.split(' ', 1)))
t10 = t10.drop(['cliprod'], axis=1)

t11 = t11.reset_index()
t11['Cliente_ID'], t11['Producto_ID'] = zip(*t11['cliprod'].apply(lambda x: x.split(' ', 1)))
t11 = t11.drop(['cliprod'], axis=1)

In [332]:
t10.head()

Unnamed: 0,id,Cliente_ID,Producto_ID,d2,d3
0,2,4549769,32940,,
1,7,4414012,35305,,
2,109,1695684,1220,,
3,114,114659,1064,,
4,118,421275,41938,,


In [333]:
train_joined = pd.concat([s6, s7, s8, s9], ignore_index = True)
test_joined = pd.concat([t10, t11], ignore_index = True)

In [334]:
train_joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 610615 entries, 0 to 610614
Data columns (total 5 columns):
Demanda_uni_equil    430597 non-null float64
d2                   184065 non-null float64
d3                   6255 non-null float64
Cliente_ID           610615 non-null object
Producto_ID          610615 non-null object
dtypes: float64(3), object(2)
memory usage: 28.0+ MB


In [335]:
train_joined.head()

Unnamed: 0,Demanda_uni_equil,d2,d3,Cliente_ID,Producto_ID
0,25.0,,,1000006,37005
1,2.0,3.0,,1000042,1120
2,24.0,,,1000052,1278
3,5.0,,,1000058,1278
4,7.0,,,100015,1240


In [336]:
train_joined[['Cliente_ID', 'Producto_ID']]=train_joined[['Cliente_ID', 'Producto_ID']].astype('int32')
test_joined[['Cliente_ID', 'Producto_ID']]=test_joined[['Cliente_ID', 'Producto_ID']].astype('int32')

In [None]:
train_joined.info()

In [None]:
test_joined.info()

In [337]:
train_joined.fillna(value=0, inplace=True)
test_joined.fillna(value=0, inplace=True)

In [None]:
train_joined.to_csv('../input/train_d2d3.csv')
test_joined.to_csv('../input/test_d2d3.csv')

In [339]:
ids = test_joined['id']
test_joined = test_joined.drop(['id'],axis = 1)

y = train_joined['Demanda_uni_equil']
X = train_joined[test_joined.columns.values]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)

print ('Division_Set_Shapes:', X.shape, y.shape)
print ('Validation_Set_Shapes:', X_train.shape, X_test.shape)

('Division_Set_Shapes:', (610615, 4), (610615,))
('Validation_Set_Shapes:', (488492, 4), (122123, 4))


In [340]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 488492 entries, 364287 to 1677
Data columns (total 4 columns):
Cliente_ID     488492 non-null int32
Producto_ID    488492 non-null int32
d2             488492 non-null float64
d3             488492 non-null float64
dtypes: float64(2), int32(2)
memory usage: 14.9 MB


In [341]:
params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

In [342]:
test_preds = np.zeros(test_joined.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train_joined')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror,
                         early_stopping_rounds= 20, verbose_eval = 10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print ('RMSLE Score:', rmsle(y_test, preds))

Will train until train_joined error hasn't decreased in 20 rounds.
[0]	train_joined-error:1.198681
[10]	train_joined-error:0.952965
[20]	train_joined-error:0.877669
[30]	train_joined-error:0.853422
[40]	train_joined-error:0.875693


('RMSLE Score:', 0.8554929448608547)


Stopping. Best iteration:
[29]	train_joined-error:0.847988



In [343]:
fxg_test = xgb.DMatrix(test_joined)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1)
test_preds += fold_preds

In [344]:
submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})

submission[["id","Demanda_uni_equil"]].to_csv('../submissions/' +
                                              datetime.now().strftime('%Y-%m-%d-%H-%M-%S') +'.csv', index=False)

print ('done')

done
