In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
import scipy.sparse as sps
from scipy.sparse import coo_matrix, hstack, vstack, csr_matrix
from scipy import io
from datetime import datetime
import gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    assert len(preds) == len(labels)
    labels = labels.tolist()
    preds = preds.tolist()
    terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1))
                    ** 2.0 for i,pred in enumerate(labels)]
    return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5

In [3]:
train = pd.read_csv('../input/train_1000.csv', usecols=['Semana', 'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'])
test = pd.read_csv('../input/test_1000.csv', usecols=['id', 'Semana', 'Cliente_ID', 'Producto_ID'])
train.tail()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil
995,8,4387326,43169,2
996,8,2092900,35571,1
997,8,2140749,43200,3
998,8,4600772,34255,1
999,8,11570,30749,3


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
Semana               1000 non-null int64
Cliente_ID           1000 non-null int64
Producto_ID          1000 non-null int64
Demanda_uni_equil    1000 non-null int64
dtypes: int64(4)
memory usage: 31.3 KB


In [5]:
test.head()

Unnamed: 0,id,Semana,Cliente_ID,Producto_ID
0,2107,10,184044,31588
1,4750,10,2385912,1145
2,6252,11,766465,35305
3,18978,11,2229028,43251
4,30799,11,711302,1220


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
id             1000 non-null int64
Semana         1000 non-null int64
Cliente_ID     1000 non-null int64
Producto_ID    1000 non-null int64
dtypes: int64(4)
memory usage: 31.3 KB


In [7]:
grouping = ['Semana', 'Cliente_ID', 'Producto_ID']
train_group = train.groupby(grouping,as_index=False)['Demanda_uni_equil'].sum()

In [8]:
train_group.tail()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil
995,8,4461013,5337,5
996,8,4490602,37361,4
997,8,4558805,1109,0
998,8,4600772,34255,1
999,8,7839768,1242,3


In [9]:
train_group.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 4 columns):
Semana               1000 non-null int64
Cliente_ID           1000 non-null int64
Producto_ID          1000 non-null int64
Demanda_uni_equil    1000 non-null int64
dtypes: int64(4)
memory usage: 39.1 KB


In [10]:
train_group.head()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil
0,3,9218,1284,17
1,3,15959,43121,42
2,3,18643,1150,2
3,3,35687,693,2
4,3,38192,43033,4


In [11]:
train_cliprod = train_group
train_cliprod["cliprod"] = train_cliprod["Cliente_ID"].map(str) + ' ' + train_cliprod["Producto_ID"].map(str)

In [12]:
test_cliprod = test
test_cliprod["cliprod"] = test_cliprod["Cliente_ID"].map(str) + ' ' + test_cliprod["Producto_ID"].map(str)

In [13]:
train_cliprod.head()

Unnamed: 0,Semana,Cliente_ID,Producto_ID,Demanda_uni_equil,cliprod
0,3,9218,1284,17,9218 1284
1,3,15959,43121,42,15959 43121
2,3,18643,1150,2,18643 1150
3,3,35687,693,2,35687 693
4,3,38192,43033,4,38192 43033


In [14]:
test_cliprod.head()

Unnamed: 0,id,Semana,Cliente_ID,Producto_ID,cliprod
0,2107,10,184044,31588,184044 31588
1,4750,10,2385912,1145,2385912 1145
2,6252,11,766465,35305,766465 35305
3,18978,11,2229028,43251,2229028 43251
4,30799,11,711302,1220,711302 1220


In [15]:
grouping = ['Semana', 'cliprod']
train_cliprod = train_cliprod.groupby(grouping,as_index=False)['Demanda_uni_equil'].sum()


In [16]:
train_cliprod.head()

Unnamed: 0,Semana,cliprod,Demanda_uni_equil
0,3,101126 1216,2
1,3,101946 8921,5
2,3,102314 37516,1
3,3,1032961 1687,1
4,3,1033104 43064,12


In [17]:
train_cliprod.tail()

Unnamed: 0,Semana,cliprod,Demanda_uni_equil
995,8,91072 1150,6
996,8,925348 1125,8
997,8,92784 43285,8
998,8,95226 1146,9
999,8,98600 1064,1


In [18]:
s3 = train_cliprod[train_cliprod.Semana == 3].drop('Semana', axis=1).set_index('cliprod')
s4 = train_cliprod[train_cliprod.Semana == 4].drop('Semana', axis=1).set_index('cliprod')
s5 = train_cliprod[train_cliprod.Semana == 5].drop('Semana', axis=1).set_index('cliprod')
s6 = train_cliprod[train_cliprod.Semana == 6].drop('Semana', axis=1).set_index('cliprod')
s7 = train_cliprod[train_cliprod.Semana == 7].drop('Semana', axis=1).set_index('cliprod')
s8 = train_cliprod[train_cliprod.Semana == 8].drop('Semana', axis=1).set_index('cliprod')
s9 = train_cliprod[train_cliprod.Semana == 9].drop('Semana', axis=1).set_index('cliprod')

t10 = test_cliprod[test_cliprod.Semana == 10].drop('Semana', axis=1).set_index('cliprod')
t11 = test_cliprod[test_cliprod.Semana == 11].drop('Semana', axis=1).set_index('cliprod')

In [19]:
s3.head()

Unnamed: 0_level_0,Demanda_uni_equil
cliprod,Unnamed: 1_level_1
101126 1216,2
101946 8921,5
102314 37516,1
1032961 1687,1
1033104 43064,12


In [20]:
t10.head()

Unnamed: 0_level_0,id,Cliente_ID,Producto_ID
cliprod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
184044 31588,2107,184044,31588
2385912 1145,4750,2385912,1145
1424482 5380,32177,1424482,5380
4648745 5337,32400,4648745,5337
802373 35305,36082,802373,35305


In [21]:
s4["d1"]=s3['Demanda_uni_equil']
s5["d1"]=s4['Demanda_uni_equil']
s6["d1"]=s5['Demanda_uni_equil']
s7["d1"]=s6['Demanda_uni_equil']
s8["d1"]=s7['Demanda_uni_equil']
s9["d1"]=s8['Demanda_uni_equil']

t10["d1"]=s9['Demanda_uni_equil']

s5["d2"]=s3['Demanda_uni_equil']
s6["d2"]=s4['Demanda_uni_equil']
s7["d2"]=s5['Demanda_uni_equil']
s8["d2"]=s6['Demanda_uni_equil']
s9["d2"]=s7['Demanda_uni_equil']

t10["d2"]=s8['Demanda_uni_equil']
t11["d2"]=s9['Demanda_uni_equil']

s6["d3"]=s3['Demanda_uni_equil']
s7["d3"]=s4['Demanda_uni_equil']
s8["d3"]=s5['Demanda_uni_equil']
s9["d3"]=s6['Demanda_uni_equil']

t10["d3"]=s7['Demanda_uni_equil']
t11["d3"]=s8['Demanda_uni_equil']

s7["d4"]=s3['Demanda_uni_equil']
s8["d4"]=s4['Demanda_uni_equil']
s9["d4"]=s5['Demanda_uni_equil']

t10["d4"]=s6['Demanda_uni_equil']
t11["d4"]=s7['Demanda_uni_equil']

s8["d5"]=s3['Demanda_uni_equil']
s9["d5"]=s4['Demanda_uni_equil']

t10["d5"]=s5['Demanda_uni_equil']
t11["d5"]=s6['Demanda_uni_equil']

s9["d6"]=s3['Demanda_uni_equil']

t10["d6"]=s4['Demanda_uni_equil']
t11["d6"]=s5['Demanda_uni_equil']

In [22]:
s3.fillna(value=0, inplace=True)
s4.fillna(value=0, inplace=True)
s5.fillna(value=0, inplace=True)
s6.fillna(value=0, inplace=True)
s7.fillna(value=0, inplace=True)
s8.fillna(value=0, inplace=True)
s9.fillna(value=0, inplace=True)

t10.fillna(value=0, inplace=True)
t11.fillna(value=0, inplace=True)




In [23]:
s3["d1"]=np.nan

t11["d1"]=np.nan


s3["d2"]=np.nan
s4["d2"]=np.nan



s3["d3"]=np.nan
s4["d3"]=np.nan
s5["d3"]=np.nan



s3["d4"]=np.nan
s4["d4"]=np.nan
s5["d4"]=np.nan
s6["d4"]=np.nan


s3["d5"]=np.nan
s4["d5"]=np.nan
s5["d5"]=np.nan
s6["d5"]=np.nan
s7["d5"]=np.nan



s3["d6"]=np.nan
s4["d6"]=np.nan
s5["d6"]=np.nan
s6["d6"]=np.nan
s7["d6"]=np.nan
s8["d6"]=np.nan



In [24]:
s3.head()

Unnamed: 0_level_0,Demanda_uni_equil,d1,d2,d3,d4,d5,d6
cliprod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
101126 1216,2,,,,,,
101946 8921,5,,,,,,
102314 37516,1,,,,,,
1032961 1687,1,,,,,,
1033104 43064,12,,,,,,


In [25]:
t10.head()

Unnamed: 0_level_0,id,Cliente_ID,Producto_ID,d1,d2,d3,d4,d5,d6
cliprod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
184044 31588,2107,184044,31588,0.0,0.0,0.0,0.0,0.0,0.0
2385912 1145,4750,2385912,1145,0.0,0.0,0.0,0.0,0.0,0.0
1424482 5380,32177,1424482,5380,0.0,0.0,0.0,0.0,0.0,0.0
4648745 5337,32400,4648745,5337,0.0,0.0,0.0,0.0,0.0,0.0
802373 35305,36082,802373,35305,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
s3 = s3.reset_index()
s3.head()

Unnamed: 0,cliprod,Demanda_uni_equil,d1,d2,d3,d4,d5,d6
0,101126 1216,2,,,,,,
1,101946 8921,5,,,,,,
2,102314 37516,1,,,,,,
3,1032961 1687,1,,,,,,
4,1033104 43064,12,,,,,,


In [27]:
s3['Cliente_ID'], s3['Producto_ID'] = zip(*s3['cliprod'].apply(lambda x: x.split(' ', 1)))

In [28]:
s3.head()

Unnamed: 0,cliprod,Demanda_uni_equil,d1,d2,d3,d4,d5,d6,Cliente_ID,Producto_ID
0,101126 1216,2,,,,,,,101126,1216
1,101946 8921,5,,,,,,,101946,8921
2,102314 37516,1,,,,,,,102314,37516
3,1032961 1687,1,,,,,,,1032961,1687
4,1033104 43064,12,,,,,,,1033104,43064


In [29]:
s3 = s3.drop(['cliprod'], axis=1)
s3.head()

Unnamed: 0,Demanda_uni_equil,d1,d2,d3,d4,d5,d6,Cliente_ID,Producto_ID
0,2,,,,,,,101126,1216
1,5,,,,,,,101946,8921
2,1,,,,,,,102314,37516
3,1,,,,,,,1032961,1687
4,12,,,,,,,1033104,43064


In [30]:
s4 = s4.reset_index()
s4['Cliente_ID'], s4['Producto_ID'] = zip(*s4['cliprod'].apply(lambda x: x.split(' ', 1)))
s4 = s4.drop(['cliprod'], axis=1)

s5 = s5.reset_index()
s5['Cliente_ID'], s5['Producto_ID'] = zip(*s5['cliprod'].apply(lambda x: x.split(' ', 1)))
s5 = s5.drop(['cliprod'], axis=1)

s6 = s6.reset_index()
s6['Cliente_ID'], s6['Producto_ID'] = zip(*s6['cliprod'].apply(lambda x: x.split(' ', 1)))
s6 = s6.drop(['cliprod'], axis=1)

s7 = s7.reset_index()
s7['Cliente_ID'], s7['Producto_ID'] = zip(*s7['cliprod'].apply(lambda x: x.split(' ', 1)))
s7 = s7.drop(['cliprod'], axis=1)

s8 = s8.reset_index()
s8['Cliente_ID'], s8['Producto_ID'] = zip(*s8['cliprod'].apply(lambda x: x.split(' ', 1)))
s8 = s8.drop(['cliprod'], axis=1)

s9 = s9.reset_index()
s9['Cliente_ID'], s9['Producto_ID'] = zip(*s9['cliprod'].apply(lambda x: x.split(' ', 1)))
s9 = s9.drop(['cliprod'], axis=1)


t10 = t10.reset_index()
t10['Cliente_ID'], t10['Producto_ID'] = zip(*t10['cliprod'].apply(lambda x: x.split(' ', 1)))
t10 = t10.drop(['cliprod'], axis=1)

t11 = t11.reset_index()
t11['Cliente_ID'], t11['Producto_ID'] = zip(*t11['cliprod'].apply(lambda x: x.split(' ', 1)))
t11 = t11.drop(['cliprod'], axis=1)

In [31]:
t10.head()

Unnamed: 0,id,Cliente_ID,Producto_ID,d1,d2,d3,d4,d5,d6
0,2107,184044,31588,0.0,0.0,0.0,0.0,0.0,0.0
1,4750,2385912,1145,0.0,0.0,0.0,0.0,0.0,0.0
2,32177,1424482,5380,0.0,0.0,0.0,0.0,0.0,0.0
3,32400,4648745,5337,0.0,0.0,0.0,0.0,0.0,0.0
4,36082,802373,35305,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
train_joined = pd.concat([s3, s4, s5, s6, s7, s8, s9], ignore_index = True)
test_joined = pd.concat([t10, t11], ignore_index = True)

In [33]:
train_joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1079 entries, 0 to 1078
Data columns (total 9 columns):
Demanda_uni_equil    1079 non-null float64
d1                   891 non-null float64
d2                   711 non-null float64
d3                   518 non-null float64
d4                   340 non-null float64
d5                   158 non-null float64
d6                   79 non-null float64
Cliente_ID           1079 non-null object
Producto_ID          1079 non-null object
dtypes: float64(7), object(2)
memory usage: 75.9+ KB


In [34]:
train_joined.head()

Unnamed: 0,Demanda_uni_equil,d1,d2,d3,d4,d5,d6,Cliente_ID,Producto_ID
0,2.0,,,,,,,101126,1216
1,5.0,,,,,,,101946,8921
2,1.0,,,,,,,102314,37516
3,1.0,,,,,,,1032961,1687
4,12.0,,,,,,,1033104,43064


In [35]:
train_joined[['Cliente_ID', 'Producto_ID']]=train_joined[['Cliente_ID', 'Producto_ID']].astype('int32')
test_joined[['Cliente_ID', 'Producto_ID']]=test_joined[['Cliente_ID', 'Producto_ID']].astype('int32')

In [36]:
train_joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1079 entries, 0 to 1078
Data columns (total 9 columns):
Demanda_uni_equil    1079 non-null float64
d1                   891 non-null float64
d2                   711 non-null float64
d3                   518 non-null float64
d4                   340 non-null float64
d5                   158 non-null float64
d6                   79 non-null float64
Cliente_ID           1079 non-null int32
Producto_ID          1079 non-null int32
dtypes: float64(7), int32(2)
memory usage: 67.5 KB


In [37]:
test_joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Cliente_ID     1000 non-null int32
Producto_ID    1000 non-null int32
d1             516 non-null float64
d2             1000 non-null float64
d3             1000 non-null float64
d4             1000 non-null float64
d5             1000 non-null float64
d6             1000 non-null float64
id             1000 non-null int64
dtypes: float64(6), int32(2), int64(1)
memory usage: 62.6 KB


In [38]:
train_joined.to_csv('../input/train_alld_1000.csv')
test_joined.to_csv('../input/test_alld_1000.csv')
train_joined.fillna(value=-999, inplace=True)
test_joined.fillna(value=-999, inplace=True)

In [39]:
ids = test_joined['id']
test_joined = test_joined.drop(['id'],axis = 1)

y = train_joined['Demanda_uni_equil']
X = train_joined[test_joined.columns.values]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)

print ('Division_Set_Shapes:', X.shape, y.shape)
print ('Validation_Set_Shapes:', X_train.shape, X_test.shape)

('Division_Set_Shapes:', (1079, 8), (1079,))
('Validation_Set_Shapes:', (863, 8), (216, 8))


In [40]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 863 entries, 313 to 857
Data columns (total 8 columns):
Cliente_ID     863 non-null int32
Producto_ID    863 non-null int32
d1             863 non-null float64
d2             863 non-null float64
d3             863 non-null float64
d4             863 non-null float64
d5             863 non-null float64
d6             863 non-null float64
dtypes: float64(6), int32(2)
memory usage: 53.9 KB


In [41]:
params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

In [42]:
test_preds = np.zeros(test_joined.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train, missing = -999)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train_joined')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror,
                         early_stopping_rounds= 20, verbose_eval = 10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print ('RMSLE Score:', rmsle(y_test, preds))

[0]	train_joined-error:1.37942
Will train until train_joined-error hasn't improved in 20 rounds.
[10]	train_joined-error:0.928569
[20]	train_joined-error:0.793596
[30]	train_joined-error:0.754268
[40]	train_joined-error:0.752381
[50]	train_joined-error:0.76101
Stopping. Best iteration:
[33]	train_joined-error:0.749611

('RMSLE Score:', 0.76310932176144064)


In [43]:
fxg_test = xgb.DMatrix(test_joined)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1)
test_preds += fold_preds

In [44]:
submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})

In [45]:
submission.Demanda_uni_equil = submission.Demanda_uni_equil.round()

In [46]:
submission[["id","Demanda_uni_equil"]].to_csv('../submissions/' +
                                              datetime.now().strftime('%Y-%m-%d-%H-%M-%S') +'.csv', index=False)

print ('done')

done
