## First, download the data files (CSV) from
https://www.kaggle.com/c/grupo-bimbo-inventory-demand/data

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
import scipy.sparse as sps
from scipy.sparse import coo_matrix, hstack, vstack, csr_matrix
from scipy import io
from datetime import datetime
import gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Extracting Product Features

Example row of the products file (`producto_tabla.csv`) -> 9,Capuccino Moka 750g NES 9

In [3]:
# Keep wide frames on a single line when they are displayed.
pd.set_option('expand_frame_repr', False)

products = pd.read_csv("../../input/producto_tabla.csv")

# A product name looks like "Capuccino Moka 750g NES 9"; the regexes below pull
# the individual attributes out of that free-text field.
# Leading run of non-digit characters -> short product name.
products['short_name'] = products.NombreProducto.str.extract(r'^(\D*)', expand=False)
# Last non-digit token in front of the trailing number -> brand code.
products['brand'] = products.NombreProducto.str.extract(r'^.+\s(\D+) \d+$', expand=False)
# Weight, normalised to grams (Kg values are multiplied by 1000).
w = products.NombreProducto.str.extract(r'(\d+)(Kg|g)', expand=True)
products['weight'] = w[0].astype('float') * w[1].map({'Kg': 1000, 'g': 1})
# Pieces per pack ("6p "); names without a piece count are treated as 1 piece.
products['pieces'] = (products.NombreProducto.str.extract(r'(\d+)p ', expand=False)
                      .astype('float')
                      .fillna(1))

# Build the stop-word set once: calling stopwords.words() for every token
# re-reads the corpus each time and dominates the runtime of this cell.
spanish_stopwords = set(nltk.corpus.stopwords.words("spanish"))
stemmer = SnowballStemmer("spanish")
# Lower-case, drop Spanish stop words, then stem every remaining token.
products['short_name_processed'] = products['short_name'].map(
    lambda name: " ".join(stemmer.stem(token)
                          for token in name.lower().split()
                          if token not in spanish_stopwords))

# Drop the row with index label 0 (presumably a placeholder product - confirm
# against the CSV) and the raw text columns that are no longer needed.
products = products.drop(0).drop(["short_name", "NombreProducto"], axis=1)
# Fill the remaining numeric gaps (e.g. a missing weight) with the column mean.
# fillna(..., inplace=True) returns None, so the result is assigned instead of
# chaining .head() onto it; numeric_only skips the string columns.
products = products.fillna(products.mean(numeric_only=True))
products.head(5)

Unnamed: 0,Producto_ID,brand,weight,pieces,short_name_processed
1,9,NES,750.0,1.0,capuccin mok
2,41,BIM,480.0,6.0,bimboll ext sajonjoli
3,53,LON,170.0,1.0,burrit sincr
4,72,TR,45.0,4.0,div tir mini doradit
5,73,BIM,540.0,1.0,pan multigran linaz


## Read train and test

In [4]:
# Compact unsigned dtypes for the columns that are read from train.csv.
types = {
    'Semana': np.uint8,
    'Agencia_ID': np.uint16,
    'Canal_ID': np.uint8,
    'Cliente_ID': np.uint32,
    'Producto_ID': np.uint16,
    'Demanda_uni_equil': np.uint32,
}

# test.csv is read with a row 'id' column and without the target column.
typesTest = {
    'id': np.uint32,
    'Semana': np.uint8,
    'Agencia_ID': np.uint16,
    'Canal_ID': np.uint8,
    'Cliente_ID': np.uint32,
    'Producto_ID': np.uint16,
}

# Only the columns named in the dtype maps are loaded.
train = pd.read_csv('../../input/train.csv', usecols=list(types), dtype=types)
test = pd.read_csv('../../input/test.csv', usecols=list(typesTest), dtype=typesTest)

In [5]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
train.info(memory_usage=True)
test.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74180464 entries, 0 to 74180463
Data columns (total 6 columns):
Semana               uint8
Agencia_ID           uint16
Canal_ID             uint8
Cliente_ID           uint32
Producto_ID          uint16
Demanda_uni_equil    uint32
dtypes: uint16(2), uint32(2), uint8(2)
memory usage: 990.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6999251 entries, 0 to 6999250
Data columns (total 6 columns):
id             uint32
Semana         uint8
Agencia_ID     uint16
Canal_ID       uint8
Cliente_ID     uint32
Producto_ID    uint16
dtypes: uint16(2), uint32(2), uint8(2)
memory usage: 93.5 MB
None


## Separate the features (x) and the target (y) of the train data, then append test to train so that both share the same sparse product-feature column order. If the column order differs between them, the model's predictions are wrong. Finally, merge the combined data with the products table.

In [6]:
# Keep the submission ids aside; they are not used as a model feature.
testIds = test['id']
test = test.drop('id', axis=1)

# Split the training frame into the target and the feature columns it shares
# with the test frame.
trainY = train.loc[:, 'Demanda_uni_equil']
trainX = train.loc[:, test.columns.values]

# Stack train on top of test so both are encoded with one shared vocabulary.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
trainTest = pd.concat([trainX, test], ignore_index=True)

# Left join keeps every train/test row, including products missing from the
# products table.
mergedTrainedTestProduct = trainTest.merge(products, on="Producto_ID", how="left")
mergedTrainedTestProduct.head(5)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Cliente_ID,Producto_ID,brand,weight,pieces,short_name_processed
0,3,1110,7,15766,1212,BIM,120.0,2.0,rol canel
1,3,1110,7,15766,1216,BIM,135.0,2.0,rol glass
2,3,1110,7,15766,1238,BIM,140.0,2.0,panquecit got choc
3,3,1110,7,15766,1240,BIM,125.0,4.0,mantec vainill
4,3,1110,7,15766,1242,BIM,105.0,6.0,donit espolvor


In [7]:
# Function-call form of print: valid on both Python 2 and Python 3
# (the bare `print x` statement is a syntax error on Python 3).
print(mergedTrainedTestProduct.dtypes)

Semana                    uint8
Agencia_ID               uint16
Canal_ID                  uint8
Cliente_ID               uint32
Producto_ID              uint16
brand                    object
weight                  float64
pieces                  float64
short_name_processed     object
dtype: object


## Garbage Collection to free memory..

In [8]:
# Remember the number of test rows before the frame is released; the
# prediction buffer is sized from it later.
rowsize = len(test)

# The merged frame supersedes these intermediates, so free them.
del trainX, test, trainTest
gc.collect()

248

## Use count vectorizer on short_name_processed column to create sparse count-word matrix

In [9]:
countvec = CountVectorizer()

# Word-count matrix of the stemmed short names, stored as int8 to keep the
# value array small.
shortname_sparse = (
    countvec
    .fit_transform(mergedTrainedTestProduct['short_name_processed'])
    .astype(np.int8)
)
shortname_sparse

<81179715x542 sparse matrix of type '<type 'numpy.int8'>'
	with 145774286 stored elements in Compressed Sparse Row format>

In [10]:
# Size in bytes of the stored values only (the matrix's index arrays are not included).
shortname_sparse.data.nbytes

145774286

## Use count vectorizer on brand column

In [10]:
# Re-fit the same vectorizer on the brand codes; this replaces the vocabulary
# it learned from the short names. Counts are stored as int8.
brand_sparse = (
    countvec
    .fit_transform(mergedTrainedTestProduct['brand'])
    .astype(np.int8)
)
brand_sparse

<81179715x33 sparse matrix of type '<type 'numpy.int8'>'
	with 81179715 stored elements in Compressed Sparse Row format>

# Dropping unnecessary columns from merged Train-Test data

In [11]:
# Keep only the numeric product features. Selecting by name instead of by
# column position keeps the cell correct if the column order changes and makes
# it safe to re-run. This discards the id columns (Semana, Agencia_ID,
# Canal_ID, Cliente_ID, Producto_ID) as well as the two text columns, which
# are already encoded in the sparse matrices.
# NOTE(review): float16 represents integers exactly only up to 2048 and
# overflows above 65504 - confirm that every weight fits.
mergedTrainedTestProduct = mergedTrainedTestProduct[['weight', 'pieces']].astype(np.float16)

# info() prints its own report and returns None, so it is not wrapped in print.
mergedTrainedTestProduct.info()
mergedTrainedTestProduct.head(1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81179715 entries, 0 to 81179714
Data columns (total 2 columns):
weight    float16
pieces    float16
dtypes: float16(2)
memory usage: 929.0 MB
None


Unnamed: 0,weight,pieces
0,120.0,2.0


In [12]:
# Force a collection pass; the cell output is the number of unreachable objects found.
gc.collect()

98

# Merging train-test data with brand and short name sparse matrices

In [13]:
# Wrap the remaining dense columns in a COO matrix so they can be stacked with
# the sparse count matrices.
mergedTrainedTestProduct_sparse = coo_matrix(mergedTrainedTestProduct)

# The dense frame is no longer needed once it has been converted.
del mergedTrainedTestProduct
gc.collect()

# Column-wise concatenation: [dense columns | short-name counts | brand counts],
# produced in CSR format.
completeSparseData = sps.hstack(
    [mergedTrainedTestProduct_sparse, shortname_sparse, brand_sparse],
    format='csr',
)
completeSparseData
# Optional: persist the assembled matrix to disk.
#io.mmwrite("../../sparse/completeSparseData.mtx", completeSparseData)

<81179715x577 sparse matrix of type '<type 'numpy.float32'>'
	with 389313431 stored elements in Compressed Sparse Row format>

## Garbage Collection..

In [14]:
# All three feature blocks have been copied into completeSparseData, so release
# every one of them - including the COO matrix of the dense columns, which was
# previously left alive for the rest of the notebook.
del shortname_sparse, brand_sparse, mergedTrainedTestProduct_sparse
gc.collect()

0

## Split the combined train-test matrix back into separate train and test sparse matrices

In [15]:
# Train rows were stacked first, so the first len(train) rows are the training
# part and everything after them is the test part.
n_train_rows = train.shape[0]
train_sparse = completeSparseData[:n_train_rows]
test_sparse = completeSparseData[n_train_rows:]
test_sparse
# Optional: persist the test matrix to disk.
#io.mmwrite("../../sparse/test_sparse.mtx", test_sparse)

<6999251x577 sparse matrix of type '<type 'numpy.float32'>'
	with 33593313 stored elements in Compressed Sparse Row format>

## Garbage Collection..

In [16]:
# Remove the name of the combined matrix now that train_sparse/test_sparse
# have been sliced from it, then force a collection pass.
del completeSparseData
gc.collect()

0

## Root mean squared logarithmic error (RMSLE) evaluation function

In [17]:
def rmsle_eval(y, y0):
    """Custom evaluation metric: root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    y : array-like
        Predicted values.
    y0 : object exposing ``get_label()``
        Container of the true targets; ``get_label()`` must return an
        array-like with the same length as ``y``.

    Returns
    -------
    tuple
        ``('error', rmsle)`` - the metric name and its value.

    Raises
    ------
    AssertionError
        If predictions and labels differ in length.
    """
    y0 = y0.get_label()
    assert len(y) == len(y0)
    # log1p is undefined for values <= -1, so a single such prediction would
    # turn the whole metric into NaN/inf. Negative predictions are scored as 0.
    y = np.maximum(y, 0)
    return 'error', np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))

## Using XGBoost to predict target variable while using custom evaluation metric above..

# Prediction..

In [18]:
import xgboost as xgb

# Booster configuration.
params = {
    'objective': "reg:linear",
    'eta': 0.025,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'silent': True,
}

# Accumulator for the test predictions, one slot per test row.
test_preds = np.zeros(rowsize)

xg_train = xgb.DMatrix(train_sparse, label=trainY)
xg_test = xgb.DMatrix(test_sparse)

# The DMatrix objects are used from here on; drop the source names and collect.
del train_sparse, test_sparse, trainY
gc.collect()

# NOTE(review): the only watchlist entry is the training set, so early
# stopping monitors the training error rather than a held-out score.
watchlist = [(xg_train, 'train')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval=rmsle_eval,
                         early_stopping_rounds=20, verbose_eval=10)

# best_iteration is a 0-based round index whereas ntree_limit is a number of
# trees, so +1 is needed to include the best round itself.
raw_preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration + 1)
# The squared-error objective can produce negative values; clamp them to 0
# before rounding, because RMSLE is not defined for negative predictions.
fold_preds = np.rint(np.clip(raw_preds, 0, None))

test_preds += fold_preds

# Write a timestamped submission file with the columns in id, target order.
submission = pd.DataFrame({'id': testIds, 'Demanda_uni_equil': test_preds})
submission[["id", "Demanda_uni_equil"]].to_csv('../../submissions/' +
                                              datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv', index=False)

Will train until train error hasn't decreased in 20 rounds.
[0]	train-error:1.363313
[10]	train-error:0.918269
[20]	train-error:0.792863
[30]	train-error:0.765580
[40]	train-error:0.768304
[50]	train-error:0.782227
Stopping. Best iteration:
[34]	train-error:0.764148



In [19]:
# Summarise the submission frame (row count, column dtypes, memory usage).
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6999251 entries, 0 to 6999250
Data columns (total 2 columns):
Demanda_uni_equil    float64
id                   uint32
dtypes: float64(1), uint32(1)
memory usage: 80.1 MB
