# Grupo Bimgo Inventory Demand Kaggle Project
# DA 515 Case Study Project
## Berker Kozan
## Can Koklu

First we import all the necessary libraries.
The reasons for importing some specific 

In [11]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer
from sklearn.cross_validation import train_test_split
from ml_metrics import rmsle
import xgboost as xgb
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets, linear_model
import scipy.sparse as sps
from scipy.sparse import coo_matrix, hstack, vstack, csr_matrix
from scipy import io
from datetime import datetime
import gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

The competition uses a __Root Mean Squared Logarithmic Error__ to evaluate the accuracy of predictions.
The following function is used to emulate this evaluation locally.



In [None]:
def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    assert len(preds) == len(labels)
    labels = labels.tolist()
    preds = preds.tolist()
    terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1))
                    ** 2.0 for i,pred in enumerate(labels)]
    return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5

The overall data for this project is over __3GB of uncompressed text__. As such, for this project we will only creating a proof of concept using a small subset of the data.

We will first load the train and test files.

The number that we are trying to predict is the __Demanda_uni_equil__.

In [23]:
train = pd.read_csv('../input/train_1000.csv', usecols=['Cliente_ID', 'Producto_ID', 'Demanda_uni_equil'])
train.head()

Unnamed: 0,Cliente_ID,Producto_ID,Demanda_uni_equil
0,324600,202,8
1,327360,303,8
2,81569,1309,1
3,81688,1242,7
4,118819,1064,6


In [22]:
test = pd.read_csv('../input/test_1000.csv', usecols=['id', 'Cliente_ID', 'Producto_ID'])
test.head()

Unnamed: 0,id,Cliente_ID,Producto_ID
0,2107,184044,31588
1,4750,2385912,1145
2,6252,766465,35305
3,18978,2229028,43251
4,30799,711302,1220


Another important file we are using is the producto_tabla file; this file contains the names of products.  We will be using this for feature engineering.

In [21]:
products = pd.read_csv("../input/producto_tabla.csv")
products.head()

Unnamed: 0,Producto_ID,NombreProducto
0,0,NO IDENTIFICADO 0
1,9,Capuccino Moka 750g NES 9
2,41,Bimbollos Ext sAjonjoli 6p 480g BIM 41
3,53,Burritos Sincro 170g CU LON 53
4,72,Div Tira Mini Doradita 4p 45g TR 72


In the code segment below, we are engineering various features that can be used in the prediction.

What is being done:
1. Extract a short_name version and place in a new column.
2. Extract the weight and pieces and place in a new column.
3. Use __nltk__ library to stem to split and stem the short name and store in short_name_processed.
4. Use the __get_dummies__ function to convert the processed short name into columnar features.

In [25]:
products['short_name'] = products.NombreProducto.str.extract('^(\D*)', expand=False)
# products['brand'] = products.NombreProducto.str.extract('^.+\s(\D+) \d+$', expand=False)
w = products.NombreProducto.str.extract('(\d+)(Kg|g)', expand=True)
products['weight'] = w[0].astype('float') * w[1].map({'Kg': 1000, 'g': 1})
products['pieces'] = products.NombreProducto.str.extract('(\d+)p ', expand=False).astype('float')

products['short_name_processed'] = (products['short_name'].
                                    map(lambda x: " ".
                                        join([i for i in x.lower().split() 
                                              if i not in nltk.corpus.stopwords.words("spanish")])))
stemmer = SnowballStemmer("spanish")
products['short_name_processed'] = (products['short_name_processed'].
                                    map(lambda x: " ".join([stemmer.stem(i) for i in x.lower().split()])))
short_name_processed_list = products['short_name_processed'].unique()

products = pd.concat([products.drop(['short_name', 'short_name_processed', 'NombreProducto'], axis=1),
                      pd.get_dummies(short_name_processed_list)], axis=1)
products = products.drop([''], axis=1)

products.fillna(value=0, inplace=True)

print('products shape:', products.shape)

('products shape:', (2592, 965))


In [26]:
products.head()

Unnamed: 0,Producto_ID,weight,pieces,actifresh ment,actifresh yerbabuen,agu ciel jamaic,agu ciel limon,agu ciel natural,agu ciel toronj,almendr,...,tuinky vainill,twin pack thins multig,twinki vainill,two pack classic avellanafs,unic fresc naranj,valenton,vas,vidri congel,wond,wonderbutt
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9,750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41,480.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,53,170.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,72,45.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Join data and products
train = train.join(products, on='Producto_ID', lsuffix='_t')
train.fillna(value=0, inplace=True)

print('train data joined')

test = test.join(products, on='Producto_ID', lsuffix='_t')
test.fillna(value=0, inplace=True)

print('test data joined')
print('train shape', train.shape)
print('test shape', test.shape)

train data joined
test data joined
('train shape', (1000, 968))
('test shape', (1000, 968))


In [None]:
ids = test['id']
test = test.drop(['id'],axis = 1)

y = train['Demanda_uni_equil']
X = train[test.columns.values]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)

print ('Division_Set_Shapes:', X.shape, y.shape)
print ('Validation_Set_Shapes:', X_train.shape, X_test.shape)

In [15]:
params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

print ('')

test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror, early_stopping_rounds= 20, verbose_eval = 10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print ('RMSLE Score:', rmsle(y_test, preds))

fxg_test = xgb.DMatrix(test)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1)
test_preds += fold_preds

('Division_Set_Shapes:', (1000, 967), (1000,))
('Validation_Set_Shapes:', (800, 967), (200, 967))

('RMSLE Score:', 0.73602198859311663)


Will train until train error hasn't decreased in 20 rounds.
[0]	train-error:1.418320
[10]	train-error:0.932529
[20]	train-error:0.783323
[30]	train-error:0.740890
[40]	train-error:0.735710
[50]	train-error:0.738248
Stopping. Best iteration:
[37]	train-error:0.735631



In [16]:
submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})

submission[["id","Demanda_uni_equil"]].to_csv('../submissions/' +
                                              datetime.now().strftime('%Y-%m-%d-%H-%M-%S') +'.csv', index=False)

print ('done')

done
