In [1]:
# %load Walmart-NN-6.py
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.externals import joblib

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import confusion_matrix

import xgboost

import theano
from lasagne import layers, nonlinearities
from nolearn.lasagne import NeuralNet, BatchIterator

from sklearn.linear_model import LogisticRegression

Using gpu device 0: GRID K520 (CNMeM is disabled)


In [2]:
train = pd.read_csv('./train.csv') #Last visit number is 191347
test = pd.read_csv('./test.csv') #Last visit number is 191348

full_df = pd.concat((train, test))

full_df_negatives = full_df[full_df.ScanCount < 0]
full_df_negatives_agg = full_df_negatives.groupby(['VisitNumber']).agg({'ScanCount':np.sum}) #Negative Feature Count

full_df_uncategorized = full_df[pd.isnull(full_df.Upc)]
full_df_uncategorized_agg = full_df_uncategorized.groupby(['VisitNumber']).agg({'ScanCount':np.sum}) #Unknown Feature Count

full_df_totals = full_df[full_df.ScanCount > 0]
full_df_totals_agg = full_df_totals.groupby(['VisitNumber']).agg({'ScanCount':np.sum}) #Total purchases Feature Count

In [3]:
full_df.head()

Unnamed: 0,DepartmentDescription,FinelineNumber,ScanCount,TripType,Upc,VisitNumber,Weekday
0,FINANCIAL SERVICES,1000,-1,999,68113152929,5,Friday
1,SHOES,8931,1,30,60538815980,7,Friday
2,PERSONAL CARE,4504,1,30,7410811099,7,Friday
3,PAINT AND ACCESSORIES,3565,2,26,2238403510,8,Friday
4,PAINT AND ACCESSORIES,1017,2,26,2006613744,8,Friday


In [4]:
#print full_df.FinelineNumber.nunique() #5353
#print full_df.Upc.nunique() #124693

In [5]:
#full_df[pd.isnull(full_df.FinelineNumber)].tail(30) #Most Values that have a NA for FinelineNumber Also have NA for Upc

In [6]:
full_df.Upc.fillna(-100, inplace=True)
full_df.DepartmentDescription.fillna('UNKNOWN', inplace=True)
full_df.FinelineNumber.fillna(-100, inplace=True)

visit_days = full_df.loc[:,['VisitNumber','Weekday']]
visit_days.drop_duplicates('VisitNumber', inplace = True)
visit_days.set_index('VisitNumber', inplace = True)
visit_days = pd.get_dummies(visit_days)

In [7]:
full_df['FinelineNumber'] = full_df['FinelineNumber'].astype('int')
full_df['DeptItems'] = full_df.DepartmentDescription +' ' + full_df.FinelineNumber.astype('str')

full_deptitems_df = pd.pivot_table(full_df[full_df.ScanCount>0], values='ScanCount', index='VisitNumber',columns='DeptItems', aggfunc=np.sum)
full_deptitems_df.fillna(0, inplace=True)


y_df = full_df.loc[:, ['VisitNumber', 'TripType']]
y_df.drop_duplicates('VisitNumber', inplace=True)
y_df.set_index('VisitNumber', inplace=True)

y_df = y_df.join(full_deptitems_df) #This requires an insane amount of memory **Cannot fill 0s due to memory error

In [8]:
del full_deptitems_df

In [10]:
X_train = y_df[pd.notnull(y_df.TripType)].drop('TripType', axis = 1).values
X_test = y_df[pd.isnull(y_df.TripType)].drop('TripType', axis = 1).values
y_train = y_df[pd.notnull(y_df.TripType)]['TripType'].values


y_df = y_df[['TripType']] #Removing Unneccessary Columns

In [11]:
X_train = np.nan_to_num(X_train) #Splitting this into 2 cells works

In [12]:
chi_sq_best = SelectKBest(score_func=chi2, k = 10000)
chi_sq_best.fit(X_train,y_train)

X_train = chi_sq_best.transform(X_train)

X_test = np.nan_to_num(X_test)
X_test = chi_sq_best.transform(X_test)

In [13]:
X_df = pd.pivot_table(full_df, values='ScanCount', index='VisitNumber',columns='DepartmentDescription', aggfunc=np.sum)
X_df.fillna(0, inplace=True)


X_df = X_df.join(full_df_totals_agg, rsuffix='Totals')
X_df = X_df.join(full_df_uncategorized_agg, rsuffix='Uncategorized')
X_df = X_df.join(full_df_negatives_agg, rsuffix='Negatives')
X_df = X_df.join(visit_days)
X_df.fillna(0, inplace = True)

y_df = y_df.join(X_df)

X_train2 = y_df[pd.notnull(y_df.TripType)].drop('TripType', axis = 1).values
X_test2 = y_df[pd.isnull(y_df.TripType)].drop('TripType', axis = 1).values
y_train2 = y_df[pd.notnull(y_df.TripType)]['TripType'].values

X_train = np.concatenate((X_train, X_train2), axis = 1)
X_test = np.concatenate((X_test, X_test2), axis = 1)

enc = LabelEncoder()
y_train = enc.fit_transform(y_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 5000, random_state = 1)

In [26]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

y_train = y_train.astype('int32')

In [27]:
class AdjustVariable(object):
    def __init__(self, name, start=0.03, stop=0.001):
        self.name = name
        self.start, self.stop = start, stop
        self.ls = None

    def __call__(self, nn, train_history):
        if self.ls is None:
            self.ls = np.linspace(self.start, self.stop, nn.max_epochs)

        epoch = train_history[-1]['epoch']
        new_value = float32(self.ls[epoch - 1])
        getattr(nn, self.name).set_value(new_value)
        
class EarlyStopping(object):
    def __init__(self, patience=100):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        current_valid = train_history[-1]['valid_loss']
        current_epoch = train_history[-1]['epoch']
        if current_valid < self.best_valid:
            self.best_valid = current_valid
            self.best_valid_epoch = current_epoch
            self.best_weights = nn.get_all_params_values()
        elif self.best_valid_epoch + self.patience < current_epoch:
            print("Early stopping.")
            print("Best valid loss was {:.6f} at epoch {}.".format(
                self.best_valid, self.best_valid_epoch))
            nn.load_params_from(self.best_weights)
            raise StopIteration()
            
def float32(k):
    return np.cast['float32'](k)

In [29]:
nn = NeuralNet(layers = [
     ('input', layers.InputLayer),
     ('dropout', layers.DropoutLayer),
     ('hidden1', layers.DenseLayer),
     ('dropout1', layers.DropoutLayer),   
     ('hidden2', layers.DenseLayer),
     ('dropout2', layers.DropoutLayer),   
     ('output', layers.DenseLayer),],
               
     input_shape = (None, X_train.shape[1]),
     dropout_p =.25,
               
     hidden1_num_units = 128,
     dropout1_p = .15,
     hidden2_num_units = 64,
     dropout2_p = .15,
               
     output_num_units = np.unique(y_train).shape[0],
     output_nonlinearity = nonlinearities.softmax,
               
     update_learning_rate=theano.shared(float32(0.03)),
     update_momentum=theano.shared(float32(0.9)),
    
     batch_iterator_train=BatchIterator(batch_size=1024),
               
     on_epoch_finished=[
        AdjustVariable('update_learning_rate', start=0.03, stop=0.0001),
        AdjustVariable('update_momentum', start=0.9, stop=0.999),
        EarlyStopping(patience=25)
        ],

     regression = False,
     max_epochs = 200,
     verbose = True
      )



In [30]:
nn.fit(X_train,y_train)

# Neural Network with 1300966 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input      10079
  1  dropout    10079
  2  hidden1      128
  3  dropout1     128
  4  hidden2       64
  5  dropout2      64
  6  output        38

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.88408[0m       [32m2.16694[0m      1.33095      0.42873  2.64s
      2       [36m2.10599[0m       [32m1.65165[0m      1.27508      0.58642  2.55s
      3       [36m1.84701[0m       [32m1.44312[0m      1.27988      0.61387  2.40s
      4       [36m1.70986[0m       [32m1.31878[0m      1.29655      0.63887  2.55s
      5       [36m1.63761[0m       [32m1.25250[0m      1.30748      0.64903  2.52s
      6       [36m1.56697[0m       [32m1.20155[0m      1.30412      0.65784  2.90s
      7       [36m1.52231[0m       [32m1.15751[0m      1.31516   

NeuralNet(X_tensor_type=None,
     batch_iterator_test=<nolearn.lasagne.base.BatchIterator object at 0x7fdae13ed310>,
     batch_iterator_train=<nolearn.lasagne.base.BatchIterator object at 0x7fdad5ddf410>,
     custom_score=None, dropout1_p=0.2, dropout2_p=0.2, dropout_p=0.25,
     hidden1_num_units=128, hidden2_num_units=64,
     input_shape=(None, 10079),
     layers=[('input', <class 'lasagne.layers.input.InputLayer'>), ('dropout', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden1', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout1', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden2', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout2', <class 'lasagne.layers.noise.DropoutLayer'>), ('output', <class 'lasagne.layers.dense.DenseLayer'>)],
     loss=None, max_epochs=300, more_params={},
     objective=<function objective at 0x7fdae13d7e60>,
     objective_loss_function=<function categorical_crossentropy at 0x7fdae142caa0>,
     on_epoch_finished=[<__main__.AdjustV

In [31]:
X_, X_val, y_, y_val = train_test_split(X_train, y_train, test_size = 25000, random_state = 13)

del X_
del y_

xgb = xgboost.XGBClassifier(max_depth = 14, n_estimators = 200,
                        objective='multi:softprob', subsample = .80, colsample_bytree=.5, )

xgb.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'mlogloss', early_stopping_rounds=25)

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-mlogloss:3.167847
[1]	validation_0-mlogloss:2.830302
[2]	validation_0-mlogloss:2.613766
[3]	validation_0-mlogloss:2.477340
[4]	validation_0-mlogloss:2.369453


XGBClassifier(base_score=0.5, colsample_bytree=0.5, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=14,
       min_child_weight=1, missing=None, n_estimators=5, nthread=-1,
       objective='multi:softprob', seed=0, silent=True, subsample=0.8)

In [33]:
y_xgb_train_predictions = xgb.predict_proba(X_train)
y_nn_train_predictions = nn.predict_proba(X_train)

In [38]:
X_ensembl_train = np.concatenate((y_xgb_train_predictions, y_nn_train_predictions), axis = 1)

In [40]:
log_ensembl = LogisticRegression(C=100)

In [41]:
log_ensembl.fit(X_ensembl_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [43]:
log_ensembl.score(X_ensembl_train,y_train)

0.9084412290182412

In [52]:
y_xgb_test_predictions = xgb.predict_proba(X_test)
y_nn_test_predictions = nn.predict_proba(X_test)

In [53]:
X_ensembl_test = np.concatenate((y_xgb_test_predictions, y_nn_test_predictions), axis = 1)

In [54]:
y_probas = log_ensembl.predict_proba(X_ensembl_test)


col_names = ['TripType_' + str(c) for c in enc.classes_.astype('int')]
submission = pd.DataFrame(np.round(y_probas, 4), index=y_df[pd.isnull(y_df.TripType)].index, columns = col_names)

submission.reset_index(inplace = True)
submission.to_csv('Walmart_log_ensembl_10000Features-Notebook.csv', index=False)

In [56]:
y_probas_avg = (y_xgb_test_predictions + y_nn_test_preditions)/2
submission = pd.DataFrame(np.round(y_probas_avg,4), index=y_df[pd.isnull(y_df.TripType)].index, columns = col_names)

submission.reset_index(inplace = True)
submission.to_csv('Walmart_avg_ensembl_10000Features-Notebook.csv', index=False)