In [42]:
# %load Walmart-NN-6.py
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.externals import joblib

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import confusion_matrix

import xgboost

import theano
from lasagne import layers, nonlinearities
from nolearn.lasagne import NeuralNet

from sklearn.linear_model import LogisticRegression

In [2]:
# ---------------------------------------------------------------------------
# Feature engineering: builds X_train / X_test / y_train (plus a hold-out
# X_val / y_val) from the raw train.csv and test.csv files.
# ---------------------------------------------------------------------------
train = pd.read_csv('./train.csv') # Last visit number is 191347
test = pd.read_csv('./test.csv') # Last visit number is 191348

# Stack both files so every feature below is built once, with identical columns,
# for train and test. Rows are separated again further down by whether TripType
# is null, so test.csv is presumably missing that column -- confirm.
full_df = pd.concat((train, test))

# Per-visit sum of ScanCount over rows with a negative ScanCount.
full_df_negatives = full_df[full_df.ScanCount < 0]
full_df_negatives_agg = full_df_negatives.groupby(['VisitNumber']).agg({'ScanCount':np.sum}) # Negative feature count

# Per-visit sum of ScanCount over rows with no Upc. This mask must be taken
# BEFORE the fillna(-100) on Upc below, otherwise no row would be null any more.
full_df_uncategorized = full_df[pd.isnull(full_df.Upc)]
full_df_uncategorized_agg = full_df_uncategorized.groupby(['VisitNumber']).agg({'ScanCount':np.sum}) # Unknown feature count

# Per-visit sum of ScanCount over rows with a positive ScanCount.
full_df_totals = full_df[full_df.ScanCount > 0]
full_df_totals_agg = full_df_totals.groupby(['VisitNumber']).agg({'ScanCount':np.sum}) # Total purchases feature count


# Replace missing values with sentinels (-100 / 'Unknown') so the columns can be
# cast and concatenated below.
# NOTE(review): these are in-place fills through attribute access; whether they
# write through to full_df depends on the pandas version -- confirm.
full_df.Upc.fillna(-100, inplace=True)
full_df.DepartmentDescription.fillna('Unknown', inplace=True)
full_df.FinelineNumber.fillna(-100, inplace=True)


# One row per visit holding its Weekday, indexed by VisitNumber and then expanded
# with get_dummies (Weekday is presumably a string column -- confirm).
visit_days = full_df.loc[:,['VisitNumber','Weekday']]
visit_days.drop_duplicates('VisitNumber', inplace = True)
visit_days.set_index('VisitNumber', inplace = True)
visit_days = pd.get_dummies(visit_days)

# Combined "<department> <fineline>" key, e.g. 'Unknown -100' for rows that had
# both values missing.
full_df['FinelineNumber'] = full_df['FinelineNumber'].astype('int')
full_df['DeptItems'] = full_df.DepartmentDescription +' ' + full_df.FinelineNumber.astype('str')

# Visits x DeptItems matrix of summed ScanCount. Only rows with ScanCount > 0 are
# pivoted, so every value in this matrix is non-negative (needed by chi2 below).
full_deptitems_df = pd.pivot_table(full_df[full_df.ScanCount>0], values='ScanCount', index='VisitNumber',columns='DeptItems', aggfunc=np.sum)
full_deptitems_df.fillna(0, inplace=True)


# One row per visit with its TripType label, indexed by VisitNumber. The row
# order of y_df defines the row order of every matrix built from here on.
y_df = full_df.loc[:, ['VisitNumber', 'TripType']]
y_df.drop_duplicates('VisitNumber', inplace=True)
y_df.set_index('VisitNumber', inplace=True)

# Visits without any positive-ScanCount row are absent from the pivot, so this
# join leaves NaN for them; those are zeroed later with np.nan_to_num.
y_df = y_df.join(full_deptitems_df) # This requires an insane amount of memory. **Cannot fill 0s here due to a memory error.

# Free the pivot table as soon as it has been joined.
del full_deptitems_df

# Labelled rows (TripType present) become training data; unlabelled rows are the test set.
X_train = y_df[pd.notnull(y_df.TripType)].drop('TripType', axis = 1).values
X_test = y_df[pd.isnull(y_df.TripType)].drop('TripType', axis = 1).values
y_train = y_df[pd.notnull(y_df.TripType)]['TripType'].values


y_df = y_df[['TripType']] # Removing unnecessary columns (drops the wide DeptItems block)


X_train = np.nan_to_num(X_train) # Splitting this into 2 cells works

# Keep the 7000 DeptItems columns with the highest chi-squared score against TripType.
# NOTE(review): the selector is fit on all labelled rows, i.e. before the hold-out
# split at the end of this cell, so X_val is not independent of the selection.
chi_sq_best = SelectKBest(score_func=chi2, k = 7000)
chi_sq_best.fit(X_train,y_train)

X_train = chi_sq_best.transform(X_train)

# Apply the same NaN -> 0 conversion and the same column selection to the test rows.
X_test = np.nan_to_num(X_test)
X_test = chi_sq_best.transform(X_test)

# Second feature block: visits x DepartmentDescription matrix of summed ScanCount.
# Unlike the DeptItems pivot this one uses all rows, negative ScanCount included.
X_df = pd.pivot_table(full_df, values='ScanCount', index='VisitNumber',columns='DepartmentDescription', aggfunc=np.sum)
X_df.fillna(0, inplace=True)


# Append the per-visit aggregates and weekday dummies. Each aggregate frame has a
# single 'ScanCount' column; rsuffix only renames it when that name already exists
# on the left side, so the first join adds 'ScanCount' and the next two add
# 'ScanCountUncategorized' and 'ScanCountNegatives'.
X_df = X_df.join(full_df_totals_agg, rsuffix='Totals')
X_df = X_df.join(full_df_uncategorized_agg, rsuffix='Uncategorized')
X_df = X_df.join(full_df_negatives_agg, rsuffix='Negatives')
X_df = X_df.join(visit_days)
X_df.fillna(0, inplace = True)

# y_df holds only TripType at this point, so this attaches the second feature
# block in the same visit order as the first one.
y_df = y_df.join(X_df)

X_train2 = y_df[pd.notnull(y_df.TripType)].drop('TripType', axis = 1).values
X_test2 = y_df[pd.isnull(y_df.TripType)].drop('TripType', axis = 1).values
# NOTE(review): y_train2 is not referenced again in the visible cells.
y_train2 = y_df[pd.notnull(y_df.TripType)]['TripType'].values

# Final design matrices: selected DeptItems columns followed by the second block.
X_train = np.concatenate((X_train, X_train2), axis = 1)
X_test = np.concatenate((X_test, X_test2), axis = 1)

# Map the raw TripType values to consecutive integer class ids; enc.classes_
# keeps the original labels for naming the submission columns.
enc = LabelEncoder()
y_train = enc.fit_transform(y_train)

# Hold out 5000 labelled rows (fixed seed).
# NOTE(review): X_val / y_val are not referenced again in the visible cells.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 5000, random_state = 1)

In [3]:
# Sanity check: rows left for training after the 5000-row hold-out, and the
# total number of feature columns.
X_train.shape

(90674, 7079)

In [4]:
# Sanity check: the test matrix must have the same number of columns as X_train.
X_test.shape


(95674, 7079)

In [26]:
# Down-cast to 32-bit dtypes before handing the arrays to the network
# (presumably to match Theano's floatX=float32 -- confirm against the local config).
# Only X_train, X_test and y_train are converted here.
X_train, X_test = (features.astype('float32') for features in (X_train, X_test))
y_train = y_train.astype('int32')

In [38]:
# Layer stack: input -> dropout -> dense -> dropout -> dense -> dropout -> output.
nn_layers = [
    ('input', layers.InputLayer),
    ('dropout', layers.DropoutLayer),
    ('hidden1', layers.DenseLayer),
    ('dropout1', layers.DropoutLayer),
    ('hidden2', layers.DenseLayer),
    ('dropout2', layers.DropoutLayer),
    ('output', layers.DenseLayer),
]

# One output unit per distinct encoded class label.
nn_num_classes = len(np.unique(y_train))

nn = NeuralNet(
    layers=nn_layers,

    # Input width follows the feature matrix; the batch dimension is left open.
    input_shape=(None, X_train.shape[1]),

    # Two hidden layers, 128 then 64 units.
    hidden1_num_units=128,
    hidden2_num_units=64,

    # The same dropout probability on the input and after each hidden layer.
    dropout_p=0.2,
    dropout1_p=0.2,
    dropout2_p=0.2,

    # Softmax output, trained as a classifier rather than a regressor.
    output_num_units=nn_num_classes,
    output_nonlinearity=nonlinearities.softmax,
    regression=False,

    # Optimiser settings and training length.
    update_learning_rate=0.0005,
    update_momentum=0.9,
    max_epochs=300,

    verbose=True,
)



In [39]:
# Train the network. The log printed below reports a train loss and a validation
# loss/accuracy per epoch.
# NOTE(review): only X_train / y_train are passed in, so the validation figures
# presumably come from a split nolearn makes internally, not from X_val -- confirm.
nn.fit(X_train,y_train)

# Neural Network with 916966 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input       7079
  1  dropout     7079
  2  hidden1      128
  3  dropout1     128
  4  hidden2       64
  5  dropout2      64
  6  output        38

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       3.39475       3.14726      1.07864      0.18060  3.04s
      2       3.06962       2.83186      1.08396      0.30054  3.17s
      3       2.78868       2.53204      1.10136      0.34979  3.00s
      4       2.54949       2.30432      1.10639      0.38404  2.98s
      5       2.38179       2.13460      1.11580      0.45392  2.93s
      6       2.25697       2.01102      1.12230      0.49720  2.98s
      7       2.17200       1.91227      1.13582

NeuralNet(X_tensor_type=None,
     batch_iterator_test=<nolearn.lasagne.base.BatchIterator object at 0x7ff2981c11d0>,
     batch_iterator_train=<nolearn.lasagne.base.BatchIterator object at 0x7ff2981c1150>,
     custom_score=None, dropout1_p=0.2, dropout2_p=0.2, dropout_p=0.2,
     hidden1_num_units=128, hidden2_num_units=64, input_shape=(None, 7079),
     layers=[('input', <class 'lasagne.layers.input.InputLayer'>), ('dropout', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden1', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout1', <class 'lasagne.layers.noise.DropoutLayer'>), ('hidden2', <class 'lasagne.layers.dense.DenseLayer'>), ('dropout2', <class 'lasagne.layers.noise.DropoutLayer'>), ('output', <class 'lasagne.layers.dense.DenseLayer'>)],
     loss=None, max_epochs=300, more_params={},
     objective=<function objective at 0x7ff2981b3488>,
     objective_loss_function=<function categorical_crossentropy at 0x7ff29820b0c8>,
     on_epoch_finished=[<nolearn.lasagne.handler

In [41]:
# Class probabilities for every test visit, one column per encoded class.
y_probas = nn.predict_proba(X_test)

# Column i is labelled with the original TripType value stored in enc.classes_[i].
col_names = ['TripType_%s' % label for label in enc.classes_.astype('int')]

# Test visits are the rows of y_df without a TripType; their VisitNumber index
# becomes the first column of the submission file once the index is reset.
submission = pd.DataFrame(
    np.round(y_probas, 4),
    index=y_df[y_df.TripType.isnull()].index,
    columns=col_names
).reset_index()

submission.to_csv('Walmart_submission_NN_7000Features-6-2.csv', index=False)

In [None]:
## TODO: remove the train_test_split hold-out and use all of the training data for the NN



In [47]:
# Class probabilities from the network for the rows it was trained on; the
# cells below use them as input features for a second-stage model.
# (The misspelt name "preditions" is kept because later cells reference it.)
y_train_preditions = nn.predict_proba(X_train)

In [51]:
# Second-stage model: logistic regression with C=100; every other setting is
# left at the library default (see the repr printed after the fit cell).
log1 = LogisticRegression(C=100)

In [52]:
# Sanity check: one row per training visit, one column per class.
y_train_preditions.shape

(90674, 38)

In [53]:
# Fit the second-stage model on the network's probabilities for its own training rows.
# NOTE(review): these inputs are in-sample for the network, so the second stage
# may learn over-confident patterns -- consider out-of-fold predictions.
log1.fit(y_train_preditions, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [56]:
# Mean accuracy on the same rows the model was just fitted on.
# NOTE(review): this is a training-set score, not a held-out estimate.
log1.score(y_train_preditions,y_train)

0.82604715795045991

In [58]:
# Second-stage class probabilities for the test visits, computed from the
# network's test-set probabilities. Not written to disk in the visible cells.
y_logistic_probas = log1.predict_proba(y_probas)