In [32]:
from __future__ import print_function

import sys
import os
import time

import numpy as np
import theano
import theano.tensor as T

import lasagne

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

import math
import numpy as np
import pandas as pd
from scipy import sparse


In [10]:
def loss_function(y, pred):
    eps = 1e-15
    total = 0.
    for i in range(len(y)):
        p = max(min(pred[i][y[i]], (1 - eps)), eps)
        total += math.log(p)
    return -(total/len(y))

def OneHotEncoder(col):
    #(values,counts) = np.unique(col,return_counts=True)
    #values[np.argsort(counts)[-2000:]]
    col = np.nan_to_num(col)
    uniques = np.unique(col)
    keymap = dict((key, i) for i, key in enumerate(uniques))
    
    total_pts = len(col)
    num_labels = len(uniques)
    
    spmat = sparse.lil_matrix((total_pts, num_labels))
    for j, val in enumerate(col):
        if val[0] in keymap:
            spmat[j, keymap[val[0]]] = 1
    return keymap, spmat

def getY(col):
    uniques = np.unique(col)
    keymap = dict((key, i) for i, key in enumerate(uniques))
    newY = np.array([keymap[x] for x in col])
    return keymap, newY
def prepareForCountVector(df, columnName, dictCount=2000):
    col = df[columnName].dropna()
    col = col.fillna('')

    counts = col.value_counts()
    topk_dict = counts.iloc[0:min(dictCount, len(col))].index
    
    topk_dict = set(topk_dict).union(set(topk_dict))
    topk = df[columnName].apply(lambda x: '%s%d'%(columnName, x) if x in topk_dict else '%sother'%(columnName))
 
    topk_se = pd.Series(topk, name=columnName)
    df_topk = pd.concat([topk_se, df['VisitNumber']], axis=1)
    return topk_dict, df_topk

def getCountVector(df, columnName, isWords, vec=None):
    if isWords:
        df[columnName] = df[columnName].fillna('')
    df_topk_gpy = df.groupby('VisitNumber')
    df_topk_list = df_topk_gpy.apply(lambda x: list(x[columnName]))
    topk_flat = df_topk_list.str.join(' ')
    
    if not vec: 
        vec = CountVectorizer() 
        vec.fit(topk_flat)    
    
    wc = vec.transform(topk_flat)
    wcar = wc.toarray()
    
    words_count = topk_flat.apply(lambda x : len(x.split(' '))).reshape(-1,1)
    ret = None
    if isWords:
        words_len = topk_flat.apply(lambda x : len(x)).reshape(-1,1)
        ret = np.column_stack([wcar, words_count, words_len])
    else:
        ret = np.column_stack([wcar, words_count])
    
    return vec, ret



In [12]:
#TripType	VisitNumber	Weekday	Upc	ScanCount	DepartmentDescription	FinelineNumber
train_df = pd.read_csv('train.csv')
num_train = np.shape(train_df)[0]

df_y = train_df[['VisitNumber', 'TripType']].groupby('VisitNumber').first()
df_y = df_y.reset_index()

y_map, y = getY(df_y.TripType)



In [17]:
df_w = train_df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first()
df_w = df_w.reset_index()
week = OneHotEncoder(df_w.Weekday)

is_wknd = np.array(df_w['Weekday']=='Sunday')
is_wknd = is_wknd.reshape(-1,1)

df_upc = prepareForCountVector(train_df, 'Upc')
upc = getCountVector(df_upc[1], 'Upc', False)

df_fln = prepareForCountVector(train_df, 'FinelineNumber')
fln = getCountVector(df_fln[1], 'FinelineNumber', False)

words = getCountVector(train_df, 'DepartmentDescription', True)

df_ScanCount = train_df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum()
df_ScanCount = df_ScanCount.reset_index()
scancount = np.array(df_ScanCount.ScanCount)
scancount = scancount.reshape(-1,1)

In [34]:
feature_matrix = []
feature_matrix.append(week[1])
feature_matrix.append(is_wknd)
feature_matrix.append(upc[1])
feature_matrix.append(fln[1])
feature_matrix.append(words[1])
feature_matrix.append(scancount)

feature_matrix = sparse.hstack(feature_matrix).tocsr()
feature_matrix = StandardScaler().fit_transform(feature_matrix.toarray())
feature_matrix.shape

AttributeError: 'numpy.ndarray' object has no attribute 'tocsr'

In [35]:
feature_matrix = StandardScaler().fit_transform(feature_matrix.toarray())
feature_matrix.shape

(95674, 4127)

In [36]:
X_train, X_val, y_train, y_val = train_test_split(feature_matrix, y, test_size=0.33, random_state=0)


In [37]:
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    input_len = inputs.shape[0] 
    assert input_len == targets.shape[0]
    if shuffle:
        indices = np.arange(input_len)
        np.random.shuffle(indices)
    for start_idx in range(0, input_len - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]
        
def build_custom_mlp(input_var=None, depth=2, width=800, drop_input=.2,
                     drop_hidden=.5):
    network = lasagne.layers.InputLayer(shape=(None, 4127),
                                        input_var=input_var)
    if drop_input:
        network = lasagne.layers.dropout(network, p=drop_input)
    # Hidden layers and dropout:
    nonlin = lasagne.nonlinearities.rectify
    for _ in range(depth):
        network = lasagne.layers.DenseLayer(
                network, width, nonlinearity=nonlin)
        if drop_hidden:
            network = lasagne.layers.dropout(network, p=drop_hidden)
    # Output layer:
    softmax = lasagne.nonlinearities.softmax
    network = lasagne.layers.DenseLayer(network, 37, nonlinearity=softmax)
    return network

In [38]:
input_var = T.matrix('inputs')
target_var = T.ivector('targets')

network = build_custom_mlp(input_var)

prediction = lasagne.layers.get_output(network)
#loss = loss_function(target_var, prediction)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
    
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

test_prediction = lasagne.layers.get_output(network, deterministic=True)

test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
test_loss = test_loss.mean()

test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

In [39]:
num_epochs=5 #500

for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))


TypeError: ('Bad input argument to theano function with name "<ipython-input-38-df75b12cf68a>:22"  at index 1(0-based)', 'TensorType(int32, vector) cannot store a value of dtype int64 without risking loss of precision. If you do not mind this loss, you can: 1) explicitly cast your data to int32, or 2) set "allow_input_downcast=True" when calling "function".', array([ 5, 24,  5,  5, 31,  2,  5, 27, 27, 36,  4,  6, 30, 12, 23, 19,  3,
       21,  7, 16, 17, 32,  6, 28,  5, 36,  5,  5, 37,  4, 16,  5, 36, 30,
        5, 30,  5, 28, 31, 33,  5, 12, 31,  5, 29, 18,  6, 32, 19, 27, 32,
        4,  0, 17, 34,  5, 30, 10, 32,  9, 31, 32,  4, 17, 17, 32, 37, 32,
       19, 32,  6, 31,  4,  5, 34, 17, 30, 37,  0, 24,  6,  5,  5, 31,  4,
       19, 22,  5,  5, 31, 28, 28,  5, 31,  6, 22, 37, 17,  4, 17, 24, 27,
       31,  4,  0, 37,  5, 31, 21, 31, 32, 37,  4, 34, 21,  6, 37, 32, 16,
       37,  2,  6,  6, 30, 28, 17,  6, 16,  6, 34,  6, 32, 22,  5, 31, 28,
       30,  6,  5, 25, 35,  6,  1,  6, 32,  5, 31, 32,  6, 14,  2,  6,  6,
       16,  6,  4, 28, 30,  5, 31, 34, 35,  6, 24, 17, 31, 31, 22, 31, 28,
        2, 17,  2, 31, 12,  6,  0,  2, 25, 31, 32,  5, 12, 32,  2,  2, 30,
       37, 32,  2,  5, 34, 26, 27, 13, 31, 28, 34,  6, 31, 31, 29, 24,  0,
        4, 25, 31, 31, 37,  6, 27,  4, 37, 31,  5, 30, 32, 29, 31, 13, 31,
        6,  4, 37, 21,  4, 31,  0,  5, 31, 22, 37,  5,  6, 14, 22, 31,  5,
       28, 33, 22,  4,  6,  4,  4, 37,  2, 29,  4,  4, 27, 37, 31, 37,  6,
        9, 11,  2, 24, 34, 17,  6, 17,  2, 31, 34, 31, 30,  6,  6, 14,  5,
        0,  6, 32, 30,  5, 29, 24, 31,  2, 32, 31,  5,  5, 37, 16, 31,  6,
       37,  5,  5,  6,  5, 22,  5, 37, 32, 34,  0,  4, 11,  4,  6, 37,  6,
       23, 18, 29, 17, 37,  3, 16,  4,  4, 32,  6, 27, 28, 19, 37,  5, 34,
       10, 32, 17, 29, 24, 32,  2, 21, 31, 29, 31,  0,  6, 30, 26, 17,  1,
       37, 17,  4,  5, 24, 30, 17, 31, 29, 32, 16,  5,  5,  5,  4, 31, 37,
        9, 32, 37, 31, 37,  6, 25,  5,  4, 37,  4, 32, 28, 28, 17, 27, 31,
       37,  3, 31,  0, 32, 32,  5, 31, 24, 13, 37,  5,  4, 23,  4, 37, 37,
        2, 31, 37,  4,  6, 16,  6,  2, 34,  0,  6, 32,  5, 32,  1, 37, 23,
       29,  3, 30, 30,  4, 35,  4, 30, 24, 27,  6, 30,  0,  5, 16,  5,  6,
        4,  5, 33,  9, 31, 24, 36, 21, 31, 37, 19, 35, 31, 26, 37, 31, 37,
        5, 16,  5, 30, 30, 23,  0,  5,  6, 37,  0, 37,  5,  6,  5, 37, 37,
        5,  5, 30, 13, 31,  7,  5, 17, 31, 31, 32, 37, 37, 14,  5,  0, 31,
       37, 37,  5,  5,  4, 12,  0,  6,  4,  6, 35,  2,  2, 37,  0, 37, 31,
       30,  4,  2, 31,  5, 26, 37]))

In [None]:
# After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))
