In [1]:
import sys
import os
import time

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV

import math
import numpy as np
import pandas as pd
from scipy import sparse

from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
from nolearn.lasagne import TrainSplit


In [2]:
def loss_function(y, pred):
    """Return the mean negative log-likelihood (multi-class log loss).

    Parameters
    ----------
    y : sequence of int
        ``y[i]`` is used as the column index into ``pred[i]``.
    pred : sequence of sequences of float
        ``pred[i][k]`` is the predicted probability of class ``k`` for
        sample ``i``.

    Returns
    -------
    float
        ``-(sum of log(p_i)) / len(y)`` where ``p_i`` is the probability
        given to the label of sample ``i``, clipped to ``[1e-15, 1 - 1e-15]``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    lower = 1e-15
    upper = 1 - lower
    n_samples = len(y)

    log_sum = 0.
    for row in range(n_samples):
        prob = pred[row][y[row]]
        # Clip into [lower, upper] so log() never receives 0 and a
        # confident-but-wrong prediction gives a large finite penalty.
        if upper < prob:
            prob = upper
        if lower > prob:
            prob = lower
        log_sum += math.log(prob)

    return -(log_sum / n_samples)

def prepareForCountVector(df, columnName, dictCount=2000, topk_dict=None):
    """Turn a numeric id column into string tokens, bucketing rare values.

    Every value of ``df[columnName]`` that belongs to the "top-k" dictionary
    becomes the token ``'<columnName><int value>'``; every other value
    (including missing ones) becomes ``'<columnName>other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``columnName`` and ``'VisitNumber'``.
    columnName : str
        Column to tokenise. In-dictionary values are formatted with ``%d``.
    dictCount : int or None
        Number of most frequent values to keep when the dictionary is built
        here. ``None`` keeps every distinct value.
    topk_dict : collection, optional
        A previously built dictionary to reuse (e.g. the one built on the
        training data). When ``None`` or empty, it is built from ``df``.

    Returns
    -------
    (topk_dict, df_topk)
        ``topk_dict`` is the dictionary that was used (a ``set`` when built
        here, otherwise the object that was passed in). ``df_topk`` is a
        two-column frame ``[columnName, 'VisitNumber']`` holding the tokens.
    """
    if topk_dict is None or len(topk_dict) == 0:
        # value_counts() is sorted by descending frequency, so the head of
        # its index is the most common values; slicing past the end is safe.
        counts = df[columnName].dropna().value_counts()
        topk_dict = set(counts.index[:dictCount])

    # Set membership keeps the per-row lookup O(1) whatever was passed in.
    known_values = set(topk_dict)
    other_token = '%sother' % (columnName)

    def _to_token(value):
        # NaN never compares equal to a set member, so missing values fall
        # through to the "other" bucket.
        if value in known_values:
            return '%s%d' % (columnName, value)
        return other_token

    tokens = pd.Series(df[columnName].apply(_to_token), name=columnName)
    df_topk = pd.concat([tokens, df['VisitNumber']], axis=1)
    return topk_dict, df_topk

def getCountVector(df, columnName, isWords, vec=None):
    """Build per-visit bag-of-words counts for one column.

    The values of ``df[columnName]`` are joined with single spaces into one
    "document" per ``VisitNumber`` (groups ordered by ``VisitNumber``), and
    the documents are run through a ``CountVectorizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``columnName`` and ``'VisitNumber'``. It is not modified.
    columnName : str
        Column of string values to vectorise.
    isWords : bool
        When true, missing values are treated as ``''`` and an extra
        character-length feature is appended.
    vec : fitted vectorizer, optional
        Object with a ``transform`` method whose result has ``toarray()``.
        When ``None`` a new ``CountVectorizer`` is fitted on the documents.

    Returns
    -------
    (vec, ret)
        ``vec`` is the vectorizer used. ``ret`` is a dense 2-D array with one
        row per visit: the term counts, then the number of space-separated
        tokens, then (only if ``isWords``) the document length in characters.
    """
    column = df[columnName]
    if isWords:
        # fillna returns a new Series, so the caller's frame is left untouched.
        column = column.fillna('')

    # One space-separated document per visit.
    topk_flat = column.groupby(df['VisitNumber']).apply(' '.join)

    if vec is None:
        vec = CountVectorizer()
        vec.fit(topk_flat)

    wcar = vec.transform(topk_flat).toarray()

    # Plain NumPy arrays: pandas Series no longer provides .reshape().
    # Note that ''.split(' ') is [''], so an empty document counts as 1 token.
    words_count = np.array([len(doc.split(' ')) for doc in topk_flat]).reshape(-1, 1)
    columns = [wcar, words_count]
    if isWords:
        words_len = np.array([len(doc) for doc in topk_flat]).reshape(-1, 1)
        columns.append(words_len)

    return vec, np.column_stack(columns)

def make_submission(clf, X_test, ids, encoder, name='my_neural_net_submission.csv'):
    """Write class probabilities for ``X_test`` to a CSV submission file.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba(X_test)``; each returned
        row must support ``.tolist()``.
    X_test : array-like
        Feature matrix handed to ``clf.predict_proba``.
    ids : iterable
        One identifier per row of ``X_test``; written in the first column.
    encoder : object
        Exposes ``classes_``; each class ``c`` yields a ``TripType_<c>``
        header column.
    name : str
        Output path. An existing file is overwritten.
    """
    y_prob = clf.predict_proba(X_test)
    # Class labels may be integers, so convert before concatenating.
    outCols = ['TripType_' + str(col) for col in encoder.classes_]
    with open(name, 'w') as f:
        f.write('VisitNumber,')
        f.write(','.join(outCols))
        f.write('\n')
        for visit_id, probs in zip(ids, y_prob):
            # str() on the id and a real list keep this working for integer
            # ids and on Python 3, where map() returns an iterator.
            fields = [str(visit_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(name))

In [3]:

def getY(train_df):
    """Build the integer-encoded target vector, one label per visit.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``'VisitNumber'`` and ``'TripType'``.

    Returns
    -------
    dict
        ``'y'``: ``int32`` array of encoded labels, one per ``VisitNumber``
        in group order. ``'encoder'``: the fitted ``LabelEncoder``.
    """
    # first() takes the first non-null TripType within each visit.
    trip_type_per_visit = train_df.groupby('VisitNumber')['TripType'].first()

    label_encoder = LabelEncoder()
    encoded = label_encoder.fit_transform(trip_type_per_visit).astype(np.int32)

    return {'y': encoded, 'encoder': label_encoder}
def preprocessData(df, params=None):
    """Turn raw per-item rows into one sparse feature row per visit.

    Feature blocks, in column order: one-hot weekday, weekend flag, Upc
    token counts, FinelineNumber token counts, DepartmentDescription word
    counts, summed ScanCount.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``VisitNumber``, ``Weekday``, ``Upc``,
        ``FinelineNumber``, ``DepartmentDescription`` and ``ScanCount``.
    params : dict, optional
        Result of an earlier call (typically on the training data). Any of
        ``'dictVec'``, ``'upc_dict'``, ``'upc_vec'``, ``'fln_dict'``,
        ``'fln_vec'`` and ``'words_vec'`` found in it are reused instead of
        being refitted, so the output columns line up with that earlier call.

    Returns
    -------
    dict
        ``'feature_matrix'`` (CSR matrix), the transformers and dictionaries
        that were used, and the description features under both ``'words'``
        and ``'desc_feature'``. ``'encoder'`` is passed through from
        ``params`` (``None`` if absent); this function never builds one.
    """
    fitted = params or {}

    # Weekday: one value per visit, one-hot encoded.
    df_w = df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first()
    df_w = df_w.reset_index()
    # A list of records keeps the rows in frame order, unlike the values of
    # an index-keyed dict.
    weekday_records = df_w[['Weekday']].to_dict('records')
    dictVec = fitted.get('dictVec')
    if dictVec is None:
        dictVec = DictVectorizer()
        dictVec.fit(weekday_records)
    week = dictVec.transform(weekday_records)

    is_wknd = np.array((df_w['Weekday']=='Sunday') | (df_w['Weekday']=='Saturday'))
    is_wknd = is_wknd.reshape(-1,1)

    # topk_dict must be passed by keyword: the third positional parameter of
    # prepareForCountVector is dictCount.
    upc_dict, df_upc = prepareForCountVector(df, 'Upc', topk_dict=fitted.get('upc_dict'))
    upc_vec, upc = getCountVector(df_upc, 'Upc', False, fitted.get('upc_vec'))

    fln_dict, df_fln = prepareForCountVector(df, 'FinelineNumber', topk_dict=fitted.get('fln_dict'))
    fln_vec, fln = getCountVector(df_fln, 'FinelineNumber', False, fitted.get('fln_vec'))

    words_vec, words = getCountVector(df, 'DepartmentDescription', True, fitted.get('words_vec'))

    # Total number of items scanned per visit.
    df_ScanCount = df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum()
    df_ScanCount = df_ScanCount.reset_index()
    scancount = np.array(df_ScanCount.ScanCount).reshape(-1,1)

    feature_matrix = sparse.hstack([week, is_wknd, upc, fln, words, scancount]).tocsr()

    return {
        'feature_matrix': feature_matrix,
        'dictVec': dictVec,
        'encoder': fitted.get('encoder'),
        'upc_vec': upc_vec,
        'upc_dict': upc_dict,
        'fln_vec': fln_vec,
        'fln_dict': fln_dict,
        'words': words,
        'words_vec': words_vec,
        'desc_feature': words,
    }

In [4]:
# Load the raw training rows. Columns, as listed in the original note:
# TripType, VisitNumber, Weekday, Upc, ScanCount, DepartmentDescription, FinelineNumber
train_df = pd.read_csv('train.csv')

In [None]:
# Build the training feature matrix together with the transformers used.
train_data = preprocessData(train_df)
# Encode TripType per visit; the result is a dict with keys 'y' and 'encoder'.
train_y = getY(train_df)

In [None]:
# Step-by-step version of the feature pipeline on the training frame.
# It leaves every intermediate (dictVec, upc_dict, upc_vec, fln_dict,
# fln_vec, words_vec, feature_matrix, ...) available at module level.
params = {}
df = train_df

# Weekday: one value per visit, one-hot encoded.
df_w = df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first()
df_w = df_w.reset_index()
# A list of records keeps the rows in frame order, unlike the values of an
# index-keyed dict.
dict_df_w = df_w[['Weekday']].to_dict('records')
dictVec = DictVectorizer()
week = dictVec.fit_transform(dict_df_w)

is_wknd = np.array((df_w['Weekday']=='Sunday') | (df_w['Weekday']=='Saturday'))
is_wknd = is_wknd.reshape(-1,1)

# topk_dict must be passed by keyword: the third positional parameter of
# prepareForCountVector is dictCount.
upc_dict, df_upc = prepareForCountVector(df, 'Upc', topk_dict=params.get('upc_dict'))
upc_vec, upc = getCountVector(df_upc, 'Upc', False, params.get('upc_vec'))

fln_dict, df_fln = prepareForCountVector(df, 'FinelineNumber', topk_dict=params.get('fln_dict'))
fln_vec, fln = getCountVector(df_fln, 'FinelineNumber', False, params.get('fln_vec'))

words_vec, words = getCountVector(df, 'DepartmentDescription', True, params.get('words_vec'))

# Total number of items scanned per visit.
df_ScanCount = df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum()
df_ScanCount = df_ScanCount.reset_index()
scancount = np.array(df_ScanCount.ScanCount)
scancount = scancount.reshape(-1,1)

# Column order: weekday one-hot, weekend flag, Upc counts, Fineline counts,
# description word counts, scan count.
feature_matrix = sparse.hstack([week, is_wknd, upc, fln, words, scancount]).tocsr()


In [None]:
# Dimensions used to size the network's input and output layers.
num_train, num_features = feature_matrix.shape

# The fitted LabelEncoder lives in the dict returned by getY(); no
# module-level `encoder` is ever defined.
num_classes = len(train_y['encoder'].classes_)
# feature_matrix_std is only created in a later cell, so report the shape of
# the matrix that already exists (the scaled copy has the same shape).
print(feature_matrix.shape)

In [None]:
# Neural-network input: a dense, standardised copy of the feature matrix.
# `scaler` is kept so the same scaling can be applied to other data later.
scaler = StandardScaler()
feature_matrix_std = scaler.fit_transform(feature_matrix.toarray())
sparse_feature_matrix = sparse.csr_matrix(feature_matrix)

# print() with one argument behaves the same on Python 2 and Python 3.
print(feature_matrix_std.shape)
print(sparse_feature_matrix.shape)

In [None]:

# Layer stack as (name, layer class) pairs, in order from input to output:
# input -> dense0 -> dropout -> dense1 -> output.
# The names are the prefixes of the per-layer keyword arguments given to
# NeuralNet (e.g. dense0_num_units, dropout_p, output_num_units).
layers0 = [('input', InputLayer),
           ('dense0', DenseLayer),
           ('dropout', DropoutLayer),
           ('dense1', DenseLayer),
           ('output', DenseLayer)]

In [None]:

# Network built on layers0. Requires num_features and num_classes to be
# defined already.
net0 = NeuralNet(layers=layers0,
                 # Per-layer settings, keyed by the layer names in layers0.
                 input_shape=(None, num_features),
                 dense0_num_units=200,
                 dropout_p=0.5,
                 dense1_num_units=200,
                 output_num_units=num_classes,
                 output_nonlinearity=softmax,
                 # Optimiser: Nesterov momentum with fixed learning rate and momentum.
                 update=nesterov_momentum,
                 update_learning_rate=0.01,
                 update_momentum=0.9,
                 
                 # NOTE(review): eval_size=0.2 is presumably the fraction held out
                 # for validation during fit -- confirm against the nolearn docs.
                 train_split=TrainSplit(eval_size=0.2),
                 verbose=1,
                 max_epochs=1000)

In [None]:
# Grid search over hidden-layer width and momentum.
# 'more_params' hands a group of constructor arguments to the net together,
# so both dense layers change size in lockstep.
param_grid = {
    'more_params': [
        {'dense0_num_units': 100, 'dense1_num_units': 100},
        {'dense0_num_units': 200, 'dense1_num_units': 200},
    ],
    'update_momentum': [0.9, 0.98],
}
gs = GridSearchCV(net0, param_grid, cv=2, refit=False, verbose=4)
# The encoded labels live in the dict returned by getY(); no module-level
# `y` is ever defined.
gs.fit(feature_matrix_std, train_y['y'])

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Train on the standardised features. The encoded labels live in the dict
# returned by getY(); no module-level `y` is ever defined.
net0.fit(feature_matrix_std, train_y['y'])

In [None]:
# Hold-out split of the training data. The original referenced `df_train`
# and `y`, neither of which is defined anywhere in this notebook.
# NOTE(review): splitting the standardised matrix and the encoded labels is
# the assumed intent -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix_std, train_y['y'], test_size=0.33, random_state=0)


In [None]:
# Build the test-set features with the transformers fitted on the training
# data (dictVec, upc_dict/upc_vec, fln_dict/fln_vec, words_vec, scaler), so
# the columns line up with the matrix the network was trained on.
# Everything here uses test_-prefixed names so the training variables are
# not overwritten.
# NOTE(review): the original read 'train.csv' into test_df; 'test.csv' is
# assumed to be the intended file -- confirm the path.
test_df = pd.read_csv('test.csv')

# Weekday: one value per visit, encoded with the training DictVectorizer.
test_df_w = test_df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first()
test_df_w = test_df_w.reset_index()
test_week = dictVec.transform(test_df_w[['Weekday']].to_dict('records'))

# Same weekend definition as for training (Saturday or Sunday).
test_is_wknd = np.array((test_df_w['Weekday']=='Sunday') | (test_df_w['Weekday']=='Saturday'))
test_is_wknd = test_is_wknd.reshape(-1,1)

# Reuse the training dictionaries and vectorizers; index [1] picks the data
# out of each helper's (fitted_object, data) return value.
test_df_upc = prepareForCountVector(test_df, 'Upc', topk_dict=upc_dict)[1]
test_upc = getCountVector(test_df_upc, 'Upc', False, upc_vec)[1]

test_df_fln = prepareForCountVector(test_df, 'FinelineNumber', topk_dict=fln_dict)[1]
test_fln = getCountVector(test_df_fln, 'FinelineNumber', False, fln_vec)[1]

test_words = getCountVector(test_df, 'DepartmentDescription', True, words_vec)[1]

# Total number of items scanned per visit.
test_df_ScanCount = test_df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum()
test_df_ScanCount = test_df_ScanCount.reset_index()
test_scancount = np.array(test_df_ScanCount.ScanCount)
test_scancount = test_scancount.reshape(-1,1)

# Same block order as the training matrix.
test_feature_matrix = sparse.hstack(
    [test_week, test_is_wknd, test_upc, test_fln, test_words, test_scancount]).tocsr()

num_test, num_test_features = test_feature_matrix.shape
assert num_test_features == num_features, "test features do not line up with training features"

# Apply the scaling fitted on the training data; do not refit on test data.
test_feature_matrix_std = scaler.transform(test_feature_matrix.toarray())

