In [16]:
import sys
import os
import time

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV

import math
import numpy as np
import pandas as pd
from scipy import sparse

import theano
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
from nolearn.lasagne import TrainSplit
from lasagne.updates import nesterov_momentum, adagrad
from lasagne.objectives import categorical_crossentropy, aggregate


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

%run 'XGBoost_class.ipynb'

In [13]:
def loss_function(pred, y):
    """Return the mean negative log-likelihood (multi-class log loss).

    pred -- indexable of per-sample probability rows; ``pred[i][k]`` is the
            probability assigned to class ``k`` for sample ``i``.
    y    -- indexable of true class indices, one per sample.

    Each probability is clipped to [1e-15, 1 - 1e-15] before the logarithm so
    that a prediction of exactly 0 or 1 cannot produce an infinite loss.
    """
    eps = 1e-15
    ceiling = 1 - eps
    n_samples = len(y)

    log_likelihood = 0.
    for idx in range(n_samples):
        # Probability the model gave to the true class of this sample.
        prob_true = pred[idx][y[idx]]
        clipped = max(min(prob_true, ceiling), eps)
        log_likelihood += math.log(clipped)

    return -(log_likelihood / n_samples)

def prepareForCountVector(df, columnName, dictCount=2000, topk_dict=None):
    """Map a column's values to tokens, collapsing rare values to '<column>other'.

    df         -- DataFrame containing ``columnName`` and 'VisitNumber'.
    columnName -- name of the (numeric) column to tokenise.
    dictCount  -- number of most frequent values to keep when the vocabulary
                  is computed here.
    topk_dict  -- previously computed vocabulary (e.g. the one returned for
                  the training data). When given, it is reused unchanged so
                  that other data is tokenised with the same vocabulary.

    Returns ``(topk_dict, df_topk)`` where ``df_topk`` has two columns:
    ``columnName`` (tokens such as 'Upc4011' or 'Upcother') and 'VisitNumber'.
    Missing values are never part of the vocabulary and therefore map to the
    'other' token.
    """
    # Callers in this notebook pass a stored vocabulary as the third
    # positional argument, i.e. in the ``dictCount`` slot. Accept a collection
    # there and treat it as the vocabulary instead of silently recomputing it.
    if topk_dict is None and hasattr(dictCount, '__iter__'):
        topk_dict = dictCount

    # ``is None`` rather than truthiness: an Index has no unambiguous truth
    # value and an explicitly empty vocabulary should be honoured.
    if topk_dict is None:
        counts = df[columnName].dropna().value_counts()
        # value_counts() is sorted by descending frequency; slicing clamps to
        # the number of distinct values.
        topk_dict = set(counts.iloc[:dictCount].index)

    other_token = '%sother' % (columnName)

    def to_token(value):
        if value in topk_dict:
            return '%s%d' % (columnName, value)
        return other_token

    topk_se = pd.Series(df[columnName].apply(to_token), name=columnName)
    df_topk = pd.concat([topk_se, df['VisitNumber']], axis=1)
    return topk_dict, df_topk

def getCountVector(df, columnName, isWords, vec=None):
    """Build per-visit token-count features for one column.

    df         -- DataFrame with 'VisitNumber' and ``columnName``.
    columnName -- column whose values are joined into one space-separated
                  document per VisitNumber.
    isWords    -- True for a free-text column: missing values are treated as
                  empty strings and the document length in characters is
                  appended as an extra feature.
    vec        -- an already fitted vectorizer to reuse; when None a new
                  CountVectorizer is fitted on the documents built here.

    Returns ``(vec, features)``. ``features`` is a dense 2-D array with one
    row per VisitNumber: the vectorizer counts, then the number of
    space-separated tokens, then (only when ``isWords``) the document length.
    The caller's DataFrame is not modified.
    """
    values = df[columnName]
    if isWords:
        # fillna returns a new Series, so the caller's frame stays untouched.
        values = values.fillna('')

    # One list of values per visit, then one space-joined document per visit.
    tokens_per_visit = values.groupby(df['VisitNumber']).apply(list)
    topk_flat = tokens_per_visit.str.join(' ')

    if vec is None:
        vec = CountVectorizer()
        vec.fit(topk_flat)

    wcar = vec.transform(topk_flat).toarray()

    # Series.reshape is not available in current pandas; reshape the
    # underlying numpy array instead.
    words_count = topk_flat.apply(lambda doc: len(doc.split(' '))).values.reshape(-1, 1)

    if isWords:
        words_len = topk_flat.apply(len).values.reshape(-1, 1)
        ret = np.column_stack([wcar, words_count, words_len])
    else:
        ret = np.column_stack([wcar, words_count])

    return vec, ret

def _write_submission(y_prob, ids, encoder, prefix):
    """Write one CSV row of class probabilities per id to '<prefix>_<millis>'.

    The header is 'VisitNumber' followed by 'TripType_<label>' for every label
    in ``encoder.classes_``. Ids and labels are converted with str(), so
    integer VisitNumbers and integer class labels are accepted.
    Returns the name of the file that was written.
    """
    header = ['VisitNumber'] + ['TripType_%s' % label for label in encoder.classes_]

    # Millisecond timestamp keeps successive submissions from overwriting each other.
    millis = int(round(time.time() * 1000))
    filename = '%s_%d' % (prefix, millis)

    with open(filename, 'w') as f:
        f.write(','.join(header))
        f.write('\n')
        for visit_id, probs in zip(ids, y_prob):
            # str.join only accepts strings, so convert the id explicitly and
            # build a real list (map() is lazy on Python 3).
            fields = [str(visit_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(filename))
    return filename


def make_submission(clf, X_test, ids, encoder, prefix):
    """Predict class probabilities with ``clf`` and write them as a submission file.

    clf     -- fitted classifier exposing ``predict_proba``.
    X_test  -- feature matrix, one row per entry of ``ids``.
    ids     -- identifiers written in the first column.
    encoder -- object whose ``classes_`` supplies the column labels.
    prefix  -- file name prefix; a millisecond timestamp is appended.

    Returns the name of the written file.
    """
    y_prob = clf.predict_proba(X_test)
    return _write_submission(y_prob, ids, encoder, prefix)


def make_submission_ensemble(y_prob, ids, encoder, prefix):
    """Write already computed probabilities ``y_prob`` as a submission file.

    y_prob  -- iterable of numpy rows, one per entry of ``ids``.
    ids, encoder, prefix -- as for ``make_submission``.

    Returns the name of the written file.
    """
    return _write_submission(y_prob, ids, encoder, prefix)

In [14]:

def getY(train_df):
    """Return ``(encoder, y)`` for the training frame.

    The first TripType of every VisitNumber is taken (one label per visit),
    encoded to consecutive class indices with a LabelEncoder and returned as an
    int32 array together with the fitted encoder.
    """
    labels_per_visit = (
        train_df[['VisitNumber', 'TripType']]
        .groupby('VisitNumber')
        .first()
        .reset_index()
    )

    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels_per_visit.TripType)
    return encoder, encoded.astype(np.int32)
def preprocessDataTrain(df):
    """Build the per-visit training feature matrix and the fitted transformers.

    Returns ``(feature_matrix, ret_params)``:
      feature_matrix -- CSR matrix with one row per VisitNumber, made of the
                        vectorised weekday, a weekend flag, Upc counts,
                        FinelineNumber counts, DepartmentDescription counts
                        and the summed ScanCount (in that column order).
      ret_params     -- dict holding the fitted vectorizers and vocabularies
                        so the same encoding can be applied to other data.
    """
    # Weekday: one value per visit, vectorised through a DictVectorizer.
    weekday_per_visit = df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first().reset_index()
    weekday_records = weekday_per_visit[['Weekday']].T.to_dict().values()

    week_vectorizer = DictVectorizer()
    week_vectorizer.fit(weekday_records)
    week = week_vectorizer.transform(weekday_records)

    # Weekend flag as a column vector.
    weekday = weekday_per_visit['Weekday']
    is_wknd = np.array((weekday == 'Sunday') | (weekday == 'Saturday')).reshape(-1, 1)

    # Most frequent Upc / FinelineNumber values, counted per visit.
    upc_dict, upc_frame = prepareForCountVector(df, 'Upc')
    upc_vec, upc_counts = getCountVector(upc_frame, 'Upc', False)

    fln_dict, fln_frame = prepareForCountVector(df, 'FinelineNumber')
    fln_vec, fln_counts = getCountVector(fln_frame, 'FinelineNumber', False)

    # Department description words, counted per visit.
    words_vec, word_counts = getCountVector(df, 'DepartmentDescription', True)

    # Total ScanCount per visit as a column vector.
    scan_totals = df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum().reset_index()
    scancount = np.array(scan_totals.ScanCount).reshape(-1, 1)

    feature_matrix = sparse.hstack(
        [week, is_wknd, upc_counts, fln_counts, word_counts, scancount]
    ).tocsr()

    ret_params = {
        'week_dictVec': week_vectorizer,
        'upc_vec': upc_vec,
        'upc_dict': upc_dict,
        'fln_vec': fln_vec,
        'fln_dict': fln_dict,
        'words_vec': words_vec,
    }
    return feature_matrix, ret_params

def preprocessDataTest(df, params):
    """Build the per-visit feature matrix for ``df`` using fitted transformers.

    df     -- DataFrame with VisitNumber, Weekday, Upc, FinelineNumber,
              DepartmentDescription and ScanCount columns.
    params -- dict with the keys 'week_dictVec', 'upc_vec', 'upc_dict',
              'fln_vec', 'fln_dict' and 'words_vec', as returned by
              preprocessDataTrain.

    Returns a CSR matrix with one row per VisitNumber and the columns in the
    order: weekday, weekend flag, Upc counts, FinelineNumber counts,
    DepartmentDescription counts, summed ScanCount.
    """
    # Weekday: one value per visit, encoded with the stored DictVectorizer.
    weekday_per_visit = df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first().reset_index()
    weekday_records = weekday_per_visit[['Weekday']].T.to_dict().values()
    week = params['week_dictVec'].transform(weekday_records)

    # Weekend flag as a column vector.
    weekday = weekday_per_visit['Weekday']
    is_wknd = np.array((weekday == 'Sunday') | (weekday == 'Saturday')).reshape(-1, 1)

    # NOTE(review): the stored vocabulary is passed as the third positional
    # argument, which is `dictCount` in prepareForCountVector's signature, not
    # `topk_dict` -- confirm the callee treats it as the vocabulary.
    _, upc_frame = prepareForCountVector(df, 'Upc', params['upc_dict'])
    _, upc_counts = getCountVector(upc_frame, 'Upc', False, params['upc_vec'])

    _, fln_frame = prepareForCountVector(df, 'FinelineNumber', params['fln_dict'])
    _, fln_counts = getCountVector(fln_frame, 'FinelineNumber', False, params['fln_vec'])

    # Department description words, counted with the stored vectorizer.
    _, word_counts = getCountVector(df, 'DepartmentDescription', True, params['words_vec'])

    # Total ScanCount per visit as a column vector.
    scan_totals = df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum().reset_index()
    scancount = np.array(scan_totals.ScanCount).reshape(-1, 1)

    feature_matrix = sparse.hstack(
        [week, is_wknd, upc_counts, fln_counts, word_counts, scancount]
    ).tocsr()

    return feature_matrix

In [4]:
# Column list noted by the author (presumably the header of train.csv):
# TripType, VisitNumber, Weekday, Upc, ScanCount, DepartmentDescription, FinelineNumber
train_df = pd.read_csv('train.csv')

In [5]:
# Build the per-visit training features; `params` keeps the fitted
# vectorizers/vocabularies needed to encode the test data the same way.
feature_matrix, params = preprocessDataTrain(train_df)
# Encoded TripType label per visit plus the fitted LabelEncoder.
encoder, y = getY(train_df)

In [6]:
test_df = pd.read_csv('test.csv')
# `params` must be the dict returned by preprocessDataTrain. NOTE(review): a
# later cell rebinds the name `params` to the XGBoost settings, so this cell
# cannot be re-run after that one without rebuilding the training features.
feature_matrix_test = preprocessDataTest(test_df,params)

In [9]:
# Sanity check: train and test matrices must have the same number of columns,
# otherwise models fitted on the training features cannot score the test set.
num_test, num_features_test = feature_matrix_test.shape
num_train, num_features = feature_matrix.shape
assert num_features_test == num_features, \
    "train/test feature counts differ: {} vs {}".format(num_features, num_features_test)
num_classes = len(encoder.classes_)
# print() call form (as used by the submission helpers) so the cell parses on
# both Python 2 and Python 3; the printed text is unchanged.
print(feature_matrix_test.shape)
print("{} {} {}".format(num_train, num_features, num_classes))

(95674, 4127)
95674 4127 38


In [10]:

# Layer stack shared by both NeuralNet instances in this notebook:
# input -> dropout -> dense -> dropout -> dense -> dropout -> dense output.
# Per-layer settings (dropout rates, unit counts) are supplied as
# '<layer name>_<option>' keyword arguments when the NeuralNet is created.
layers = [('input', InputLayer),
           ('dropout1', DropoutLayer), 
           ('hidden1', DenseLayer),
           ('dropout2', DropoutLayer), 
           ('hidden2', DenseLayer), 
           ('dropout3', DropoutLayer),
           ('output', DenseLayer)]

In [17]:

# First-level models: a dropout-regularised neural network, a random forest,
# extra trees and an XGBoost classifier.
nn = NeuralNet(layers=layers,
                 objective_loss_function=categorical_crossentropy,
                 input_shape=(None, num_features),
                 
                 dropout1_p=0.15,
                 dropout2_p=0.25,
                 dropout3_p=0.25,
                 
                 hidden1_num_units=1000,
                 hidden2_num_units=500,
                 
                 output_num_units=num_classes,
                 output_nonlinearity=softmax,
                 
                 update=adagrad,     # alternative kept by the author: nesterov_momentum
                 # Learning rate is wrapped in a theano shared variable.
                 update_learning_rate=theano.shared(np.float32(0.01)),
                 # update_momentum=0.04 is left disabled (presumably only meaningful
                 # with nesterov_momentum -- confirm before re-enabling).
                 
                 # 20% of the data passed to fit() is held out for validation.
                 train_split=TrainSplit(eval_size=0.2),
                 verbose=1,
                 max_epochs=50)

rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, max_depth=17) 
et = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, max_depth=25)
# NOTE(review): this rebinds `params`, which earlier held the preprocessing
# state returned by preprocessDataTrain and is read by
# preprocessDataTest(test_df, params). Re-running that earlier cell after this
# one would pass these XGBoost settings instead.
params = {   
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'num_class': num_classes,
              'eta': 0.0825,
              'max_depth': 10,
              'num_round': 2000,
              'subsample':0.85, 
              'colsample_bytree':0.8, 
              'min_child_weight':5.2475,
              'silent':1
    }
# XGBoostClassifier is not defined in this notebook; presumably it is provided
# by the %run of 'XGBoost_class.ipynb' in the import cell.
clfxgb = XGBoostClassifier(**params)

In [18]:
# Split the training visits in half: the first-level models are fitted on
# X_train and their predictions on X_test become inputs for the second level.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, y, test_size=0.5, random_state=56)


In [None]:
# Level 1: fit every base model on X_train and predict class probabilities
# for the held-out half X_test.

# The neural network is trained on standardized dense features; the scaler is
# fitted on the training half only and reused for every later transform.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train.toarray())
X_test_std = scaler.transform(X_test.toarray())
nn.fit(X_train_std, y_train)
y_prob_nn = nn.predict_proba(X_test_std)

rf.fit(X_train, y_train)
y_prob_rf = rf.predict_proba(X_test)

et.fit(X_train, y_train)
y_prob_et = et.predict_proba(X_test)

clfxgb.fit(X_train, y_train)
y_prob_xgb = clfxgb.predict_proba(X_test)

# Level-2 training data: the held-out features plus the four probability
# blocks. X_test is a scipy sparse matrix, which np.hstack cannot combine with
# dense arrays, so stack with scipy.sparse and keep the result in CSR form
# (the later stage converts it with .toarray()).
train_data_for2 = sparse.hstack((X_test, y_prob_nn, y_prob_rf, y_prob_et, y_prob_xgb)).tocsr()

train_data_y_for2 = y_test

# Neural Network with 4647538 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input       4127
  1  dropout1    4127
  2  hidden1     1000
  3  dropout2    1000
  4  hidden2      500
  5  dropout3     500
  6  output        38

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  ------
      1       2.19536       1.43264      1.53238      0.57410  85.04s
      2       1.18087       1.32627      0.89037      0.59953  80.35s
      3       0.91240       1.31042      0.69627      0.60761  76.29s
      4       0.75473       1.35137      0.55849      0.61214  67.07s
      5       0.65576       1.36196      0.48148      0.61533  59.01s


In [None]:
# Level-1 predictions for the real test set. Only the neural network needs the
# standardized dense features; the tree-based models take the sparse matrix.
feature_matrix_test_std = scaler.transform(feature_matrix_test.toarray())

test_data_pred_nn = nn.predict_proba(feature_matrix_test_std)
test_data_pred_rf = rf.predict_proba(feature_matrix_test)
test_data_pred_et = et.predict_proba(feature_matrix_test)
test_data_pred_clfxgb = clfxgb.predict_proba(feature_matrix_test)

# Level-2 test data must mirror train_data_for2, which stacks the *unscaled*
# features with the four probability blocks. Stack the raw sparse features
# (not the standardized copy) and keep the result sparse, because the next
# stage calls .toarray() on it and applies its own scaler.
test_data_for2 = sparse.hstack((feature_matrix_test, test_data_pred_nn, test_data_pred_rf,
                                test_data_pred_et, test_data_pred_clfxgb)).tocsr()

In [None]:
# Second-level models, trained on the stacked matrix (original features plus
# the first-level probability columns).

# The stacked matrix is wider than the raw feature matrix, so the input layer
# must be sized from the data nn2 is actually fitted on, not from num_features.
num_stacked_features = train_data_for2.shape[1]

nn2 = NeuralNet(layers=layers,
                objective_loss_function=categorical_crossentropy,
                input_shape=(None, num_stacked_features),

                dropout1_p=0.15,
                dropout2_p=0.25,
                dropout3_p=0.25,

                hidden1_num_units=1000,
                hidden2_num_units=500,

                output_num_units=num_classes,
                output_nonlinearity=softmax,

                update=adagrad,
                # Learning rate is wrapped in a theano shared variable.
                update_learning_rate=theano.shared(np.float32(0.01)),

                # 20% of the data passed to fit() is held out for validation.
                train_split=TrainSplit(eval_size=0.2),
                verbose=1,
                max_epochs=18)

# `params` here is the XGBoost settings dict defined alongside clfxgb.
clfxgb2 = XGBoostClassifier(**params)


In [None]:
# Level 2: standardize the stacked features for the neural network (scaler2 is
# fitted on the level-2 training data only) and average the predictions of
# repeated XGBoost / neural-network fits.
# Assumes train_data_for2 / test_data_for2 are scipy sparse matrices, since
# they are converted with .toarray() -- confirm against the cells that build them.
scaler2 = StandardScaler()
train_data_for2_std = scaler2.fit_transform(train_data_for2.toarray())
test_data_for2_std = scaler2.transform(test_data_for2.toarray())

# Accumulators for the summed class probabilities of each model family.
pred1 = np.zeros((num_test, num_classes)).astype(np.float32)
pred2 = np.zeros((num_test, num_classes)).astype(np.float32)

# NOTE(review): both models are refitted on identical data in every round.
# Whether successive fit() calls produce different models (re-initialisation,
# random seeds) depends on XGBoostClassifier / NeuralNet internals that are
# not visible here -- confirm that this averaging is meaningful.
for i in range(30):
    clfxgb2.fit(train_data_for2, train_data_y_for2)
    pred1 += clfxgb2.predict_proba(test_data_for2)
    nn2.fit(train_data_for2_std, train_data_y_for2)
    pred2 += nn2.predict_proba(test_data_for2_std)

# Divide by the number of rounds (30) to turn the sums into means, then blend
# the two model families with equal weight.
pred1 = pred1/30
pred2 = pred2/30
pred = (pred1 + pred2)/2

In [None]:
# One id per visit, obtained with the same groupby('VisitNumber') used to
# build the feature rows so ids and prediction rows line up.
df_ids = test_df[['VisitNumber']].groupby('VisitNumber').first()
df_ids = df_ids.reset_index()
# The original call ended with a stray ':' (a SyntaxError). Ids are passed as
# strings because the writer joins each output row with ','.join.
make_submission_ensemble(pred, df_ids.VisitNumber.astype(str), encoder, "mega_ensemble")
