In [6]:
import sys
import os
import time

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV

import math
import numpy as np
import pandas as pd
from scipy import sparse

import theano
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
from nolearn.lasagne import TrainSplit
from lasagne.updates import nesterov_momentum, adagrad
from lasagne.objectives import categorical_crossentropy, aggregate


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from multiprocessing import Process

%run 'XGBoost_class.ipynb'

In [15]:
def loss_function(pred, y):
    """Return the mean multi-class log loss of ``pred`` against labels ``y``.

    ``pred[i][y[i]]`` is read as the probability assigned to the true class of
    sample ``i``. Each such probability is clipped into ``[1e-15, 1 - 1e-15]``
    before taking the logarithm so that a probability of exactly 0 or 1 does
    not produce an infinite loss.

    Raises ZeroDivisionError when ``y`` is empty.
    """
    lower = 1e-15
    upper = 1 - lower
    n_samples = len(y)
    # Accumulate left to right, starting from a float zero.
    log_likelihood = sum(
        (math.log(max(min(pred[k][y[k]], upper), lower)) for k in range(n_samples)),
        0.)
    return -(log_likelihood / n_samples)

def prepareForCountVector(df, columnName, dictCount=2000, topk_dict=None):
    """Replace a numeric column by tokens for its most frequent values.

    Every value of ``df[columnName]`` that belongs to the vocabulary becomes
    the token ``'<columnName><value formatted with %d>'``; every other value
    (including missing values) becomes ``'<columnName>other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``columnName`` and ``'VisitNumber'``.
    columnName : str
        Column to tokenize; its values are formatted with ``%d``.
    dictCount : int
        Maximum vocabulary size when the vocabulary is built here.
    topk_dict : array-like or None
        Vocabulary to reuse. When None, the ``dictCount`` most frequent
        non-null values of the column are used.

    Returns
    -------
    (topk_dict, df_topk)
        The vocabulary (unchanged when it was passed in) and a two-column
        frame holding the token column followed by ``VisitNumber``.
    """
    if topk_dict is None:
        col = df[columnName].dropna()
        counts = col.value_counts()
        # Slicing past the end is harmless, so no explicit length clamp is needed.
        topk_dict = np.array(counts.iloc[:dictCount].index)
    else:
        print('topk_dict is not none')

    # Build the lookup set once: testing ``x in <ndarray>`` rescans the whole
    # vocabulary for every single row.
    vocabulary = set(np.asarray(topk_dict).tolist())
    other_token = '%sother' % (columnName)
    topk = df[columnName].apply(
        lambda x: '%s%d' % (columnName, x) if x in vocabulary else other_token)

    topk_se = pd.Series(topk, name=columnName)
    df_topk = pd.concat([topk_se, df['VisitNumber']], axis=1)
    return topk_dict, df_topk

def getCountVector(df, columnName, isWords, vec=None):
    """Build per-visit token-count features for one column.

    The values of ``df[columnName]`` are gathered per ``VisitNumber``, joined
    with single spaces into one document per visit, and passed through a
    ``CountVectorizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``columnName`` and ``'VisitNumber'``. It is not modified.
    columnName : str
        Column holding string tokens.
    isWords : bool
        When True, missing values are treated as empty strings and the
        character length of each document is appended as an extra column.
    vec : fitted vectorizer or None
        Vectorizer to reuse. When None, a new ``CountVectorizer`` is fitted on
        the documents built here.

    Returns
    -------
    (vec, ret)
        The vectorizer and a dense 2-D array with one row per visit: the count
        columns, then the number of space-separated pieces in the document,
        then (only if ``isWords``) the document length in characters.
    """
    column = df[columnName]
    if isWords:
        # Fill on a local Series; assigning back into ``df`` would silently
        # change the caller's DataFrame.
        column = column.fillna('')
    tokens_by_visit = column.groupby(df['VisitNumber']).apply(list)
    topk_flat = tokens_by_visit.str.join(' ')

    if vec is None:
        print('building model for vec ......')
        vec = CountVectorizer()
        vec.fit(topk_flat)
    else:
        print('vec is not none')
    wcar = vec.transform(topk_flat).toarray()

    # Reshape the underlying ndarray: pandas Series has no ``reshape`` in
    # current releases.
    words_count = topk_flat.apply(lambda x: len(x.split(' '))).values.reshape(-1, 1)
    if isWords:
        words_len = topk_flat.apply(len).values.reshape(-1, 1)
        ret = np.column_stack([wcar, words_count, words_len])
    else:
        ret = np.column_stack([wcar, words_count])

    return vec, ret

def make_submission(clf, X_test, ids, encoder, prefix):
    """Predict class probabilities with ``clf`` and write them as a submission.

    Calls ``clf.predict_proba(X_test)`` and hands the result to
    ``make_submission_ensemble``, which defines the file name and format.
    """
    make_submission_ensemble(clf.predict_proba(X_test), ids, encoder, prefix)


def make_submission_ensemble(y_prob, ids, encoder, prefix):
    """Write precomputed class probabilities to ``'<prefix>_<epoch millis>'``.

    Parameters
    ----------
    y_prob : iterable of numpy arrays
        One row of class probabilities per id (each row must have ``tolist``).
    ids : iterable
        Identifiers written in the first column, paired with ``y_prob`` rows
        in order.
    encoder : object with ``classes_``
        The header holds one ``TripType_<class>`` column per entry.
    prefix : str
        File name prefix; the current time in milliseconds is appended.

    The file has a header line ``VisitNumber,TripType_...`` followed by one
    comma-separated line per id.
    """
    outCols = ['TripType_' + str(col) for col in encoder.classes_]

    millis = int(round(time.time() * 1000))
    filename = '%s_%d' % (prefix, millis)

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'w') as f:
        f.write('VisitNumber,')
        f.write(','.join(outCols))
        f.write('\n')
        for visit_id, probs in zip(ids, y_prob):
            # A list comprehension keeps this a list concatenation whether
            # ``map`` returns a list or a lazy iterator.
            fields = [str(visit_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(filename))

In [8]:

def getY(train_df):
    df_y = train_df[['VisitNumber', 'TripType']].groupby('VisitNumber').first()
    df_y = df_y.reset_index()

    encoder = LabelEncoder()
    y = encoder.fit_transform(df_y.TripType).astype(np.int32)
    return encoder, y
def _visit_feature_matrix(df, params=None):
    """Shared implementation of preprocessDataTrain / preprocessDataTest.

    With ``params=None`` every vectorizer and top-k vocabulary is fitted on
    ``df``; otherwise the fitted objects stored in ``params`` are reused so
    the resulting columns match those of the training matrix.

    Returns ``(feature_matrix, fitted)`` where ``feature_matrix`` is a CSR
    matrix with one row per VisitNumber and ``fitted`` is the dict of
    vectorizers and vocabularies that were used.
    """
    fitting = params is None

    # One Weekday per visit. Every per-visit block below is derived from a
    # groupby on 'VisitNumber', which is what keeps their rows aligned.
    df_w = df[['VisitNumber', 'Weekday']].groupby('VisitNumber').first()
    df_w = df_w.reset_index()
    # 'records' yields one {'Weekday': value} dict per row, in row order,
    # without transposing the frame or depending on dict iteration order.
    weekday_records = df_w[['Weekday']].to_dict('records')

    if fitting:
        dictVec = DictVectorizer()
        dictVec.fit(weekday_records)
    else:
        dictVec = params['week_dictVec']
    week = dictVec.transform(weekday_records)

    is_wknd = np.array((df_w['Weekday'] == 'Sunday') | (df_w['Weekday'] == 'Saturday'))
    is_wknd = is_wknd.reshape(-1, 1)

    upc_dict, df_upc = prepareForCountVector(
        df, 'Upc', topk_dict=None if fitting else params['upc_dict'])
    upc_vec, upc = getCountVector(
        df_upc, 'Upc', False, None if fitting else params['upc_vec'])

    fln_dict, df_fln = prepareForCountVector(
        df, 'FinelineNumber', topk_dict=None if fitting else params['fln_dict'])
    fln_vec, fln = getCountVector(
        df_fln, 'FinelineNumber', False, None if fitting else params['fln_vec'])

    words_vec, words = getCountVector(
        df, 'DepartmentDescription', True, None if fitting else params['words_vec'])

    # Total ScanCount per visit.
    df_ScanCount = df[['VisitNumber', 'ScanCount']].groupby('VisitNumber').sum()
    df_ScanCount = df_ScanCount.reset_index()
    scancount = np.array(df_ScanCount.ScanCount).reshape(-1, 1)

    # Column order is part of the contract: train and test must stack the
    # blocks identically.
    blocks = [week, is_wknd, upc, fln, words, scancount]
    feature_matrix = sparse.hstack(blocks).tocsr()

    fitted = {
        'week_dictVec': dictVec,
        'upc_vec': upc_vec,
        'upc_dict': upc_dict,
        'fln_vec': fln_vec,
        'fln_dict': fln_dict,
        'words_vec': words_vec
    }
    return feature_matrix, fitted


def preprocessDataTrain(df):
    """Fit all feature encoders on ``df`` and build the per-visit matrix.

    Returns ``(feature_matrix, ret_params)``. ``ret_params`` has the keys
    'week_dictVec', 'upc_vec', 'upc_dict', 'fln_vec', 'fln_dict' and
    'words_vec' and is the argument expected by ``preprocessDataTest``.
    """
    return _visit_feature_matrix(df)


def preprocessDataTest(df, params):
    """Build the per-visit matrix for ``df`` using already-fitted encoders.

    ``params`` is the dict returned by ``preprocessDataTrain``. Returns only
    the CSR feature matrix.

    Raises TypeError when ``params`` is None.
    """
    if params is None:
        raise TypeError('params must be the dict returned by preprocessDataTrain')
    feature_matrix, _ = _visit_feature_matrix(df, params)
    return feature_matrix

In [9]:
# Columns of train.csv: TripType, VisitNumber, Weekday, Upc, ScanCount,
# DepartmentDescription, FinelineNumber.
train_df = pd.read_csv('train.csv')

In [14]:
# Build the per-visit training matrix. `params` holds the fitted vectorizers
# and top-k vocabularies that preprocessDataTest needs to encode other data
# with the same columns.
feature_matrix, params = preprocessDataTrain(train_df)
# One label per visit; `encoder.classes_` gives the original TripType values.
encoder, y = getY(train_df)

building model for vec ......
building model for vec ......
building model for vec ......


In [16]:
# Encode the test set with the encoders fitted on the training data.
# NOTE(review): `params` must still be the dict returned by
# preprocessDataTrain here; a later cell rebinds the same name to the XGBoost
# settings, so re-running this cell after that one fails.
test_df = pd.read_csv('test.csv')
feature_matrix_test = preprocessDataTest(test_df,params)

topk_dict is not none
vec is not none
topk_dict is not none
vec is not none
vec is not none


In [17]:
# Sanity-check that train and test were encoded into the same feature space,
# then collect the test VisitNumbers used as submission ids.
test_shape = feature_matrix_test.shape
train_shape = feature_matrix.shape
num_test, num_features_test = test_shape
num_train, num_features = train_shape
assert num_features_test == num_features

num_classes = len(encoder.classes_)
print(test_shape)
print('%s %s %s' % (num_train, num_features, num_classes))

# One row per distinct VisitNumber of the test set.
df_ids = test_df[['VisitNumber']].groupby('VisitNumber').first().reset_index()
ids = df_ids['VisitNumber']

(95674, 4127)
95674 4127 38


In [18]:

# Network topology as (name, layer class) pairs. The names are the prefixes of
# the per-layer keyword arguments passed to NeuralNet (e.g. `dropout1_p`,
# `hidden1_num_units`). The same list is reused for the level-2 network.
layers = [('input', InputLayer),
           ('dropout1', DropoutLayer), 
           ('hidden1', DenseLayer),
           ('dropout2', DropoutLayer), 
           ('hidden2', DenseLayer), 
           ('dropout3', DropoutLayer),
           ('output', DenseLayer)]

In [19]:

# Level-1 models: a neural net, two tree ensembles and an XGBoost classifier.
# The net takes one input per feature column and has one softmax output per
# TripType class; 20% of its training data is held out for validation.
nn = NeuralNet(layers=layers,
                 objective_loss_function=categorical_crossentropy,
                 input_shape=(None, num_features),
                 
                 dropout1_p=0.15,
                 dropout2_p=0.25,
                 dropout3_p=0.25,
                 
                 hidden1_num_units=1000,
                 hidden2_num_units=500,
                 
                 output_num_units=num_classes,
                 output_nonlinearity=softmax,
                 
                 update=adagrad,     #nesterov_momentum,
                 #update_learning_rate=theano.shared(np.float32(0.01)),   
                 update_learning_rate=theano.shared(np.float32(0.01)),
                 #update_momentum=0.04,
                 
                 train_split=TrainSplit(eval_size=0.2),
                 verbose=1,
                 max_epochs=50)

rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, max_depth=17) 
et = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, max_depth=25)
# NOTE(review): this rebinds the global `params`, which until now held the
# fitted preprocessing objects returned by preprocessDataTrain. process2 reads
# this XGBoost dict through the same global name, so renaming it here alone
# would break that function.
params = {   
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'num_class': num_classes,
              'eta': 0.0825,
              'max_depth': 10,
              'num_round': 2000,
              'subsample':0.85, 
              'colsample_bytree':0.8, 
              'min_child_weight':5.2475,
              'silent':1
    }
# XGBoostClassifier is not imported in this notebook; presumably it is defined
# by the `%run 'XGBoost_class.ipynb'` line in the first cell -- TODO confirm.
clfxgb = XGBoostClassifier(**params)

In [20]:
# Split the training visits in half with a fixed seed. The level-1 models are
# fitted on X_train; their predictions on X_test are stacked into the level-2
# training set in the next cell.
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, y, test_size=0.5, random_state=56)


In [21]:
# Fit each level-1 model on X_train and predict class probabilities for the
# held-out half X_test.
# The neural net gets a standardized dense copy (scaler fitted on X_train
# only); the tree models and XGBoost use the sparse matrices directly.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train.toarray())
X_test_std = scaler.transform(X_test.toarray())
nn.fit(X_train_std, y_train)
y_prob_nn = nn.predict_proba(X_test_std)

rf.fit(X_train, y_train)
y_prob_rf = rf.predict_proba(X_test)

et.fit(X_train, y_train)
y_prob_et = et.predict_proba(X_test)

clfxgb.fit(X_train, y_train)
y_prob_xgb = clfxgb.predict_proba(X_test)

# Level-2 training matrix: the held-out features followed by the four models'
# probability blocks, in the order nn, rf, et, xgb.
train_data_for2 = sparse.hstack((X_test, y_prob_nn, y_prob_rf, y_prob_et, y_prob_xgb))

# Level-2 labels are the labels of the held-out half.
train_data_y_for2 = y_test

# Neural Network with 4647538 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input       4127
  1  dropout1    4127
  2  hidden1     1000
  3  dropout2    1000
  4  hidden2      500
  5  dropout3     500
  6  output        38

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  ------
      1       [36m2.26950[0m       [32m1.44072[0m      1.57525      0.57138  69.06s
      2       [36m1.20048[0m       [32m1.32506[0m      0.90599      0.59768  67.45s
      3       [36m0.92882[0m       1.32776      0.69954      0.60557  59.10s
      4       [36m0.76391[0m       1.35318      0.56453      0.61181  62.92s
      5       [36m0.66312[0m       1.37246      0.48316      0.61392  63.94s
      6       [36m0.59062[0m       1.42153      0.41548      0.61330  58.35s
      7       [36m0.53520[0m       1.43788      0.37221      0.61750  57.36s
      8       [36m



In [22]:
# Standardize the real test features with the scaler fitted on X_train; only
# the neural net consumes this dense copy.
feature_matrix_test_std = scaler.transform(feature_matrix_test.toarray())

# Level-1 predictions on the real test set.
test_data_pred_nn = nn.predict_proba(feature_matrix_test_std)
test_data_pred_rf = rf.predict_proba(feature_matrix_test)
test_data_pred_et = et.predict_proba(feature_matrix_test)
test_data_pred_clfxgb = clfxgb.predict_proba(feature_matrix_test)

# Level-2 test matrix, stacked in the same block order as train_data_for2
# (features, nn, rf, et, xgb).
test_data_for2 = sparse.hstack((feature_matrix_test, test_data_pred_nn, test_data_pred_rf, test_data_pred_et, test_data_pred_clfxgb))

In [23]:
# Standardized dense copies of the level-2 matrices (scaler fitted on the
# level-2 training matrix only); process2 feeds these to its neural net.
scaler2 = StandardScaler()
train_data_for2_std = scaler2.fit_transform(train_data_for2.toarray())
test_data_for2_std = scaler2.transform(test_data_for2.toarray())

In [24]:
def process2(train_data_for2, train_data_for2_std, train_data_y_for2,
             test_data_for2, test_data_for2_std, prefix, layers, ids,
             n_rounds=30):
    """Train the level-2 ensemble and write its averaged predictions.

    For ``n_rounds`` rounds an XGBoost classifier is fitted on the unscaled
    level-2 matrix and a neural net on the standardized one. Each model's
    test probabilities are averaged over the rounds, the two averages are
    averaged with equal weight, and the result is written through
    ``make_submission_ensemble``.

    Parameters
    ----------
    train_data_for2, test_data_for2
        Level-2 matrices used by the XGBoost classifier.
    train_data_for2_std, test_data_for2_std
        Standardized counterparts used by the neural net.
    train_data_y_for2
        Labels for the level-2 training rows.
    prefix : str
        Submission file name prefix.
    layers : list
        Layer specification passed to ``NeuralNet``.
    ids
        Identifiers written to the submission, one per test row.
    n_rounds : int
        Number of fit/predict rounds to average over (default 30).

    Reads the module-level ``params`` (XGBoost settings) and ``encoder``.
    Raises ValueError when ``n_rounds`` is not positive.

    NOTE(review): the same ``nn2`` and ``clfxgb2`` objects are re-fitted in
    every round; whether a repeated ``fit`` starts from scratch or continues
    from the previous state depends on those classes -- confirm.
    """
    if n_rounds < 1:
        raise ValueError('n_rounds must be a positive integer, got %r' % (n_rounds,))

    num_test, num_features = test_data_for2.shape
    # Derive the class count from the label encoder instead of hard-coding it.
    # Touching `encoder` here also makes a missing encoder fail before any
    # training starts rather than when the submission is written.
    num_classes = len(encoder.classes_)
    nn2 = NeuralNet(layers=layers,
                 objective_loss_function=categorical_crossentropy,
                 input_shape=(None, num_features),

                 dropout1_p=0.15,
                 dropout2_p=0.25,
                 dropout3_p=0.25,

                 hidden1_num_units=1000,
                 hidden2_num_units=500,

                 output_num_units=num_classes,
                 output_nonlinearity=softmax,

                 update=adagrad,     #nesterov_momentum,
                 update_learning_rate=theano.shared(np.float32(0.01)),
                 #update_momentum=0.04,

                 train_split=TrainSplit(eval_size=0.2),
                 verbose=1,
                 max_epochs=18)

    clfxgb2 = XGBoostClassifier(**params)

    # Running sums of predicted probabilities over the rounds.
    pred1 = np.zeros((num_test, num_classes), dtype=np.float32)
    pred2 = np.zeros((num_test, num_classes), dtype=np.float32)

    for _ in range(n_rounds):
        clfxgb2.fit(train_data_for2, train_data_y_for2)
        pred1 += clfxgb2.predict_proba(test_data_for2)
        nn2.fit(train_data_for2_std, train_data_y_for2)
        pred2 += nn2.predict_proba(test_data_for2_std)

    pred11 = pred1 / n_rounds
    pred21 = pred2 / n_rounds
    pred = (pred11 + pred21) / 2

    make_submission_ensemble(pred, ids, encoder, prefix)


In [25]:
# Train the level-2 ensemble on the stacked data and write the
# "mega_ensemble4" submission file.
process2(
    train_data_for2,
    train_data_for2_std,
    train_data_y_for2,
    test_data_for2,
    test_data_for2_std,
    "mega_ensemble4",
    layers,
    ids
)

# Neural Network with 4799538 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input       4279
  1  dropout1    4279
  2  hidden1     1000
  3  dropout2    1000
  4  hidden2      500
  5  dropout3     500
  6  output        38

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  ------
      1       [36m1.52741[0m       [32m0.97990[0m      1.55874      0.71041  63.76s
      2       [36m0.74181[0m       [32m0.94257[0m      0.78700      0.72002  59.19s
      3       [36m0.55380[0m       0.96154      0.57595      0.72095  59.32s
      4       [36m0.43594[0m       0.99964      0.43610      0.72137  59.95s
      5       [36m0.36872[0m       1.03500      0.35625      0.72320  59.07s
      6       [36m0.32435[0m       1.06691      0.30401      0.72334  59.25s
      7       [36m0.28617[0m       1.10231      0.25961      0.72446  59.44s
      8       [36m



In [None]:
#import cPickle 
#f = file('train_data_for2.save', 'wb')
#cPickle.dump(train_data_for2, f, protocol=cPickle.HIGHEST_PROTOCOL)
#f.close()
#f1 = file('train_data_for2_std.save', 'wb')
#cPickle.dump(train_data_for2_std, f1, protocol=cPickle.HIGHEST_PROTOCOL)
#f1.close()
#f2 = file('train_data_y_for2.save', 'wb')
#cPickle.dump(train_data_y_for2, f2, protocol=cPickle.HIGHEST_PROTOCOL)
#f2.close()
#f3 = file('test_data_for2.save', 'wb')
#cPickle.dump(test_data_for2, f3, protocol=cPickle.HIGHEST_PROTOCOL)
#f3.close()
#f4 = file('test_data_for2_std.save', 'wb')
#cPickle.dump(test_data_for2_std, f4, protocol=cPickle.HIGHEST_PROTOCOL)
#f4.close()

#f5 = file('ids.save', 'wb')
#cPickle.dump(ids, f5, protocol=cPickle.HIGHEST_PROTOCOL)
#f5.close()



In [None]:
try:
    import cPickle
except ImportError:  # interpreters without a separate cPickle module
    import pickle as cPickle


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The ``with`` block closes the file even when unpickling raises.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


# Restore the level-2 data saved by an earlier session.
train_data_for2 = _load_pickle('train_data_for2.save')
train_data_y_for2 = _load_pickle('train_data_y_for2.save')
test_data_for2 = _load_pickle('test_data_for2.save')
ids = _load_pickle('ids.save')

# The standardized copies are recomputed from the restored matrices rather
# than loaded from disk.
scaler2 = StandardScaler()
train_data_for2_std = scaler2.fit_transform(train_data_for2.toarray())
test_data_for2_std = scaler2.transform(test_data_for2.toarray())

In [None]:
# Run process2 three times, each in its own child process. join() is called
# right after start(), so the runs execute one after another, not in parallel.
for ensemble_prefix in ("mega_ensemble1", "mega_ensemble2", "mega_ensemble3"):
    worker = Process(target=process2,
                     args=(train_data_for2, train_data_for2_std, train_data_y_for2,
                           test_data_for2, test_data_for2_std, ensemble_prefix,
                           layers, ids))
    worker.start()
    worker.join()
    # An exception in the child only shows up as a non-zero exit code; report
    # it so a missing submission file does not go unnoticed.
    if worker.exitcode != 0:
        print('process2 run %r exited with code %r' % (ensemble_prefix, worker.exitcode))



In [None]:
# Blend two earlier submissions: stack them and average the rows that share
# the same VisitNumber.
submission_paths = ('mega_ensemble4_1451246644669',
                    'mega_ensemble_no_upc_fln_1451222117007')
df1, df2 = [pd.read_csv(path, index_col='VisitNumber') for path in submission_paths]

df_concat = pd.concat((df1, df2))
by_row_index = df_concat.groupby(level=0)
df_means = by_row_index.mean()

# Time-stamped output name, in milliseconds since the epoch.
millis = int(round(time.time() * 1000))
filename = 'mega_ensemble_ensemble%d' % millis
df_means.to_csv(filename)

In [11]:
# Stand-alone XGBoost experiment: train on one half of the training visits and
# report the log loss on the other half (same split and seed as the stacking
# cell above).
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, y, test_size=0.5, random_state=56)

# NOTE(review): imported here rather than in the import cell at the top.
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train.tolist())
dtest = xgb.DMatrix(X_test, label=y_test.tolist())

xgb_params = {'max_depth': 8,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.35,
          'colsample_bytree': 1,
          'eta': 0.1}
num_rounds = 2000


# The held-out half serves as the early-stopping evaluation set.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
bstc = xgb.train(xgb_params, dtrain, num_rounds, evals=watchlist, verbose_eval=False, early_stopping_rounds=25)


# NOTE(review): the loss printed below is measured on the same rows that drove
# early stopping, so it is likely somewhat optimistic. Whether predict() uses
# the best iteration or every trained round depends on the installed xgboost
# version -- confirm.
predictv2 = bstc.predict(xgb.DMatrix(X_test))
print(loss_function(predictv2, y_test))

0.717459295878


In [52]:
# Final XGBoost run: train on all training visits (no evaluation set, so all
# `num_rounds` rounds are requested) and write a submission for the test set.
import xgboost as xgb

dtrain = xgb.DMatrix(feature_matrix, label=y)
# The bare string below is never used; it only preserves an earlier set of
# parameters for reference.
"""
 xgb_params = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softprob'
    # scale weight of positive examples
    param['eta'] = 0.05
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 38
    param['eval_metric'] = "mlogloss"
    #param['min_child_weight'] = 2
    param['subsample'] = 0.9
    param['colsample_bytree'] = 0.7
    param['gamma'] = 1
    
num_rounds = 4200
"""
# Same settings as the hold-out experiment cell.
xgb_params = {'max_depth': 8,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': 38,
          'subsample': 0.35,
          'colsample_bytree': 1,
          'eta': 0.1}
num_rounds = 2000

bstc = xgb.train(xgb_params, dtrain, num_rounds, verbose_eval=False)

# Written through make_submission_ensemble because `pred` is already a matrix
# of probabilities rather than a classifier object.
pred = bstc.predict(xgb.DMatrix(feature_matrix_test))
make_submission_ensemble(pred, ids, encoder, "improved")


Wrote submission to file improved_1451748103378.
