In [99]:
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from multiprocessing import Process

%run 'XGBoost_class.ipynb'

# Small positive constant at module level. loss_function below defines its own
# local `eps` with the same value, and no other code visible in this notebook
# reads this module-level one.
eps = 1e-15

In [109]:
def OneHotEncoder(col):
    """One-hot encode a column of categorical values into a sparse matrix.

    Parameters
    ----------
    col : array-like
        Either a 1-D sequence of values (for example a pandas Series) or a
        2-D array whose first column holds the value to encode for each row.
        NaNs in floating-point input are replaced by 0 before encoding.

    Returns
    -------
    keymap : dict
        Maps every distinct value found in ``col`` to its column index; the
        indices follow the sorted order returned by ``np.unique``.
    spmat : scipy.sparse.lil_matrix
        Matrix of shape ``(len(col), len(keymap))`` with a 1 in the column
        that corresponds to each row's value.
    """
    values = np.asarray(col)
    # nan_to_num is only meaningful for float/complex data; string or object
    # columns (such as weekday names) are left untouched.
    if np.issubdtype(values.dtype, np.inexact):
        values = np.nan_to_num(values)

    uniques = np.unique(values)
    keymap = dict((key, i) for i, key in enumerate(uniques))

    # Indexing val[0] on a 1-D column picks the first *character* of a string
    # (or fails on a numeric scalar), so no row ever matched a key. Only take
    # the first column when the input really is 2-D.
    row_values = values[:, 0] if values.ndim > 1 else values

    total_pts = len(values)
    num_labels = len(uniques)

    spmat = sparse.lil_matrix((total_pts, num_labels))
    for j, val in enumerate(row_values):
        if val in keymap:
            spmat[j, keymap[val]] = 1
    return keymap, spmat


In [110]:
def loss_function(y, pred):
    """Multi-class logarithmic loss.

    ``y[k]`` is used as an index into ``pred[k]`` to pick the probability
    assigned to the true class of sample ``k``. That probability is clamped
    to ``[1e-15, 1 - 1e-15]`` before taking its natural log, and the negated
    mean of the logs over ``len(y)`` samples is returned.
    """
    floor = 1e-15
    ceiling = 1 - floor
    n_samples = len(y)

    log_likelihood = 0.
    for k in range(n_samples):
        prob_true = pred[k][y[k]]
        # Clamp so that log() never sees 0 and a certain prediction is capped.
        if prob_true > ceiling:
            prob_true = ceiling
        if prob_true < floor:
            prob_true = floor
        log_likelihood += math.log(prob_true)
    return -(log_likelihood / n_samples)
def cv_loop(X, y, model, N=10, random_state=None):
    """Run shuffled N-fold cross-validation and report the mean log loss.

    Parameters
    ----------
    X : indexable feature container
        Must support ``X[index_array]`` row selection.
    y : indexable label container
        Must support ``y[index_array]``; its entries are used by
        ``loss_function`` as column indices into the predicted probabilities.
    model : estimator
        Object exposing ``fit``, ``predict_proba`` and ``get_params``. It is
        re-fitted on every fold.
    N : int
        Number of folds.
    random_state : optional
        Forwarded to ``KFold`` so the shuffle can be made reproducible. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Mean of the per-fold ``loss_function`` scores. Previously the value
        was only printed, so callers could not compare configurations.
    """
    # k fold validation (sklearn.cross_validation API: KFold(n, n_folds=...))
    kf = KFold(len(y), n_folds=N, shuffle=True, random_state=random_state)

    total_score = 0.
    n_done = 0
    for train_index, test_index in kf:
        model.fit(X[train_index], y[train_index])
        preds = model.predict_proba(X[test_index])
        score = loss_function(y[test_index], preds)
        n_done += 1
        # Single-argument print(...) prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print("Score (fold %d/%d): %f" % (n_done, N, score))
        total_score += score

    mean_score = total_score / n_done
    print(model.get_params())
    print("Score: %f" % mean_score)
    return mean_score

In [111]:
def prepareForCountVector(df, columnName, dictCount=2000):
    """Replace the values of one column by string tokens for count-vectorizing.

    The ``dictCount`` most frequent non-null values of ``df[columnName]`` are
    kept; each of them becomes the token ``'<columnName><value as %d>'`` and
    every other value (including missing ones) becomes
    ``'<columnName>other'``. Kept values are formatted with ``%d``, so they
    must be numeric.

    Returns
    -------
    topk_dict : set
        The kept (most frequent) raw values.
    df_topk : pandas.DataFrame
        Two columns: the token column (named ``columnName``) and the
        ``'VisitNumber'`` column copied from ``df``.
    """
    observed = df[columnName].dropna()
    frequency = observed.value_counts()
    keep = min(dictCount, len(observed))
    topk_dict = set(frequency.index[0:keep])

    def to_token(value):
        # Frequent values keep their identity; the rest share one bucket.
        if value in topk_dict:
            return '%s%d' % (columnName, value)
        return '%sother' % (columnName)

    tokens = pd.Series(df[columnName].apply(to_token), name=columnName)
    df_topk = pd.concat([tokens, df['VisitNumber']], axis=1)
    return topk_dict, df_topk

def getCountVector(df, columnName, isWords, vec=None):
    """Build per-visit token-count features from one column.

    Rows of ``df`` are grouped by ``'VisitNumber'``; the values of
    ``columnName`` in each group are joined with single spaces into one
    document per visit, and the documents are passed through a
    ``CountVectorizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'VisitNumber'`` and ``columnName``. It is not modified
        (the earlier version overwrote ``df[columnName]`` when ``isWords``
        was true).
    columnName : str
        Column holding the string tokens.
    isWords : bool
        When true, missing values are treated as empty strings and an extra
        column with each document's character length is appended.
    vec : fitted vectorizer or None
        Object with a ``transform`` method. When ``None`` a new
        ``CountVectorizer`` is created and fitted on the documents.

    Returns
    -------
    vec : the vectorizer that was used
    ret : numpy.ndarray
        Dense count matrix, followed by one column with the number of
        space-separated pieces in each document and, if ``isWords``, one
        column with the document length in characters.
    """
    tokens = df[columnName]
    if isWords:
        # Fill on a local Series so the caller's frame is left untouched.
        tokens = tokens.fillna('')

    per_visit = tokens.groupby(df['VisitNumber']).apply(list)
    topk_flat = per_visit.str.join(' ')

    # Identity check: "not vec" would depend on the vectorizer's truthiness.
    if vec is None:
        vec = CountVectorizer()
        vec.fit(topk_flat)

    wcar = vec.transform(topk_flat).toarray()

    # Reshape the underlying ndarray; Series.reshape is not available in
    # newer pandas releases.
    words_count = topk_flat.apply(lambda x: len(x.split(' '))).values.reshape(-1, 1)
    if isWords:
        words_len = topk_flat.apply(lambda x: len(x)).values.reshape(-1, 1)
        ret = np.column_stack([wcar, words_count, words_len])
    else:
        ret = np.column_stack([wcar, words_count])

    return vec, ret



In [91]:

# Columns of train.csv:
# TripType, VisitNumber, Weekday, Upc, ScanCount, DepartmentDescription, FinelineNumber
train_df = pd.read_csv('train.csv')
num_train = train_df.shape[0]

# One row per visit: the first TripType seen for every VisitNumber.
df_y = (train_df[['VisitNumber', 'TripType']]
        .groupby('VisitNumber')
        .first()
        .reset_index())

# Map the TripType values to consecutive integer class ids.
encoder = LabelEncoder()
encoder.fit(df_y.TripType)
y = encoder.transform(df_y.TripType).astype(np.int32)

In [92]:
# Weekday of each visit (first row per VisitNumber), one-hot encoded.
df_w = (train_df[['VisitNumber', 'Weekday']]
        .groupby('VisitNumber')
        .first()
        .reset_index())
week = OneHotEncoder(df_w.Weekday)

# NOTE(review): despite its name this flag is true only for 'Sunday';
# confirm whether 'Saturday' was meant to be included as well.
is_wknd = np.array(df_w['Weekday'] == 'Sunday').reshape(-1, 1)

# Each helper returns a pair; element [1] is what the later steps consume.
df_upc = prepareForCountVector(train_df, 'Upc')
upc = getCountVector(df_upc[1], 'Upc', False)

df_fln = prepareForCountVector(train_df, 'FinelineNumber')
fln = getCountVector(df_fln[1], 'FinelineNumber', False)

words = getCountVector(train_df, 'DepartmentDescription', True)

# Sum of ScanCount over every row of a visit, as a column vector.
df_ScanCount = (train_df[['VisitNumber', 'ScanCount']]
                .groupby('VisitNumber')
                .sum()
                .reset_index())
scancount = np.array(df_ScanCount.ScanCount).reshape(-1, 1)

In [93]:
# Stack all per-visit feature blocks side by side into one CSR matrix.
feature_matrix = sparse.hstack([
    week[1],
    is_wknd,
    upc[1],
    fln[1],
    words[1],
    scancount,
]).tocsr()
feature_matrix.shape

(95674, 4127)

In [112]:
def runXgBoost(feature_matrix, y):
    """Cross-validate XGBoostClassifier over a small eta x max_depth grid.

    For every combination of eta in (.05, .08, .1) and max_depth in (4, 6, 8)
    a classifier is built from the shared parameter dict and handed to
    cv_loop, which prints the scores. Nothing is returned.
    XGBoostClassifier is not defined in the visible cells; presumably it
    comes from the %run of 'XGBoost_class.ipynb' -- confirm.
    """
    params = {
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'num_class': len(np.unique(y)),
              # eta and max_depth are placeholders; the grid overwrites them.
              'eta': 0.3,
              'max_depth': 5,
              'num_round': 512,
              'silent':1
    }
    grid = [(eta, depth) for eta in (.05, .08, .1) for depth in (4, 6, 8)]
    for eta, depth in grid:
        params.update(eta=eta, max_depth=depth)
        cv_loop(feature_matrix, y, XGBoostClassifier(**params))

In [113]:
def runRandomForest(feature_matrix, y):
    """Cross-validate a 1000-tree RandomForestClassifier for several depths.

    Each max_depth value is evaluated with cv_loop, which prints the scores.
    Nothing is returned.
    """
    depths = (4, 8, 12, 17, 25, 30, 40, 50)
    for depth in depths:
        forest = RandomForestClassifier(n_estimators=1000,
                                        max_depth=depth,
                                        min_samples_split=2)
        cv_loop(feature_matrix, y, forest)
        
      
def runExRandom(feature_matrix, y):
    """Cross-validate a 1000-tree ExtraTreesClassifier for several depths.

    Each max_depth value is evaluated with cv_loop, which prints the scores.
    Nothing is returned.
    """
    depths = (4, 8, 12, 17, 25, 30, 40, 50)
    for depth in depths:
        extra_trees = ExtraTreesClassifier(n_estimators=1000,
                                           max_depth=depth,
                                           min_samples_split=2)
        cv_loop(feature_matrix, y, extra_trees)
 


In [None]:
# Only the random-forest depth sweep runs in this cell; the XGBoost sweep is
# left disabled on the next line.
#runXgBoost(feature_matrix, y)
runRandomForest(feature_matrix, y)

Score (fold 1/10): 2.524722
Score (fold 2/10): 2.517027
Score (fold 3/10): 2.519336
Score (fold 4/10): 2.529887
Score (fold 5/10): 2.511664
Score (fold 6/10): 2.537348
Score (fold 7/10): 2.520378
Score (fold 8/10): 2.522375
Score (fold 9/10): 2.515665
Score (fold 10/10): 2.507850
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 1000, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'gini', 'random_state': None, 'max_features': 'auto', 'max_depth': 4, 'class_weight': None}
Score: 2.520625
Score (fold 1/10): 2.179830
Score (fold 2/10): 2.178269
Score (fold 3/10): 2.186922
Score (fold 4/10): 2.181367
Score (fold 5/10): 2.177397
Score (fold 6/10): 2.182475
Score (fold 7/10): 2.190321
Score (fold 8/10): 2.185065
Score (fold 9/10): 2.178438
Score (fold 10/10): 2.180353
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap

In [None]:
# Run the extra-trees depth sweep on the assembled feature matrix and labels.
runExRandom(feature_matrix, y)

In [96]:

# Run the XGBoost and random-forest sweeps in separate worker processes.
# Both workers are started before either is joined: joining right after
# start() blocks until that worker finishes, which made the two sweeps run
# one after the other. Their printed output may now interleave.
pxgb = Process(target=runXgBoost, args=(feature_matrix, y,))
prf = Process(target=runRandomForest, args=(feature_matrix, y,))

pxgb.start()
prf.start()

pxgb.join()
prf.join()


Score (fold 2/10): 2.161242
Score (fold 3/10): 2.163775
Score (fold 4/10): 2.143860
Score (fold 5/10): 2.153776
Score (fold 6/10): 2.146942
Score (fold 7/10): 2.152576
Score (fold 8/10): 2.156201
Score (fold 9/10): 2.149507
Score (fold 10/10): 2.148776
Score (fold 11/10): 2.144151


Process Process-1:
Traceback (most recent call last):
  File "/Users/atulkumar/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/atulkumar/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-94-9fe93fea214d>", line 18, in runXgBoost
    cv_loop(feature_matrix, y, clfxgb)
  File "<ipython-input-89-1db95ae92445>", line 23, in cv_loop
    print forest.get_params()
NameError: global name 'forest' is not defined
Process Process-2:
Traceback (most recent call last):
  File "/Users/atulkumar/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/atulkumar/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-95-ff83169f6d92>", line 3, in runRandomForest
    forest = RandomForestClassifier(max_depth=max_depth, n_estimator

In [None]:
Score (fold 2/10): 2.160128
Score (fold 3/10): 2.154966
Score (fold 4/10): 2.159780
Score (fold 5/10): 2.145777
Score (fold 6/10): 2.157452
Score (fold 7/10): 2.142206
Score (fold 8/10): 2.150331
Score (fold 9/10): 2.151317
Score (fold 10/10): 2.154482
Score (fold 11/10): 2.146740
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.05, 'num_round': 512, 'max_depth': 4}
Score: 2.152318
Score (fold 2/10): 2.052925
Score (fold 3/10): 2.067954
Score (fold 4/10): 2.062446
Score (fold 5/10): 2.064072
Score (fold 6/10): 2.056952
Score (fold 7/10): 2.067447
Score (fold 8/10): 2.060989
Score (fold 9/10): 2.046080
Score (fold 10/10): 2.060704
Score (fold 11/10): 2.053441
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.05, 'num_round': 512, 'max_depth': 6}
Score: 2.059301
Score (fold 2/10): 2.011427
Score (fold 3/10): 1.995652
Score (fold 4/10): 2.006907
Score (fold 5/10): 2.010782
Score (fold 6/10): 2.009524
Score (fold 7/10): 2.013509
Score (fold 8/10): 2.025687
Score (fold 9/10): 2.016535
Score (fold 10/10): 2.010544
Score (fold 11/10): 2.007745
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.05, 'num_round': 512, 'max_depth': 8}
Score: 2.010831
Score (fold 2/10): 1.817894
Score (fold 3/10): 1.822976
Score (fold 4/10): 1.807480
Score (fold 5/10): 1.820414
Score (fold 6/10): 1.827904
Score (fold 7/10): 1.796771
Score (fold 8/10): 1.810835
Score (fold 9/10): 1.810819
Score (fold 10/10): 1.826726
Score (fold 11/10): 1.806147
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.08, 'num_round': 512, 'max_depth': 4}
Score: 1.814797
Score (fold 2/10): 1.719526
Score (fold 3/10): 1.699885
Score (fold 4/10): 1.707952
Score (fold 5/10): 1.707693
Score (fold 6/10): 1.705791
Score (fold 7/10): 1.712567
Score (fold 8/10): 1.712766
Score (fold 9/10): 1.728956
Score (fold 10/10): 1.707087
Score (fold 11/10): 1.720716
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.08, 'num_round': 512, 'max_depth': 6}
Score: 1.712294
Score (fold 2/10): 1.647543
Score (fold 3/10): 1.660003
Score (fold 4/10): 1.661555
Score (fold 5/10): 1.643998
Score (fold 6/10): 1.670657
Score (fold 7/10): 1.660829
Score (fold 8/10): 1.660725
Score (fold 9/10): 1.668795
Score (fold 10/10): 1.659585
Score (fold 11/10): 1.683898
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.08, 'num_round': 512, 'max_depth': 8}
Score: 1.661759
Score (fold 2/10): 1.649760
Score (fold 3/10): 1.658230
Score (fold 4/10): 1.660598
Score (fold 5/10): 1.646918
Score (fold 6/10): 1.648000
Score (fold 7/10): 1.656087
Score (fold 8/10): 1.638838
Score (fold 9/10): 1.640263
Score (fold 10/10): 1.657341
Score (fold 11/10): 1.668422
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.1, 'num_round': 512, 'max_depth': 4}
Score: 1.652446
Score (fold 2/10): 1.555201
Score (fold 3/10): 1.534963
Score (fold 4/10): 1.558239
Score (fold 5/10): 1.549954
Score (fold 6/10): 1.532547
Score (fold 7/10): 1.563973
Score (fold 8/10): 1.558331
Score (fold 9/10): 1.535773
Score (fold 10/10): 1.551378
Score (fold 11/10): 1.553009
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.1, 'num_round': 512, 'max_depth': 6}
Score: 1.549337
Score (fold 2/10): 1.495246
Score (fold 3/10): 1.495048
Score (fold 4/10): 1.501001
Score (fold 5/10): 1.489708
Score (fold 6/10): 1.494882
Score (fold 7/10): 1.499987
Score (fold 8/10): 1.494674
Score (fold 9/10): 1.505025
Score (fold 10/10): 1.503532
Score (fold 11/10): 1.483944
{'num_class': 38, 'silent': 1, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.1, 'num_round': 512, 'max_depth': 8}
Score: 1.496305
Score (fold 2/10): 2.001295
Score (fold 3/10): 2.019033
Score (fold 4/10): 2.019613
Score (fold 5/10): 2.001016
Score (fold 6/10): 2.005965
Score (fold 7/10): 1.994953
Score (fold 8/10): 2.023502
Score (fold 9/10): 2.011969
Score (fold 10/10): 2.014396
Score (fold 11/10): 2.013169
{'num_class': 38, 'eval_metric': 'mlogloss', 'objective': 'multi:softprob', 'eta': 0.05, 'num_round': 512, 'max_depth': 8}
Score: 2.010491
Score (fold 2/10): 2.014759
Score (fold 3/10): 2.012488
Score (fold 4/10): 2.007637
Score (fold 5/10): 2.006032
Score (fold 6/10): 2.007296