In [1]:
import math
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import pickle
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV

In [13]:
%run 'methods.ipynb'
%run 'XGBoost_class.ipynb'

In [24]:
df_train = pd.read_csv('train.csv')
labels, y = getLabeled(df_train['TripType'])


num_classes = len(np.unique(y))

params = {    
              'subsample':0.8,
              'colsample_bytree': 0.7,
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'num_class': num_classes,
              'silent':1,
              'eta': 0.3,
              'max_depth': 8,
              'min_child_weight': 1,
              'num_round': 1024
}

#k fold validation
kf= KFold(len(y), n_folds=10, shuffle=True, random_state=None)

#for num_round in [1024]:
for eta in [0.03, 0.06, 0.1]:
    for max_depth in [4, 5, 6, 7, 8]:
        avg = 0
        for train_index, test_index in kf:
            X_train, X_test = df_train.iloc[train_index] , df_train.iloc[test_index] 
                
            y_train = getY(X_train, labels)
            y_test = getY(X_test, labels)
            #features_train, features_test = generateTestTrainFeature(X_train, X_test)
                    
            train_other = getFeaturesTrain(X_train)
            upc_dict, upc_vec, train_upc = getCountVectorTrain(X_train,X_test, 'Upc')
            fileno_dict, fileno_vec, train_fine = getCountVectorTrain(X_train,X_test, 'FinelineNumber')
            
            test_other = getFeaturesTest(X_test)
            test_upc = getCountVectorTest(X_test, 'Upc', upc_dict, upc_vec)
            test_fine = getCountVectorTest(X_test, 'FinelineNumber', fileno_dict, fileno_vec)
            
            vec_desc, train_desc = processDescTrain(X_train, True)
            test_desc = processDescTest(X_test, True, vec_desc)

            forest = RandomForestClassifier(max_depth=17, n_estimators=50, random_state=0) 
            forest.fit(train_desc, y_train)
            y_train_desc_rf = forest.predict_proba(train_desc)
            y_test_desc_rf = forest.predict_proba(test_desc)
            
            train_desc_rf = pd.DataFrame(y_train_desc_rf)
            train_desc_rf.set_index(np.unique(train_desc.index), inplace=True)
            train_desc_rf.index.name = 'VisitNumber'
            
            test_desc_rf = pd.DataFrame(y_test_desc_rf)
            test_desc_rf.set_index(np.unique(test_desc.index), inplace=True)
            test_desc_rf.index.name = 'VisitNumber'
    
            features_train = pd.concat([train_other, train_upc, train_fine, train_desc_rf], axis=1) 
            features_test = pd.concat([test_other, test_upc, test_fine, test_desc_rf], axis=1) 
            
            params['eta'] = eta
            params['max_depth'] = max_depth
                
            clfxgb = XGBoostClassifier(**params)
            clfxgb.fit(features_train , y_train )
            y_pred = clfxgb.predict_proba(features_test)
            avg += loss_function(y_test, y_pred)

        print eta, max_depth,"=>", avg/len(kf)

0.03 4 => 3.29110023383
0.03 5 => 3.29185186442
0.03 6 => 3.29170321854
0.03 7 => 3.29132234063
0.03 8 => 3.29064725825
0.06 4 => 3.20329943795
0.06 5 => 3.20720585765
0.06 6 => 3.21092431128
0.06 7 => 3.21197361246
0.06 8 => 3.21303930247
0.1 4 => 3.20088396599
0.1 5 => 3.21163508682
0.1 6 => 3.21900733777
0.1 7 => 3.22261530458
0.1 8 => 3.22669499453


In [17]:
df_train = pd.read_csv('train.csv')
labels, y = getLabeled(df_train['TripType'])
df_test = pd.read_csv('test.csv')
upc_dict, upc_vec, features_train = getCountVectorTrain(df_train,df_test, 'FinelineNumber')
features_test = getCountVectorTest(df_test, 'FinelineNumber', upc_dict, upc_vec)

y_train = getY(df_train, labels)

params['eta'] = 0.1
params['max_depth'] = 8
                
clfxgb = XGBoostClassifier(**params)
clfxgb.fit(features_train , y_train )

In [18]:
generateSubmission(features_test, clfxgb, labels, "xgb_filelineno")

In [25]:
df_train = pd.read_csv('train.csv')
labels, y = getLabeled(df_train['TripType'])

y_train = getY(df_train, labels)

train_other = getFeaturesTrain(X_train)
upc_dict, upc_vec, train_upc = getCountVectorTrain(X_train,X_test, 'Upc')
fileno_dict, fileno_vec, train_fine = getCountVectorTrain(X_train,X_test, 'FinelineNumber')
vec_desc, train_desc = processDescTrain(X_train, True)
            
            

In [45]:
train_desc.head()

Unnamed: 0_level_0,14,6x,access,accessories,aids,and,apparel,automotive,bakery,bath,...,supp,supplies,swimwear,tobacco,toys,wear,wine,wireless,words_count,words_len
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,18
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,13
8,0,0,0,1,0,2,0,0,0,0,...,1,1,0,0,0,0,0,0,17,105
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,27
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,5,35


In [32]:
df_train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017
