In [1]:
import os
from time import time 
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [21]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000,
           num_class=2):
    """Train an XGBoost ``multi:softprob`` model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, the train and test matrices are
        passed to ``xgb.train`` as evaluation sets and early stopping with a
        patience of 20 rounds is enabled. When ``None``, training runs for
        ``num_rounds`` rounds with no evaluation sets.
    feature_names : optional
        Feature names forwarded to every ``DMatrix`` built here (train and
        test get the same names). ``None`` keeps the DMatrix default.
    seed_val : int
        Value stored under the ``'seed'`` training parameter.
    num_rounds : int
        Number of boosting rounds requested from ``xgb.train``.
    num_class : int
        Value stored under the ``'num_class'`` training parameter. The
        default of 2 matches the value this function used to hard-code.

    Returns
    -------
    tuple
        ``(pred_test_y, model, train_seconds, predict_seconds)`` where
        ``pred_test_y`` is whatever ``model.predict`` returns for the test
        matrix, ``model`` is the object returned by ``xgb.train``, and the
        two durations are wall-clock differences of ``time()`` taken around
        the ``xgb.train`` call and the ``predict`` call respectively.
    """
    params = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        # NOTE(review): newer XGBoost releases use 'verbosity' instead of
        # 'silent' -- confirm against the installed version before changing.
        'silent': 1,
        'num_class': num_class,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plist = list(params.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    # Only the presence of test labels decides whether evaluation sets and
    # early stopping are used; everything else is shared by both paths.
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        train_options = {
            'evals': [(xgtrain, 'train'), (xgtest, 'test')],
            'early_stopping_rounds': 20,
        }
    else:
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        train_options = {}

    # The timed region covers the xgb.train call only, not DMatrix construction.
    train_start = time()
    model = xgb.train(plist, xgtrain, num_rounds, **train_options)
    train_end = time()

    test_start = time()
    pred_test_y = model.predict(xgtest)
    test_end = time()
    return pred_test_y, model, (train_end - train_start), (test_end - test_start)

In [3]:
# Source files: the SIFT feature table, and the inceptionV3 prediction file
# whose 'image' column is read below to get the test image list.
input_path = '../../localData/prj3/training_data/sift_features/sift_features.csv'
fix_test_path = '../output/prediction_inceptionV3.csv'

# Keep only the part of each image name before the first '.'.
test_images = [
    image_name.split('.')[0]
    for image_name in pd.read_csv(fix_test_path)['image'].tolist()
]

# The CSV is transposed after loading, so its columns become the rows of total_df.
total_df = pd.read_csv(input_path).transpose()

# 1000 rows labelled 1 followed by 1000 rows labelled 0.
# NOTE(review): this relies on the row order of the transposed table -- confirm.
labels = [1] * 1000 + [0] * 1000
total_df['label'] = labels

In [5]:
# For ensembling, the test set must be the same one used by the inceptionV3
# model: rows whose index appears in test_images form the test split and all
# remaining rows form the training split.
# `.loc` / `.iloc` replace `.ix`, which was removed in pandas 1.0.
is_test_row = total_df.index.isin(test_images)
test_df = total_df.loc[is_test_row]
train_df = total_df.loc[~is_test_row]

# Positional slice of the first 5000 columns (presumably the SIFT features;
# the 'label' column was appended after them and is therefore excluded).
train_X = train_df.iloc[:, :5000]
train_y = train_df['label']

test_X = test_df.iloc[:, :5000]

# Convert the feature frames to CSR sparse matrices and the labels to an array.
train_X = sparse.csr_matrix(train_X.values)
test_X = sparse.csr_matrix(test_X.values)

train_y = np.array(train_y)

In [22]:
# Train for 400 rounds and predict on the test split. No test labels are
# passed, and the second return value (the trained model) is discarded.
preds, _, training_time, predicting_time = runXGB(
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    num_rounds=400,
)

In [23]:
# Report both durations rounded to two decimals.
print(''.join([
    'training time is:',
    str(round(training_time, 2)),
    'seconds;\npredicting time is:',
    str(round(predicting_time, 2)),
    'seconds.',
]))

training time is:50.34seconds;
predicting time is:0.01seconds.


In [24]:
# Wrap the prediction array in a DataFrame with default integer column labels.
out_df = pd.DataFrame(data=preds)

In [25]:
# Preview the first rows of the prediction frame.
out_df.head()

Unnamed: 0,0,1
0,0.930725,0.069275
1,0.757777,0.242223
2,0.432586,0.567414
3,0.802996,0.197004
4,0.065175,0.934825


In [14]:
# Sanity check: dimensions of the sparse training feature matrix.
train_X.shape

(1600, 5000)

In [15]:
# Sanity check: the label array should have one entry per training row.
train_y.shape

(1600,)

In [17]:
# Sanity check: dimensions of the sparse test feature matrix.
test_X.shape

(400, 5000)

In [18]:
# Inspect the training label array.
train_y

array([1, 1, 1, ..., 0, 0, 0])

In [20]:
# Stand-alone XGBoost parameter set kept at notebook scope.
# NOTE(review): no visible cell reads `param` or this `num_rounds`, and
# 'num_class' is 3 here while the runXGB call in this notebook trains with
# 2 classes -- confirm whether this cell is still needed.
param = {
    'objective': 'multi:softprob',
    'eta': 0.1,
    'max_depth': 6,
    'silent': 1,
    'num_class': 3,
    'eval_metric': "mlogloss",
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
}
num_rounds = 400