In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
import re
import os
import pandas as pd
from tqdm import tqdm_notebook

import matplotlib.pylab as plt

project_dir="G:/My Drive/CRG_Dropbox/AljoComputer/Temp"
dir_keypoints="../expts/KeypointsAndDescriptors/validKeyPoints"
dir_descriptors="../expts/KeypointsAndDescriptors/descriptors"

def modelfit(alg, dtrain, train_labels, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=train_labels)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=5)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain, train_labels, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain)
    dtrain_predprob = alg.predict_proba(dtrain)[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % accuracy_score(train_labels, dtrain_predictions)
    print "AUC Score (Train): %f" % roc_auc_score(train_labels, dtrain_predprob)
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [2]:
annotation_list = '../expts/annotation_list'
all_annotations = []
with open(annotation_list) as test_list:
    for annotation_file in test_list:
        annotation_file = project_dir + re.sub(".*/20180417_BG57_Midpoint_200micron_30fps/", "/", annotation_file.strip())
        with open(annotation_file) as fin_annotation:
            annotation = json.load(fin_annotation)
            all_annotations.extend(annotation["Annotations"])

print "len(all_annotations):", len(all_annotations)

gt = pd.DataFrame([])
descs = []
for ind, annotation in enumerate(all_annotations):
    parent, file_name = os.path.split(annotation['FrameFile'])
    file_name = os.path.splitext(file_name)[0]
    _, parent_name = os.path.split(parent)
    gt.loc[ind, 'FrameIndex'] = int(annotation['FrameIndexVideo'])
    gt.loc[ind, 'FileName'] = str(file_name.strip())
    gt.loc[ind, 'ParentName'] = str(parent_name.strip())
    for bp in annotation['FrameValueCoordinates']:
        if bp['Name'] == 'LeftDorsalOrgan':
            if (bp['Value']['x_coordinate'] != -1) and (bp['Value']['y_coordinate'] != -1):
                gt.loc[ind, 'LeftDorsalOrgan'] = 1
            else:
                gt.loc[ind, 'LeftDorsalOrgan'] = 0
        elif bp['Name'] == 'RightDorsalOrgan':
            if (bp['Value']['x_coordinate'] != -1) and (bp['Value']['y_coordinate'] != -1):
                gt.loc[ind, 'RightDorsalOrgan'] = 1
            else:
                gt.loc[ind, 'RightDorsalOrgan'] = 0
    desc_file = os.path.join(dir_descriptors, str(parent_name.strip()), str(file_name.strip() + '.csv'))
    try:
        temp = pd.read_csv(desc_file, sep='\t', header=None)
        descs.append(temp.values)
    except:
        temp = pd.DataFrame(np.zeros((1, 128)))
        descs.append(temp.values)
        continue

print "len(descs):", len(descs)

len(all_annotations): 2613
len(descs): 2613


In [3]:
bp = 'LeftDorsalOrgan'
X = gt.index.values
y = gt.loc[:, bp]
train_idx, test_idx, train_labs_idx, test_labs_idx = train_test_split(X, y, test_size=0.30, random_state=42)

train_desc = np.zeros((0,128))
train_labels = np.zeros((0,1))
for i in tqdm_notebook(train_idx):
    train_desc = np.r_[train_desc, descs[i]]
    train_labels = np.r_[train_labels, np.tile(gt.loc[i, bp], (np.shape(descs[i])[0], 1))]


HBox(children=(IntProgress(value=0, max=1829), HTML(value=u'')))




In [5]:
xgb1 = XGBClassifier(
 learning_rate =1,
 n_estimators=1000,
 max_depth=8,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=24,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb1, train_desc, np.ravel(train_labels))

[0]	train-auc:0.729408+0.00144017	test-auc:0.707167+0.00171519
[5]	train-auc:0.833186+0.00142919	test-auc:0.775956+0.00341134
[10]	train-auc:0.86946+0.00153207	test-auc:0.792479+0.00223852
[15]	train-auc:0.894249+0.00273728	test-auc:0.802551+0.00115865
[20]	train-auc:0.912085+0.00192918	test-auc:0.80976+0.00164427
[25]	train-auc:0.92765+0.00178556	test-auc:0.815294+0.00124614
[30]	train-auc:0.939601+0.00131596	test-auc:0.819772+0.00160356
[35]	train-auc:0.949537+0.0015553	test-auc:0.82358+0.00187074
[40]	train-auc:0.957722+0.00198836	test-auc:0.826299+0.00158079
[45]	train-auc:0.964615+0.00179876	test-auc:0.829292+0.00156021
[50]	train-auc:0.970696+0.00171239	test-auc:0.831865+0.00181791
[55]	train-auc:0.97615+0.00173416	test-auc:0.834175+0.00195443
[60]	train-auc:0.980289+0.00169134	test-auc:0.835886+0.00161277
[65]	train-auc:0.984022+0.00159539	test-auc:0.837298+0.00138476
[70]	train-auc:0.987149+0.00113784	test-auc:0.839345+0.00179286
[75]	train-auc:0.989853+0.00105427	test-auc:0.84

[785]	train-auc:1+0	test-auc:0.897876+0.00147374
[790]	train-auc:1+0	test-auc:0.897902+0.00144667
[795]	train-auc:1+0	test-auc:0.89796+0.00146412
[800]	train-auc:1+0	test-auc:0.89802+0.00144556
[805]	train-auc:1+0	test-auc:0.898074+0.00145135
[810]	train-auc:1+0	test-auc:0.89814+0.00144651
[815]	train-auc:1+0	test-auc:0.898221+0.00145217
[820]	train-auc:1+0	test-auc:0.898271+0.00144338
[825]	train-auc:1+0	test-auc:0.898324+0.00144873
[830]	train-auc:1+0	test-auc:0.898411+0.00145741
[835]	train-auc:1+0	test-auc:0.898469+0.00144328
[840]	train-auc:1+0	test-auc:0.898503+0.00144039
[845]	train-auc:1+0	test-auc:0.898574+0.00145895
[850]	train-auc:1+0	test-auc:0.898635+0.00143888
[855]	train-auc:1+0	test-auc:0.898714+0.00144595
[860]	train-auc:1+0	test-auc:0.898774+0.0014318
[865]	train-auc:1+0	test-auc:0.898845+0.00141472
[870]	train-auc:1+0	test-auc:0.898899+0.00142556
[875]	train-auc:1+0	test-auc:0.898984+0.00139797
[880]	train-auc:1+0	test-auc:0.899041+0.00140527
[885]	train-auc:1+0	test

  if diff:



Model Report
Accuracy : 1
AUC Score (Train): 1.000000


TypeError: 'str' object is not callable

In [6]:
!git add -A

The file will have its original line endings in your working directory.
The file will have its original line endings in your working directory.
The file will have its original line endings in your working directory.
The file will have its original line endings in your working directory.
The file will have its original line endings in your working directory.
The file will have its original line endings in your working directory.


In [None]:
!git push

In [None]:
param_test1 = {'max_depth': [4, 8, 16, 32], 'min_child_weight': range(1, 7, 2)}
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=24, scale_pos_weight=1, seed=27), 
                        param_grid = param_test1, scoring='roc_auc', n_jobs=24, iid=False, cv=5)
gsearch1.fit(train, train_labels)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
param_test2 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch2.fit(train[predictors],train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
param_test2b = {
 'min_child_weight':[6,8,10,12]
}
gsearch2b = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2b, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch2b.fit(train[predictors],train[target])

In [None]:
modelfit(gsearch3.best_estimator_, train, predictors)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_

In [None]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=6,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=24,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb2, train, predictors)

In [None]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch4.fit(train[predictors],train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])