In [1]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
import re
import os
import pandas as pd
from tqdm import tqdm_notebook
import json

import matplotlib.pylab as plt

project_dir="G:/My Drive/CRG_Dropbox/AljoComputer/Temp"
dir_keypoints="../expts/KeypointsAndDescriptors/validKeyPoints"
dir_descriptors="../expts/KeypointsAndDescriptors/descriptors"

def modelfit(alg, dtrain, train_labels, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=train_labels)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=25)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain, train_labels, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain)
    dtrain_predprob = alg.predict_proba(dtrain)[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % accuracy_score(train_labels, dtrain_predictions)
    print "AUC Score (Train): %f" % roc_auc_score(train_labels, dtrain_predprob)
                    
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    


In [2]:
annotation_list = '../expts/annotation_list'
all_annotations = []
with open(annotation_list) as test_list:
    for annotation_file in test_list:
        annotation_file = project_dir + re.sub(".*/20180417_BG57_Midpoint_200micron_30fps/", "/", annotation_file.strip())
        with open(annotation_file) as fin_annotation:
            annotation = json.load(fin_annotation)
            all_annotations.extend(annotation["Annotations"])

print "len(all_annotations):", len(all_annotations)

gt = pd.DataFrame([])
descs = []
for ind, annotation in enumerate(all_annotations):
    parent, file_name = os.path.split(annotation['FrameFile'])
    file_name = os.path.splitext(file_name)[0]
    _, parent_name = os.path.split(parent)
    gt.loc[ind, 'FrameIndex'] = int(annotation['FrameIndexVideo'])
    gt.loc[ind, 'FileName'] = str(file_name.strip())
    gt.loc[ind, 'ParentName'] = str(parent_name.strip())
    for bp in annotation['FrameValueCoordinates']:
        if bp['Name'] == 'LeftDorsalOrgan':
            if (bp['Value']['x_coordinate'] != -1) and (bp['Value']['y_coordinate'] != -1):
                gt.loc[ind, 'LeftDorsalOrgan'] = 1
            else:
                gt.loc[ind, 'LeftDorsalOrgan'] = 0
        elif bp['Name'] == 'RightDorsalOrgan':
            if (bp['Value']['x_coordinate'] != -1) and (bp['Value']['y_coordinate'] != -1):
                gt.loc[ind, 'RightDorsalOrgan'] = 1
            else:
                gt.loc[ind, 'RightDorsalOrgan'] = 0
    desc_file = os.path.join(dir_descriptors, str(parent_name.strip()), str(file_name.strip() + '.csv'))
    try:
        temp = pd.read_csv(desc_file, sep='\t', header=None)
        descs.append(temp.values)
    except:
        temp = pd.DataFrame(np.zeros((1, 128)))
        descs.append(temp.values)
        continue

print "len(descs):", len(descs)

len(all_annotations): 2613
len(descs): 2613


In [3]:
bp = 'LeftDorsalOrgan'
X = gt.index.values
y = gt.loc[:, bp]
train_idx, test_idx, train_labs_idx, test_labs_idx = train_test_split(X, y, test_size=0.30, random_state=42)

train_desc = np.zeros((0,128))
train_labels = np.zeros((0,1))
for i in tqdm_notebook(train_idx):
    train_desc = np.r_[train_desc, descs[i]]
    train_labels = np.r_[train_labels, np.tile(gt.loc[i, bp], (np.shape(descs[i])[0], 1))]


HBox(children=(IntProgress(value=0, max=1829), HTML(value=u'')))




In [4]:
xgb1 = XGBClassifier(
 learning_rate=0.5,
 n_estimators=500,
 max_depth=8,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=20,
 scale_pos_weight=0.64,
 seed=27)

modelfit(xgb1, train_desc, np.ravel(train_labels))

# xgb_param = xgb1.get_xgb_params()
# xgtrain = xgb.DMatrix(train_desc, label=np.ravel(train_labels))
# cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
#                   metrics='auc', early_stopping_rounds=50, verbose_eval=25)
# xgb1.set_params(n_estimators=cvresult.shape[0])

# xgb1.fit(train_desc, np.ravel(train_labels), eval_metric='auc')

# #Predict training set:
# dtrain_predictions = xgb1.predict(train_desc)
# dtrain_predprob = xgb1.predict_proba(train_desc)[:,1]

# #Print model report:
# print "\nModel Report"
# print "Accuracy : %.4g" % accuracy_score(np.ravel(train_labels), dtrain_predictions)
# print "AUC Score (Train): %f" % roc_auc_score(np.ravel(train_labels), dtrain_predprob)

# feat_imp = pd.Series(xgb1.get_booster().get_fscore()).sort_values(ascending=False)
# feat_imp.plot(kind='bar', title='Feature Importances')
# plt.ylabel('Feature Importance Score')

[0]	train-auc:0.888511+0.00090023	test-auc:0.74367+0.00351358
[25]	train-auc:1+0	test-auc:0.860424+0.00162565
[50]	train-auc:1+0	test-auc:0.874963+0.00140374
[75]	train-auc:1+0	test-auc:0.881973+0.00118579
[99]	train-auc:1+0	test-auc:0.885958+0.00144475


  if diff:



Model Report
Accuracy : 1
AUC Score (Train): 1.000000


TypeError: 'str' object is not callable

In [None]:
!git add -A

In [None]:
!git push

In [None]:
param_test1 = {'max_depth': [4, 8, 16, 32], 'min_child_weight': range(1, 7, 2)}
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=24, scale_pos_weight=1, seed=27), 
                        param_grid = param_test1, scoring='roc_auc', n_jobs=24, iid=False, cv=5)
gsearch1.fit(train, train_labels)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
param_test2 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch2.fit(train[predictors],train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
param_test2b = {
 'min_child_weight':[6,8,10,12]
}
gsearch2b = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2b, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch2b.fit(train[predictors],train[target])

In [None]:
modelfit(gsearch3.best_estimator_, train, predictors)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_

In [None]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=6,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=24,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb2, train, predictors)

In [None]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch4.fit(train[predictors],train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=24,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])