In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.model_selection import StratifiedKFold

In [6]:
from sklearn.preprocessing import Normalizer

In [7]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as rf
import xgboost as xgb

In [8]:
from sklearn.model_selection import GridSearchCV


In [9]:
from sklearn.metrics import make_scorer, matthews_corrcoef


In [10]:
from sklearn.metrics import precision_score, f1_score, accuracy_score

In [11]:
model_path = 'doc_sent_level'

In [12]:
mcc_scorer = make_scorer(matthews_corrcoef)

In [13]:
def GridSearch_table_(grid_clf, param_name,
                          num_results=15,
                          negative=False,
                          graph=True,
                          display_all_params=True):
                        
    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    if negative:
        clf_score = -grid_clf.best_score_
    else:
        clf_score = grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')

    best_row = scores_df.iloc[0, :]
    if negative:
        best_mean = -best_row['mean_test_score']
    else:
        best_mean = best_row['mean_test_score']
    best_stdev = best_row['std_test_score']
    best_param = best_row['param_' + param_name]

    # display the top 'num_results' results
    # =====================================
    display(pd.DataFrame(cv_results) \
            .sort_values(by='rank_test_score').head(num_results))


## Get data

In [16]:
path_data = './tcdc/'+model_path+'.csv'
df_raw = pd.read_csv(path_data, header=None)
df_raw.head()

FileNotFoundError: [Errno 2] File ./tcdc/doc_sent_level.csv does not exist: './tcdc/doc_sent_level.csv'

In [32]:
X = df_raw.values[:, 1:]
X.shape

y = df_raw.values[:, 0]
y.shape

(492, 200)

# 5-CV

In [34]:
skf = StratifiedKFold(n_splits=5)

In [35]:
iter_idx=0
for train_index, test_index in skf.split(X, y):
    iter_idx+=1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    transformer = Normalizer().fit(X_train)
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)
    
    clf = xgb.XGBClassifier(reg_lambda=1, alpha=1, tree_method='auto', 
                            booster='dart', normalize_type ='forest', rate_drop=0.0)
    clf.fit(X_train, y_train)
    
    acc_tr = clf.score(X_train, y_train)
    acc_te = clf.score(X_test, y_test)
    acc_base = 1-(y_test.sum()/len(y_test))
    
    print('CV:{}\t ACC_train:{:.2f}%\tACC_te:{:.2f}%\tDataDist:{:.2f}'.format(iter_idx, acc_tr*100, acc_te*100, acc_base*100))
    print('-'*40)
    

CV:1	 ACC_train:92.88%	ACC_te:62.63%	DataDist:65.66
----------------------------------------
CV:2	 ACC_train:93.64%	ACC_te:68.69%	DataDist:66.67
----------------------------------------
CV:3	 ACC_train:93.91%	ACC_te:65.31%	DataDist:66.33
----------------------------------------
CV:4	 ACC_train:92.39%	ACC_te:75.51%	DataDist:66.33
----------------------------------------
CV:5	 ACC_train:93.15%	ACC_te:70.41%	DataDist:66.33
----------------------------------------


## Grid search

In [36]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=1, stratify=y)

In [39]:
transformer = Normalizer().fit(X)
X = transformer.transform(X)

In [40]:
1 - y_test.sum()/len(y_test)

0.6666666666666667

In [41]:
parameters = {'subsample':[0.68], 'reg_lambda':[2.5]}

In [42]:
gscv = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=parameters,
                   n_jobs=32, return_train_score=True, cv=5, 
                   verbose=1)

In [43]:
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done  58 out of  60 | elapsed:    1.8s remaining:    0.1s
[Parallel(n_jobs=32)]: Done  60 out of  60 | elapsed:    1.8s finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

MCC

In [44]:
GridSearch_table_(gscv, param_name='subsample', num_results=5, display_all_params=False, negative=False,)

best parameters: {'reg_alpha': 0.0, 'reg_lambda': 2.0, 'subsample': 0.8}
best score:      0.66660 (+/-0.02548)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_alpha,param_reg_lambda,param_subsample,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
5,0.676099,0.112198,0.002852,0.000711,0.0,2.0,0.8,"{'reg_alpha': 0.0, 'reg_lambda': 2.0, 'subsamp...",0.683544,0.64557,...,0.666602,0.025479,1,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
10,0.697379,0.070571,0.001542,4.8e-05,0.5,2.0,1.0,"{'reg_alpha': 0.5, 'reg_lambda': 2.0, 'subsamp...",0.696203,0.696203,...,0.664038,0.033597,2,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
7,0.636311,0.095367,0.00215,0.000436,0.5,1.0,0.8,"{'reg_alpha': 0.5, 'reg_lambda': 1.0, 'subsamp...",0.658228,0.64557,...,0.661603,0.012121,3,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
9,0.626579,0.061098,0.002048,0.000425,0.5,1.5,0.8,"{'reg_alpha': 0.5, 'reg_lambda': 1.5, 'subsamp...",0.658228,0.658228,...,0.661603,0.009102,3,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
4,0.744572,0.15,0.002381,0.000741,0.0,2.0,1.0,"{'reg_alpha': 0.0, 'reg_lambda': 2.0, 'subsamp...",0.670886,0.658228,...,0.65125,0.032456,5,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
0,0.934369,0.025107,0.003485,0.000376,0.0,1.0,1.0,"{'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsamp...",0.64557,0.670886,...,0.648718,0.037923,6,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
11,0.663162,0.053306,0.001726,0.00038,0.5,2.0,0.8,"{'reg_alpha': 0.5, 'reg_lambda': 2.0, 'subsamp...",0.670886,0.594937,...,0.641285,0.026115,7,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
2,0.949319,0.038288,0.003205,0.000517,0.0,1.5,1.0,"{'reg_alpha': 0.0, 'reg_lambda': 1.5, 'subsamp...",0.658228,0.658228,...,0.641155,0.021453,8,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
3,0.740274,0.080419,0.002641,0.000571,0.0,1.5,0.8,"{'reg_alpha': 0.0, 'reg_lambda': 1.5, 'subsamp...",0.607595,0.658228,...,0.63606,0.037279,9,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473
1,0.763498,0.085045,0.003597,0.001145,0.0,1.0,0.8,"{'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsamp...",0.64557,0.64557,...,0.636027,0.023842,10,0.94586,0.93949,0.952229,0.952381,0.961905,0.950373,0.007473


In [None]:
result = pd.DataFrame(gscv.cv_results_)

result.T.to_csv('./tcdc/'+model_path+'_result.csv')