In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, auc, f1_score
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import GridSearchCV
import random
import time
import numpy as np
import pandas as pd

In [None]:
# Load the train/validation arrays and their label arrays from ../data/datasets.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code on load; only use it on files from a trusted source.
train = np.load("../data/datasets/train_tensor.npy", allow_pickle=True)
try:
    val = np.load("../data/datasets/val_tensor.npy", allow_pickle=True)
except FileNotFoundError:
    # This cell originally read "val_tensor.npy.npy" (doubled extension), which
    # does not match the naming of the other three files. Fall back to that
    # name so an on-disk file that really carries the doubled suffix still loads.
    val = np.load("../data/datasets/val_tensor.npy.npy", allow_pickle=True)
train_lab = np.load("../data/datasets/train_label.npy", allow_pickle=True)
val_lab = np.load("../data/datasets/val_label.npy", allow_pickle=True)

In [None]:
# Sanity check: show the shape of the training array next to its label array.
feature_shape, label_shape = train.shape, train_lab.shape
print(feature_shape, "\t", label_shape)

In [None]:
# Repeated 5-fold cross-validation of a random forest, once per seed.
# For every seed the fold with the highest `f1_score` (as returned by `scorer`)
# is selected; its metrics go into `record`, its fitted estimator is dumped to
# ../temp, and its PR / ROC curve points are written to CSV.
#
# NOTE(review): `utils` (f1_score_best, mat2csv) and `dump` are not imported in
# the import cell visible in this notebook. Presumably a project-local `utils`
# module and `joblib.dump` -- confirm and add the imports so that
# Restart Kernel -> Run All works.
# NOTE(review): `record` stores the metrics of the best of five folds per seed,
# which is an optimistic estimate; the across-fold means are only printed.
RECORD_COLUMNS = ['seeds', 'aupr', 'auc', 'precision', 'recall', 'thresh']
seeds = [2007, 2022, 2277, 2622, 2491, 2169, 2061, 2176, 2931, 2994]

# The splitter uses a fixed random_state, so the fold assignment is the same for
# every seed; only the forest's random_state varies. Build it (and the list of
# (train_index, test_index) pairs) once instead of once per iteration.
cv = StratifiedKFold(n_splits=5, random_state = 2022, shuffle = True)
fold_indices = list(cv.split(train, train_lab))


def scorer(clf, X, y):
    """Score a fitted classifier on (X, y) for `cross_validate`.

    Parameters
    ----------
    clf : fitted estimator exposing `predict_proba`.
    X : samples to score.
    y : true labels for `X`.

    Returns
    -------
    dict
        'AUPR' (area under the precision-recall curve), 'AUC' (area under the
        ROC curve), plus 'f1_score', 'bestprecision', 'bestrecall' and
        'threshold' exactly as returned by `utils.f1_score_best`
        (presumably the operating point that maximises F1 -- confirm in utils).
    """
    # Column 1 of predict_proba is used as the positive-class score.
    y_pred = clf.predict_proba(X)[:, 1]
    precision, recall, thresh1 = precision_recall_curve(y, y_pred)
    fpr, tpr, _ = roc_curve(y, y_pred)
    # Named best_f1 (not f1_score) so the sklearn.metrics import is not shadowed.
    best_f1, threshold_b, precision_b, recall_b = utils.f1_score_best(precision, recall, thresh1)

    return {'AUPR': auc(recall, precision), 'AUC': auc(fpr, tpr), 'f1_score': best_f1,
            'bestprecision': precision_b, 'bestrecall': recall_b,
            'threshold': threshold_b}


# Collect one plain dict per seed and build the DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic and leaves a
# duplicated (all-zero) index.
records = []
for seeding in seeds:
    rf = RandomForestClassifier(n_estimators = 100, max_depth = 32, n_jobs = 8,
                                max_features= 0.014, min_samples_leaf= 4,
                                min_samples_split = 6, random_state = seeding,
                                oob_score = True, class_weight = {0:2, 1:1})

    t1 = time.time()
    cv_results = cross_validate(rf, train, train_lab, scoring = scorer, cv = cv, return_estimator = True)
    t2 = time.time()
    print("Training completed with {} minutes!".format(int((t2-t1)/60)))

    # Fold whose held-out score has the highest f1_score.
    best_index = int(np.argmax(cv_results['test_f1_score']))
    records.append({'seeds': seeding,
                    'aupr': cv_results['test_AUPR'][best_index],
                    'auc': cv_results['test_AUC'][best_index],
                    'precision': cv_results['test_bestprecision'][best_index],
                    'recall': cv_results['test_bestrecall'][best_index],
                    'thresh': cv_results['test_threshold'][best_index]})

    print("Mean of AUPR:{}".format(np.mean(cv_results['test_AUPR'])))
    print("Mean of AUC:{}".format(np.mean(cv_results['test_AUC'])))

    # Persist the estimator fitted on the best fold's training part.
    rfclf = cv_results['estimator'][best_index]
    dump(rfclf, "../temp/s1rf_finalrep"+str(seeding)+".joblib")

    # Re-score that estimator on its own held-out fold to export the full curves.
    test_index = fold_indices[best_index][1]
    test = train[test_index]
    test_lab = train_lab[test_index]
    test_pred = rfclf.predict_proba(test)[:, 1]
    precision, recall = precision_recall_curve(test_lab, test_pred)[:2]
    fpr, tpr = roc_curve(test_lab, test_pred)[:2]

    pr = np.array([precision, recall]).T
    fp = np.array([fpr, tpr]).T
    utils.mat2csv(pr, ["precision", "recall"], "../temp/s1pr_finalrf"+str(seeding)+".csv")
    utils.mat2csv(fp, ["fpr", "tpr"], "../temp/s1fp_finalrf"+str(seeding)+".csv")

# One row per seed, in the order of `seeds`.
record = pd.DataFrame(records, columns=RECORD_COLUMNS)

In [None]:
# Display the per-seed results table (one row appended per seed by the repeat loop).
record

In [None]:
# Write the per-seed table to CSV with a fresh 0..n-1 index as the first column.
(
    record
    .reset_index(drop=True)
    .to_csv("../temp/repeats10.csv", header=True, index=True)
)

In [None]:
# Report the mean and np.std (default settings) of each recorded metric
# across the repeats, in the order aupr, auc, precision, recall.
summary_templates = {
    'aupr': "mean of aupr is {} with standard deviation {}",
    'auc': "mean of auc is {} with standard deviation {}",
    'precision': "mean of precision is {} with standard deviation {}",
    'recall': "mean of recall is {} with standard deviation {}",
}
for metric_name, template in summary_templates.items():
    metric_values = record[metric_name]
    print(template.format(np.mean(metric_values), np.std(metric_values)))

In [None]:
# F1 per repeat, computed as 2*P*R / (P + R) from the recorded precision and
# recall, followed by its mean and np.std across repeats.
record = record.reset_index(drop=True)
precision_values = record['precision'].to_numpy()
recall_values = record['recall'].to_numpy()
# Element-wise on the whole columns; same operation order as the per-row formula.
f1 = list(2 * precision_values * recall_values / (precision_values + recall_values))
print("mean of f1 is {} with standard deviation {}".format(np.mean(f1), np.std(f1)))