In [None]:
###############################################################################################################################
#
#   Copyright © 2022 Center for Agricultural Systems Biology
#   Authorships: Ruengsrichaiya B., Nukoolkit C., Kalapanulak S. and Saithong T., 
#   (202x) Plant-DTI: Extending the landscape of TF protein and DNA interaction in plants by a machine learning-based approach. 
#   xxxxx., xx, xxx. (in preparation).
#   Contact: bhukrit.r@mail.kmutt.ac.th
#
###############################################################################################################################
#
#   Pseudocode for Plant-DTI model construction with a Random Forest classifier using scikit-learn:
#   Class labels are 1 (interacted) and 0 (not interacted).
#   Random within (RW) models are available for TFBS lengths from 7 to 15 bp.
#   Random pairs (RP) models are available for TFBS lengths from 7 to 14 bp.
#
###############################################################################################################################
#
#   Random pairs models (RP)
#
###############################################################################################################################

INPUT: Features of train data, features of test data, class labels of train data, and class labels of test data
OUTPUT: Plant-DTI model and its performances.
    
BEGIN
FOR i = 7 to 14 do
    Read feature of train data
    Read class label of train data
    Read features of test data
    Read class label of test data

    Train model using random forest classifier from scikit learn with number of trees = 100
    Save trained model in .sav file

    Predict features of test data by using trained random forest model
    Evaluate model performance by comparing the predicted results with the class labels of test data
    Collect model performance results
END FOR loop
Write summary model performance in .csv file
END

##############################################################################################################################
#
#   Random within models (RW)
#
##############################################################################################################################

INPUT: Features of train data, features of test data, class labels of train data, and class labels of test data
OUTPUT: Plant-DTI model and its performances.
    
BEGIN
FOR i = 7 to 15 do
    read features of train data
    Read class label of train data
    Read features of test data
    Read class labels of test data

    Train model using random forest classifier from scikit learn with number of trees= 100
    Save trained model in .sav file

    Predict features of test data by using trained random forest model
    Evaluate model performance by comparing the predicted results with the class labels of test data
    Collect model performance results
END FOR loop
Write summary model performance in .csv file
END

In [1]:
#Import all required library
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt
import sklearn
import pickle

from textwrap import wrap
import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,  classification_report


from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

class color:
    """ANSI escape sequences for colouring and styling terminal output.

    Wrap text as ``color.<NAME> + text + color.END``; ``END`` resets every
    attribute that was switched on before it.
    """

    # Reset sequence
    END = '\033[0m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Standard-intensity foreground colour
    DARKCYAN = '\033[36m'

    # Bright foreground colours
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    

In [4]:
#Code for model construction, Random Forest with 100 trees: Random within (RW)
#
# For each TFBS length (7-15 bp) this cell:
#   1. reads the 70% hold-out training split and the 30% hold-out test split,
#   2. fits a 100-tree RandomForestClassifier on the training split,
#   3. pickles the fitted model to OUTPUT/model_RW_len<i>.sav,
#   4. scores the model on the test split,
# then writes one summary row per TFBS length to OUTPUT/model_RW_performance.csv.

#Column order of the performance summary table
metric_columns = ['TN','FP','FN','TP','Accuracy','Precision', 'Recall (Sensitivity)',
                  'F1-score','Specificity', 'NPV']

#TFBS lengths covered by the RW models (7-15 bp, inclusive)
tfbs_lengths = range(7, 16)

#One dict of metrics per TFBS length; the DataFrame is built once after the loop
#instead of being grown with pd.concat on every iteration.
report_rows = []

for i in tfbs_lengths:
    #read train_data
    X_train = pd.read_csv('Data/train_data/70_holdout_X_train_len' + str(i) +  '_RW.csv')
    Y_train = pd.read_csv('Data/train_data/70_holdout_Y_train_len' + str(i) + '_RW.csv')

    #read test_data
    X_test = pd.read_csv('Data/test_data/30_holdout_X_test_len' + str(i) + '_RW.csv')
    Y_test = pd.read_csv('Data/test_data/30_holdout_Y_test_len' + str(i) + '_RW.csv')

    #Use a 1-D label vector for scoring, mirroring Y_train['class'] used for fitting.
    #Falls back to squeezing a single-column frame if the column is named differently.
    y_true = Y_test['class'] if 'class' in Y_test.columns else Y_test.squeeze('columns')

    #Model training data with RF classifier (100 trees)
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=100, oob_score=True)
    rf_clf = rf_clf.fit(X_train, Y_train['class'])

    #save model .sav (context manager guarantees the file handle is flushed and closed)
    with open('OUTPUT/model_RW_len' + str(i) + '.sav', 'wb') as model_file:
        pickle.dump(rf_clf, model_file)

    #Predict test dataset
    Y_pred = rf_clf.predict(X_test)

    #collect model performance (confusion matrix, accuracy, precision, sensitivity, f1-score, specificity, npv)
    #labels=[0, 1] pins the matrix to 2x2 so ravel() always yields TN, FP, FN, TP
    tn, fp, fn, tp = confusion_matrix(y_true, Y_pred, labels=[0, 1]).ravel()

    report_rows.append({
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'TP': tp,
        'Accuracy': accuracy_score(y_true, Y_pred),
        #metrics of the positive class (1 = interacted)
        'Precision': precision_score(y_true, Y_pred, pos_label=1),
        'Recall (Sensitivity)': recall_score(y_true, Y_pred, pos_label=1),
        'F1-score': f1_score(y_true, Y_pred, pos_label=1),
        #specificity = recall of the negative class; NPV = precision of the negative class
        'Specificity': recall_score(y_true, Y_pred, pos_label=0),
        'NPV': precision_score(y_true, Y_pred, pos_label=0),
    })

#Index the summary by TFBS length so every row is identifiable
#(previously every row carried index 0).
report_summary_df = pd.DataFrame(report_rows, columns=metric_columns,
                                 index=pd.Index(tfbs_lengths, name='TFBS_length'))

report_summary_df.to_csv('OUTPUT/model_RW_performance.csv')
report_summary_df



Unnamed: 0,TN,FP,FN,TP,Accuracy,Precision,Recall (Sensitivity),F1-score,Specificity,NPV
0,2872,17,11,2877,0.995153,0.994126,0.996191,0.995157,0.994116,0.996185


In [5]:
#Code for model construction, Random Forest with 100 trees: Random pairs (RP)
#
# For each TFBS length (7-14 bp) this cell:
#   1. reads the 70% hold-out training split and the 30% hold-out test split,
#   2. fits a 100-tree RandomForestClassifier on the training split,
#   3. pickles the fitted model to OUTPUT/model_RP_len<i>.sav,
#   4. scores the model on the test split,
# then writes one summary row per TFBS length to OUTPUT/model_RP_performance.csv.

#Column order of the performance summary table
metric_columns = ['TN','FP','FN','TP','Accuracy','Precision', 'Recall (Sensitivity)',
                  'F1-score','Specificity', 'NPV']

#TFBS lengths covered by the RP models (7-14 bp, inclusive)
tfbs_lengths = range(7, 15)

#One dict of metrics per TFBS length; the DataFrame is built once after the loop
#instead of being grown with pd.concat on every iteration.
report_rows = []

for i in tfbs_lengths:
    #read train_data
    X_train = pd.read_csv('Data/train_data/70_holdout_X_train_len' + str(i) +  '_RP.csv')
    Y_train = pd.read_csv('Data/train_data/70_holdout_Y_train_len' + str(i) + '_RP.csv')

    #read test_data
    X_test = pd.read_csv('Data/test_data/30_holdout_X_test_len' + str(i) + '_RP.csv')
    Y_test = pd.read_csv('Data/test_data/30_holdout_Y_test_len' + str(i) + '_RP.csv')

    #Use a 1-D label vector for scoring, mirroring Y_train['class'] used for fitting.
    #Falls back to squeezing a single-column frame if the column is named differently.
    y_true = Y_test['class'] if 'class' in Y_test.columns else Y_test.squeeze('columns')

    #Model training data with RF classifier (100 trees)
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=100, oob_score=True)
    rf_clf = rf_clf.fit(X_train, Y_train['class'])

    #save model .sav (context manager guarantees the file handle is flushed and closed)
    with open('OUTPUT/model_RP_len' + str(i) + '.sav', 'wb') as model_file:
        pickle.dump(rf_clf, model_file)

    #Predict test dataset
    Y_pred = rf_clf.predict(X_test)

    #collect model performance (confusion matrix, accuracy, precision, sensitivity, f1-score, specificity, npv)
    #labels=[0, 1] pins the matrix to 2x2 so ravel() always yields TN, FP, FN, TP
    tn, fp, fn, tp = confusion_matrix(y_true, Y_pred, labels=[0, 1]).ravel()

    report_rows.append({
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'TP': tp,
        'Accuracy': accuracy_score(y_true, Y_pred),
        #metrics of the positive class (1 = interacted)
        'Precision': precision_score(y_true, Y_pred, pos_label=1),
        'Recall (Sensitivity)': recall_score(y_true, Y_pred, pos_label=1),
        'F1-score': f1_score(y_true, Y_pred, pos_label=1),
        #specificity = recall of the negative class; NPV = precision of the negative class
        'Specificity': recall_score(y_true, Y_pred, pos_label=0),
        'NPV': precision_score(y_true, Y_pred, pos_label=0),
    })

#Index the summary by TFBS length so every row is identifiable
#(previously every row carried index 0).
report_summary_df = pd.DataFrame(report_rows, columns=metric_columns,
                                 index=pd.Index(tfbs_lengths, name='TFBS_length'))

report_summary_df.to_csv('OUTPUT/model_RP_performance.csv')
report_summary_df



Unnamed: 0,TN,FP,FN,TP,Accuracy,Precision,Recall (Sensitivity),F1-score,Specificity,NPV
0,186,9,4,191,0.966667,0.955,0.979487,0.967089,0.953846,0.978947


In [None]:
##############################################################################################################################
#
#   Copyright © 2022 Center for Agricultural Systems Biology
#   Authorships: Ruengsrichaiya B., Nukoolkit C., Kalapanulak S. and Saithong T., 
#   (202x) Plant-DTI: Extending the landscape of TF protein and DNA interaction in plants by a machine learning-based approach. 
#   xxxxx., xx, xxx. (in preparation).
#   Contact: bhukrit.r@mail.kmutt.ac.th
#
##############################################################################################################################
#   
#   Pseudocode for model hyperparameter tuning with a Random Forest classifier using scikit-learn:
#   The hyperparameter of each model is optimized automatically with GridSearchCV.
#   Class labels are 1 (interacted) and 0 (not interacted).
#   Random within (RW) models are available for TFBS lengths from 7 to 15 bp.
#   Random pairs (RP) models are available for TFBS lengths from 7 to 14 bp.
#
##############################################################################################################################
#
#   Random pairs models (RP)
#
##############################################################################################################################

INPUT: Features of train data, features of test data, class labels of train data, and class labels of test data
OUTPUT: Hyperparameters of Plant-DTI model and its performances.
    
BEGIN
for i = 7 to 14 do
    read features of train data
    read class label of train data
    read features of test data
    read class labels of test data

    SET model to RandomForestClassifier
    SET number of trees range from 1 to 150
    SET number of kfold cross validation to 10
    Automatically optimize the model using GridSearchCV
    Collect GridSearch results in .csv file
END

##############################################################################################################################
#
#   Random within models (RW)
#
##############################################################################################################################

INPUT: Features of train data, features of test data, class labels of train data, and class labels of test data
OUTPUT: Hyperparameters of Plant-DTI model and its performances.
    
BEGIN
FOR i = 7 to 15 do
    Read features of train data
    Read class label of train data
    Read features of test data
    Read class label of test data

    SET model to RandomForestClassifier
    SET number of trees range from 1 to 150
    SET number of kfold cross validation to 10
    Automatically optimize the model using GridSearchCV
    Collect GridSearch results in .csv file
END

In [7]:
##Code for tuning hyperparameter: number of trees from 1 to 150: Random Pairs (RP)
#
# For each TFBS length (7-14 bp) a 10-fold stratified grid search over
# n_estimators = 1..150 is run on the training split only, scored by F1.
# The full cv_results_ table is written to OUTPUT/model_RP_len<i>_tuned_result.csv.

for i in range(7,15):
    #read train_data
    #(the hold-out test split is deliberately not loaded: tuning uses only
    # cross-validation on the training split)
    X_train = pd.read_csv('Data/train_data/70_holdout_X_train_len' + str(i) + '_RP.csv')
    Y_train = pd.read_csv('Data/train_data/70_holdout_Y_train_len' + str(i) + '_RP.csv')

    #Random forest tune
    model = RandomForestClassifier(random_state=100)
    #range() excludes its stop value, so 151 is needed to include 150 trees
    param_grid = {'n_estimators': list(range(1, 151))}
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
    grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring="f1")
    grid_result = grid_search.fit(X_train, Y_train['class'])

    #Report the best setting found for this TFBS length
    best_para = str("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    print('len' + str(i) + ' ' + best_para)

    ##Create file for collect results
    report_tune_df = pd.DataFrame(grid_result.cv_results_)
    report_tune_df.to_csv('OUTPUT/'+ 'model_RP_len' + str(i) + '_tuned_result.csv', index=False)

#The output shown below is only an excerpt; the full result covers n_estimators from 1 to 150.

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008278,0.001612,0.005086,0.00094,1,{'n_estimators': 1},0.977778,0.978261,0.93617,0.913043,0.931818,0.946237,0.945055,0.93617,0.933333,0.941176,0.943904,0.019162,2
1,0.023038,0.003848,0.006084,0.001041,6,{'n_estimators': 6},1.0,0.989011,0.946237,0.934783,0.977778,0.968421,0.933333,0.956522,0.91954,0.977273,0.96029,0.025123,1


In [8]:
##Code for tuning hyperparameter: number of trees from 1 to 150: Random within (RW)
#
# For each TFBS length (7-15 bp) a 10-fold stratified grid search over
# n_estimators = 1..150 is run on the training split only, scored by F1.
# The full cv_results_ table is written to OUTPUT/model_RW_len<i>_tuned_result.csv.

for i in range(7,16):
    #read train_data
    #(the hold-out test split is deliberately not loaded: tuning uses only
    # cross-validation on the training split)
    X_train = pd.read_csv('Data/train_data/70_holdout_X_train_len' + str(i) + '_RW.csv')
    Y_train = pd.read_csv('Data/train_data/70_holdout_Y_train_len' + str(i) + '_RW.csv')

    #Random forest tune
    model = RandomForestClassifier(random_state=100)
    #range() excludes its stop value, so 151 is needed to include 150 trees
    param_grid = {'n_estimators': list(range(1, 151))}
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
    grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring="f1")
    grid_result = grid_search.fit(X_train, Y_train['class'])

    #Report the best setting found for this TFBS length
    best_para = str("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    print('len' + str(i) + ' ' + best_para)

    ##Create file for collect results
    report_tune_df = pd.DataFrame(grid_result.cv_results_)
    report_tune_df.to_csv('OUTPUT/'+ 'model_RW_len' + str(i) + '_tuned_result.csv', index=False)

#The output shown below is only an excerpt; the full result covers n_estimators from 1 to 150.

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.027127,0.002849,0.006985,0.001336,1,{'n_estimators': 1},0.982301,0.973646,0.974321,0.977289,0.979351,0.973607,0.979442,0.980249,0.978534,0.983776,0.978252,0.003358,2
1,0.116688,0.011918,0.009479,0.002463,6,{'n_estimators': 6},0.997776,0.991124,0.996299,0.995556,0.997028,0.995556,0.998516,0.995549,0.997037,0.995549,0.995999,0.001905,1
