<a href="https://colab.research.google.com/github/alicewoo0925/miRNA-COVID19detection/blob/main/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up

In [None]:
# import modules
import os

import numpy as np
import pandas as pd
from google.colab import drive
from google.colab import files

from sklearn import ensemble # random forest
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold # stratified k-fold
from sklearn.model_selection import train_test_split # to split the dataset into train and test

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support # per-class sensitivity / specificity

# Train and Evaluation

In [None]:
def getResult(y_test, prediction,
              labels=('Mild','Moderate','Negative Control','Severe','Severe Negative')):
  """Print the classification report and a labelled confusion matrix.

  Parameters
  ----------
  y_test : array-like
      True class labels.
  prediction : array-like
      Predicted class labels, aligned element-wise with ``y_test``.
  labels : sequence of str, optional
      Class names, in the row/column order of the printed confusion matrix.
      Defaults to the five classes used throughout this notebook.

  Returns
  -------
  None
      Both results are printed, nothing is returned.
  """
  print(classification_report(y_test, prediction)) # classification report

  # confusion matrix
  # Passing `labels` pins the matrix to len(labels) x len(labels) even when a
  # class is missing from this split, so the axis names always line up.
  class_names = list(labels)
  # The frame is labelled at construction because
  # DataFrame.set_axis(..., inplace=True) is rejected by pandas >= 2.0.
  CM = pd.DataFrame(
      confusion_matrix(y_test, prediction, labels=class_names),
      index=class_names,    # rows: true class
      columns=class_names)  # columns: predicted class
  print(CM)

## Full Dataset

In [None]:
# split the dataset
# NOTE(review): `df_clean` is not created in any visible cell; it must already
# exist in the kernel for this cell to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_clean.loc[:, df_clean.columns != 'Target'], # variables X include every column except Target
    df_clean['Target'], # Y is the Target column
    test_size = 0.2, # 80% for train and 20% for test
    random_state = 30) # fixed seed (same value as the other splits in this notebook) so re-runs give the same split

In [None]:
# random forest with full dataset
# Default hyper-parameters; the seed is fixed (same value as the other models in
# this notebook) so the reported metrics are reproducible on re-run.
learner = ensemble.RandomForestClassifier(random_state=0)
model = learner.fit(X_train,Y_train)
prediction = model.predict(X_test)
getResult(Y_test,prediction)

## Selected Features by Random Forest + SMOTE

In [None]:
# 1000 RF features with 4 max depth
# Hold out 20% of X_smote_1k / Y_smote_1k for testing (seeded split).
# NOTE(review): the names suggest this data was SMOTE-resampled before the
# split; if so, synthetic neighbours of test rows can land in the training set
# and inflate the scores -- confirm where the resampling happens.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_smote_1k,
    Y_smote_1k,
    test_size=0.2,
    random_state=30,
)

# Fit the forest on the training part, then score it on the held-out part.
learner = ensemble.RandomForestClassifier(
    n_estimators=1000,
    max_depth=4,
    random_state=0,
)
model = learner.fit(X_train, Y_train)
prediction = model.predict(X_test)
getResult(Y_test, prediction)

In [None]:
# 500 RF features with 4 max depth
# Hold out 20% of X_smote_500 / Y_smote_500 for testing (seeded split).
# NOTE(review): the names suggest this data was SMOTE-resampled before the
# split; if so, synthetic neighbours of test rows can land in the training set
# and inflate the scores -- confirm where the resampling happens.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_smote_500,
    Y_smote_500,
    test_size=0.2,
    random_state=30,
)

# Fit the forest on the training part, then score it on the held-out part.
learner = ensemble.RandomForestClassifier(
    n_estimators=500,
    max_depth=4,
    random_state=0,
)
model = learner.fit(X_train, Y_train)
prediction = model.predict(X_test)
getResult(Y_test, prediction)

In [None]:
# 1000 RF features with 10 max depth
# Hold out 20% of X_smote_10d / Y_smote_10d for testing (seeded split).
# NOTE(review): the names suggest this data was SMOTE-resampled before the
# split; if so, synthetic neighbours of test rows can land in the training set
# and inflate the scores -- confirm where the resampling happens.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_smote_10d,
    Y_smote_10d,
    test_size=0.2,
    random_state=30,
)

# Fit the forest on the training part, then score it on the held-out part.
learner = ensemble.RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    random_state=0,
)
model = learner.fit(X_train, Y_train)
prediction = model.predict(X_test)
getResult(Y_test, prediction)

# Stratified K-Fold

## 1000 RF features with 4 max depth

In [None]:
# Create StratifiedKFold: 5 shuffled folds with a fixed seed
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

# Random forest (1000 trees, depth limit 4) that is refitted on every fold
clf = ensemble.RandomForestClassifier(
    n_estimators=1000,
    max_depth=4,
    random_state=0,
)

In [None]:
# Per-fold results; position i in every list belongs to fold i.
accu_stratified = []  # accuracy on the held-out fold
cr = []               # classification report (text)
cm = []               # confusion matrix
res_list = []         # rows of [class, sensitivity, specificity]

# NOTE(review): the names suggest X_smote_1k / Y_smote_1k were SMOTE-resampled
# before cross-validation; if so, held-out folds contain synthetic neighbours of
# training rows and the scores are optimistic -- confirm.
for train_index, test_index in skf.split(X_smote_1k, Y_smote_1k):
    # split the dataset
    x_train_fold, x_test_fold = X_smote_1k.iloc[train_index], X_smote_1k.iloc[test_index]
    y_train_fold, y_test_fold = Y_smote_1k.iloc[train_index], Y_smote_1k.iloc[test_index]

    # train the model with each fold
    clf.fit(x_train_fold, y_train_fold)

    # predict once on the held-out fold and reuse the result for every metric
    prediction = clf.predict(x_test_fold)
    y_true = np.asarray(y_test_fold)

    # get the accuracy (fraction of correctly classified samples)
    accu_stratified.append(float(np.mean(prediction == y_true)))

    # get the classification report
    cr.append(classification_report(y_test_fold, prediction))

    # create confusion matrix
    cm.append(confusion_matrix(y_test_fold, prediction))

    # One-vs-rest sensitivity / specificity for every class.
    res = []
    for label in ['Mild','Moderate','Negative Control','Severe','Severe Negative']:
        # `labels=[False, True]` fixes the order of the returned arrays:
        # index 0 -> "not this class", index 1 -> "this class".
        # (`pos_label` is ignored when average is not 'binary', so it is not passed.)
        _, recall, _, _ = precision_recall_fscore_support(
            y_true == label,
            prediction == label,
            labels=[False, True],
            average=None)
        # Recall of the True class is the sensitivity, recall of the False class
        # is the specificity; stored in the order the display cells expect
        # (['target', 'sensitivity', 'specificity']).
        res.append([label, recall[1], recall[0]])
    res_list.append(res)

print('\nMax Accuracy:', max(accu_stratified)*100, '%')
print('\nMin Accuracy:', min(accu_stratified)*100, '%')
print('\nAverage Accuracy:', np.mean(accu_stratified)*100, '%')

In [None]:
# per-fold accuracies, in the order the folds were evaluated
accu_stratified

In [None]:
# classification report (highest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index,
# so the heading stays true if the data, seeds or library versions change.
best_fold = int(np.argmax(accu_stratified))
print(cr[best_fold])

In [None]:
# confusion matrix (highest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
best_fold = int(np.argmax(accu_stratified))
class_names = ['Mild','Moderate','Negative Control','Severe','Severe Negative']
# Axes are labelled at construction because
# DataFrame.set_axis(..., inplace=True) is rejected by pandas >= 2.0.
CM = pd.DataFrame(cm[best_fold], index=class_names, columns=class_names)
print(CM)

In [None]:
# sensitivity and specificity (highest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
best_fold = int(np.argmax(accu_stratified))
pd.DataFrame(res_list[best_fold],columns = ['target','sensitivity','specificity'])

In [None]:
# classification report (lowest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index,
# so the heading stays true if the data, seeds or library versions change.
worst_fold = int(np.argmin(accu_stratified))
print(cr[worst_fold])

In [None]:
# confusion matrix (lowest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
worst_fold = int(np.argmin(accu_stratified))
class_names = ['Mild','Moderate','Negative Control','Severe','Severe Negative']
# Axes are labelled at construction because
# DataFrame.set_axis(..., inplace=True) is rejected by pandas >= 2.0.
CM = pd.DataFrame(cm[worst_fold], index=class_names, columns=class_names)
print(CM)

In [None]:
# sensitivity and specificity (lowest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
worst_fold = int(np.argmin(accu_stratified))
pd.DataFrame(res_list[worst_fold],columns = ['target','sensitivity','specificity'])

## 1000 RF features with 10 max depth

In [None]:
# Create StratifiedKFold: 5 shuffled folds with a fixed seed
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

# Random forest (1000 trees, depth limit 10) that is refitted on every fold
clf = ensemble.RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    random_state=0,
)

In [None]:
# Per-fold results; position i in every list belongs to fold i.
accu_stratified = []  # accuracy on the held-out fold
cr = []               # classification report (text)
cm = []               # confusion matrix
res_list = []         # rows of [class, sensitivity, specificity]

# NOTE(review): the names suggest X_smote_10d / Y_smote_10d were SMOTE-resampled
# before cross-validation; if so, held-out folds contain synthetic neighbours of
# training rows and the scores are optimistic -- confirm.
for train_index, test_index in skf.split(X_smote_10d, Y_smote_10d):
    # split the dataset
    x_train_fold, x_test_fold = X_smote_10d.iloc[train_index], X_smote_10d.iloc[test_index]
    y_train_fold, y_test_fold = Y_smote_10d.iloc[train_index], Y_smote_10d.iloc[test_index]

    # train the model with each fold
    clf.fit(x_train_fold, y_train_fold)

    # predict once on the held-out fold and reuse the result for every metric
    prediction = clf.predict(x_test_fold)
    y_true = np.asarray(y_test_fold)

    # get the accuracy (fraction of correctly classified samples)
    accu_stratified.append(float(np.mean(prediction == y_true)))

    # get the classification report
    cr.append(classification_report(y_test_fold, prediction))

    # create confusion matrix
    cm.append(confusion_matrix(y_test_fold, prediction))

    # One-vs-rest sensitivity / specificity for every class.
    res = []
    for label in ['Mild','Moderate','Negative Control','Severe','Severe Negative']:
        # `labels=[False, True]` fixes the order of the returned arrays:
        # index 0 -> "not this class", index 1 -> "this class".
        # (`pos_label` is ignored when average is not 'binary', so it is not passed.)
        _, recall, _, _ = precision_recall_fscore_support(
            y_true == label,
            prediction == label,
            labels=[False, True],
            average=None)
        # Recall of the True class is the sensitivity, recall of the False class
        # is the specificity; stored in the order the display cells expect
        # (['target', 'sensitivity', 'specificity']).
        res.append([label, recall[1], recall[0]])
    res_list.append(res)

print('\nMax Accuracy:', max(accu_stratified)*100, '%')
print('\nMin Accuracy:', min(accu_stratified)*100, '%')
print('\nAverage Accuracy:', np.mean(accu_stratified)*100, '%')

In [None]:
# per-fold accuracies, in the order the folds were evaluated
accu_stratified

In [None]:
# classification report (highest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index,
# so the heading stays true if the data, seeds or library versions change.
best_fold = int(np.argmax(accu_stratified))
print(cr[best_fold])

In [None]:
# confusion matrix (highest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
best_fold = int(np.argmax(accu_stratified))
class_names = ['Mild','Moderate','Negative Control','Severe','Severe Negative']
# Axes are labelled at construction because
# DataFrame.set_axis(..., inplace=True) is rejected by pandas >= 2.0.
CM = pd.DataFrame(cm[best_fold], index=class_names, columns=class_names)
print(CM)

In [None]:
# sensitivity and specificity (highest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
best_fold = int(np.argmax(accu_stratified))
pd.DataFrame(res_list[best_fold],columns = ['target','sensitivity','specificity'])

In [None]:
# classification report (lowest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index,
# so the heading stays true if the data, seeds or library versions change.
worst_fold = int(np.argmin(accu_stratified))
print(cr[worst_fold])

In [None]:
# confusion matrix (lowest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
worst_fold = int(np.argmin(accu_stratified))
class_names = ['Mild','Moderate','Negative Control','Severe','Severe Negative']
# Axes are labelled at construction because
# DataFrame.set_axis(..., inplace=True) is rejected by pandas >= 2.0.
CM = pd.DataFrame(cm[worst_fold], index=class_names, columns=class_names)
print(CM)

In [None]:
# sensitivity and specificity (lowest accuracy)
# Look the fold up from the recorded accuracies instead of hard-coding its index.
worst_fold = int(np.argmin(accu_stratified))
pd.DataFrame(res_list[worst_fold],columns = ['target','sensitivity','specificity'])