## Random Forest (RF)

----

### 1. Import Modules

In [9]:
import numpy as np
import pandas as pd
import cimcb as cb
from sklearn.model_selection import train_test_split

print('All packages successfully loaded')

All packages successfully loaded


### 2. Load data and peak sheet

In [10]:
# Folder and file name of the input workbook (path is relative).
home = 'data/'
file = 'MTBLS404.xlsx'

# Read the 'Data' and 'Peak' sheets of the workbook into DataTable and PeakTable.
DataTable, PeakTable = cb.utils.load_dataXL(
    home + file,
    DataSheet='Data',
    PeakSheet='Peak',
)

Loadings PeakFile: Peak
Loadings DataFile: Data
Data Table & Peak Table is suitable.
TOTAL SAMPLES: 184 TOTAL PEAKS: 120
Done!


### 3. Extract X & Y

In [11]:
# Peak names from the 'Name' column of the peak sheet; used later to select
# the peak columns of the data table.
peaklist = PeakTable['Name']

# Keep only the rows whose 'Class' is 0 or 1, then take their class labels
# as the response Y.
class_column = DataTable['Class']
keep_rows = (class_column == 0) | (class_column == 1)
DataTable2 = DataTable[keep_rows]
Y = DataTable2['Class'].values

### 4. Hyperparameter optimisation

In [12]:
# Seed for the train/test split, so the same partition is produced on every run
seed = 50

# Split DataTable2 and Y into train (70%) and test (30%) sets, stratified on Y
DataTrain, DataTest, Ytrain, Ytest = train_test_split(DataTable2, Y, test_size=0.30, stratify=Y, random_state=seed)

# Training features: select the peak columns, log-transform, scale
# (method='auto') and k-NN impute (k=3). Only training samples are used here.
peaklist = PeakTable['Name']
XT = DataTrain[peaklist]
XTlog = np.log(XT)
XTscale = cb.utils.scale(XTlog, method='auto')
XTknn = cb.utils.knnimpute(XTscale, k=3)

# Search grid: n_estimators 1..10 by max_depth 1..10.
# n_jobs is given as a single value rather than a list, so it is not searched over.
trees = list(range(1, 11))
depth = list(range(1, 11))
param_dict = dict(n_estimators=trees, max_depth=depth, n_jobs=-1)

# Initialise the k-fold cross-validation of cb.model.RF over the grid
# (folds=5; n_mc=5 is presumably the number of repeated runs - TODO confirm)
cv = cb.cross_val.kfold(model=cb.model.RF,
                        X=XTknn,
                        Y=Ytrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=5)

# Run and plot.
# Ask for 'auc' explicitly: with this model cimcb replaces a requested 'r2q2'
# by 'auc' ("the model is non-parametric"), so 'r2q2' would only be misleading.
cv.run()
cv.plot(metric='auc', ci=95, scale=1.5)

Number of cores set to: 8
Running ...


100%|██████████| 500/500 [02:23<00:00,  4.26it/s]


Time taken: 2.47 minutes with 8 cores
Done!
metric changed from 'r2q2' to 'auc' as the model is non-parametric.


### 6. Get Model and Evaluate

In [13]:
# Fit the final model on the training set, apply it to the held-out test set
# and report the AUC for both.

# Fit the model with the chosen hyperparameters on the preprocessed training data
model = cb.model.RF(n_jobs=-1, max_depth=3, n_estimators=8)
model.train(XTknn, Ytrain)

# Preprocess the test set the same way as the training set. mu and sigma come
# from the training data (XTlog) and are passed to the test-set scaling.
mu, sigma = cb.utils.scale(XTlog, return_mu_sigma=True)
peaklist = PeakTable['Name']
XV = DataTest[peaklist].values
XVlog = np.log(XV)
XVscale = cb.utils.scale(XVlog, method='auto', mu=mu, sigma=sigma)
XVknn = cb.utils.knnimpute(XVscale, k=3)

# Predict the test set
YVpred = model.test(XVknn)

# Evaluate the test predictions against the true test labels
evals = [Ytest, YVpred]
model.evaluate(evals, plot_median=False, cutoffscore=0.5)

# Report the AUC entries of the evaluation table (row 0: train, row 1: test)
auc_table = model.table['auc']
print("AUC Train: {}".format(auc_table[0][0]))
print("AUC Test: {}".format(auc_table[1][0]))

AUC Train: 0.97 (0.95, 0.99)
AUC Test: 0.7


In [14]:
# Build the feature matrix from every sample in DataTable2: select the peak
# columns, log-transform, scale (method='auto') and k-NN impute (k=3).
# NOTE(review): scaling here is computed over all samples, including those
# held out as the test set earlier - confirm this is intended.
X = DataTable2[peaklist]
Xlog = np.log(X)
Xscale = cb.utils.scale(Xlog, method='auto')
Xknn = cb.utils.knnimpute(Xscale, k=3)

# Bootstrap evaluation of the model on the full matrix, with bootnum=100
model.booteval(Xknn, Y, bootnum=100, cutoffscore=0.5)

100%|██████████| 100/100 [00:43<00:00,  2.32it/s]


### 7. Save table

In [15]:
# Folder and file name of the output workbook (path is relative).
home = 'tables/'
file = 'RF_MTBLS404.xlsx'

# Write the model's table to that workbook.
# NOTE(review): presumably the 'tables/' folder must already exist - confirm.
model.save_table(
    home + file
)

Done! Saved table as tables/RF_MTBLS404.xlsx
