In [1]:
# 1. Import Packages/Modules

import numpy as np
import pandas as pd
from beakerx.object import beakerx
from sklearn.model_selection import train_test_split
import cimcb as cb
# By default, display pandas tables as BeakerX interactive tables.
beakerx.pandas_display_table()
# Printed only after every import above (and the display setup) has succeeded.
print('All packages successfully loaded')


Using TensorFlow backend.


All packages successfully loaded


In [2]:
# 2. Load the Data sheet and the Peak sheet

home = ''              # prefix prepended to the file name; empty -> path is relative
file = 'MTBLS90.xlsx'  # Excel workbook containing the 'Data' and 'Peak' sheets

# load_dataXL returns two objects, unpacked here as the data table and the peak table.
DataTable, PeakTable = cb.utils.load_dataXL(
    home + file,
    DataSheet='Data',
    PeakSheet='Peak',
)

Loadings PeakFile: Peak
Loadings DataFile: Data
Data Table & Peak Table is suitable.
TOTAL SAMPLES: 968 TOTAL PEAKS: 189
Done!


In [3]:
# 3. Get X and Y

# Select the subset of samples for the PLS-DA model: rows whose Class is 1 or 0
class_is_one = DataTable.Class == 1
class_is_zero = DataTable.Class == 0
DataTable2 = DataTable[class_is_one | class_is_zero]

# Binary Y vector (1 where Class == 1, otherwise 0), used for stratifying the samples
Outcomes = DataTable2['Class']
Y = (Outcomes == 1).astype(int).values

# Split DataTable2 and Y in half into train and test sets, stratified on Y,
# with a fixed random_state so the split is repeatable
DataTrain, DataTest, Ytrain, Ytest = train_test_split(
    DataTable2,
    Y,
    test_size=0.5,
    stratify=Y,
    random_state=40,
)

# Training X pipeline: peak columns -> natural log -> scale -> kNN imputation
peaklist = PeakTable['Name']                      # column names of the peaks
XT = DataTrain[peaklist]                          # peak columns of the training rows
XTlog = np.log(XT)                                # log transform
XTscale = cb.utils.scale(XTlog, method='auto')    # scaling, method 'auto'
XTknn = cb.utils.knnimpute(XTscale, k=3)          # kNN imputation with k=3


In [4]:
# 4. Optimise model hyperparameters

# Hyperparameter grid: number of components from 1 to 12 inclusive.
# Alternative narrower grid kept for reference:
#   {'n_components': [1,2,3,4,5,6]}   (author's note: 2 components is the optimal)
param_dict = {'n_components': list(range(1, 13))}

# Initialise the 10-fold cross-validation over the grid on the training data
cv = cb.cross_val.kfold(
    model=cb.model.PLS_SIMPLS,
    X=XTknn,
    Y=Ytrain,
    param_dict=param_dict,
    folds=10,
)

# Run the cross-validation, then plot the 'r2q2' and 'auc' metrics
cv.run()
cv.plot(metric='r2q2')
cv.plot(metric='auc')
# cv.plot(metric='sse')   # additional metric plot, currently disabled

Kfold: 100%|██████████| 12/12 [00:00<00:00, 21.35it/s]


In [5]:
# 5. Train and evaluate model

# Build a PLS_SIMPLS model with 2 components and train it on the
# preprocessed training matrix (XTknn) and the training labels (Ytrain).
model = cb.model.PLS_SIMPLS(n_components=2)
model.train(XTknn, Ytrain)

# Evaluate the trained model with cutoffscore=0.5. The commented-out call is
# the alternative of passing specificity=0.8 instead.
#model.evaluate(specificity=0.8)
model.evaluate(cutoffscore=0.5)


In [6]:
# 6. Test model

# Test X: peak columns of the held-out rows, log-transformed
peaklist = PeakTable['Name']
XV = DataTest[peaklist].values
XVlog = np.log(XV)

# Scale the test data with mu and sigma obtained from the training log data
# (XTlog), then kNN-impute with k=3.
# NOTE(review): this unpacks exactly two values from
# scale(..., return_mu_sigma=True) -- confirm that matches the return
# signature of the installed cimcb version.
mu, sigma = cb.utils.scale(XTlog, return_mu_sigma=True)
XVscale = cb.utils.scale(XVlog, method='auto', mu=mu, sigma=sigma)
XVknn = cb.utils.knnimpute(XVscale, k=3)

# Predictions of the trained model for the test set
YVpred = model.test(XVknn)

# Evaluate the test-set predictions (YVpred) against the true labels (Ytest)
evals = [Ytest, YVpred]
model.evaluate(evals)

In [7]:
# 7. Save tables to excel

# ExcelWriter is used as a context manager: leaving the `with` block writes
# the workbook to disk and closes the file, even if to_excel raises.
# (The explicit ExcelWriter.save() call is not available in pandas >= 2.0.)

# Cross-validation results (cv.table)
table = pd.DataFrame(cv.table)
with pd.ExcelWriter("MTBLS90_pls_cv.xlsx") as writer:
    table.to_excel(writer, index=False)

# Model evaluation results (model.table)
table = pd.DataFrame(model.table)
with pd.ExcelWriter("MTBLS90_pls_eval.xlsx") as writer:
    table.to_excel(writer, index=False)
