In [1]:
# 1. Import Packages/Modules

import numpy as np
import pandas as pd
from beakerx.object import beakerx
from sklearn.model_selection import train_test_split
import cimcb as cb
beakerx.pandas_display_table() # make BeakerX interactive tables the default display for pandas tables
# Confirmation message; it is only reached if every statement above in this cell ran without raising.
print('All packages successfully loaded')


%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


All packages successfully loaded


In [2]:
# 2. Load Data and Peak Sheet

# `home` is prepended verbatim to `file` (plain string concatenation), so a
# non-empty `home` must end with a path separator.
home = ''
file = 'MTBLS90.xlsx'

# Read the 'Data' and 'Peak' sheets of the workbook into two tables.
DataTable, PeakTable = cb.utils.load_dataXL(
    home + file,
    DataSheet='Data',
    PeakSheet='Peak',
)

Loadings PeakFile: Peak
Loadings DataFile: Data
Data Table & Peak Table is suitable.
TOTAL SAMPLES: 968 TOTAL PEAKS: 189
Done!


In [3]:
# 3. Get X, and Y

# Keep only the samples whose Class label is 0 or 1; these are the two groups
# encoded in the binary Y vector below.
DataTable2 = DataTable[DataTable.Class.isin([0, 1])]

# Binary Y vector: 1 where Class == 1, otherwise 0 (vectorised comparison
# instead of a Python-level loop over the Series).
Outcomes = DataTable2['Class']
Y = np.where(Outcomes == 1, 1, 0)

# No train/test split is made in this cell: every selected sample is used, so
# the "train" names below simply refer to the full selection.
Ytrain = Y
DataTrain = DataTable2

# Extract the metabolite columns listed in the Peak sheet's 'Name' column, then
# log-transform, scale (method='auto') and impute with cb.utils.knnimpute (k=3).
peaklist = PeakTable['Name']
XT = DataTrain[peaklist]
# NOTE(review): np.log yields -inf/NaN for non-positive values -- confirm the
# peak data are strictly positive (or that missing values are the intent).
XTlog = np.log(XT)
XTscale = cb.utils.scale(XTlog, method='auto')
XTknn = cb.utils.knnimpute(XTscale, k=3)

In [6]:
# 4. Optimise model hyperparameters

# Optimiser: vanilla/standard gradient descent (SGD).
#
# Suggested ranges for each hyperparameter:
#   learning_rate : recommended range 0.0001 - 0.1
#   momentum      : 0 to 1 in steps of 0.1 (default values: 0, 0.9, 0.99)
#   decay         : recommended range 0 - 0.0001
#   n_nodes       : uncertain; maybe 1 - 7
#   epochs        : can vary a lot; a recommended range is 50 - 250
#   nesterov      : True / False (default False) -> classical momentum vs. delayed momentum (nesterov)

### README: likely reason why linear-linear is so unstable
# The standard loss is "binary_crossentropy" for binary problems and
# "categorical_crossentropy" for multi-class problems. A linear-linear network
# (one that ends with a linear layer) should be treated differently: try
# "mean_squared_error" instead.
# Open issue (still to solve): with "mean_squared_error", every ypred becomes
# NaN when learning_rate / decay is too high.

# Epoch values handed to the cross-validation: 1, 2, ..., 999.
e = list(range(1, 1000))

# Only `epochs` carries more than one value; everything else is held fixed.
# `loss` may be "binary_crossentropy" or "mean_squared_error".
param_dict = {
    'learning_rate': 0.01,
    'decay': 0.0001,
    'momentum': 0.3,
    'n_nodes': 4,
    'epochs': e,
    'nesterov': False,
    'verbose': 0,
    'loss': 'binary_crossentropy',
}

# Available schemes: kfold (proper kfold), kfold_average (previous kfold),
# holdout (default split=0.8; split=0.80 means 80% train / 20% test).
cv = cb.cross_val.holdout(
    model=cb.model.NN_LinearLogit,
    X=XTknn,
    Y=Ytrain,
    param_dict=param_dict,
    split=0.8,
)

# Run the cross-validation, then plot each metric in turn.
cv.run()
cv.plot(metric='acc', scale=1)  # accuracy
cv.plot(metric='auc', scale=1)  # AUC
cv.plot(metric='sse', scale=1)  # actually the mean squared error (should be named 'mse')

returning stats at 'x' epoch interval during training until epoch=999.


In [5]:
# 5. Train and evaluate model

# NOTE(review): section 4 tunes cb.model.NN_LinearLogit (binary_crossentropy,
# momentum=0.3, n_nodes=4), whereas this cell trains cb.model.NN_LinearLinear
# with different settings -- confirm that the mismatch is intentional.
model = cb.model.NN_LinearLinear(
    learning_rate=0.01,
    decay=0.0001,
    momentum=0,
    n_nodes=2,
    epochs=200,
    nesterov=False,
    verbose=0,
    loss='mean_squared_error',
)

# Fit on the full preprocessed matrix, then evaluate with a 0.5 cutoff score.
model.train(XTknn, Ytrain)
model.evaluate(cutoffscore=0.5)