In [1]:
# 1. Import Packages/Modules

import numpy as np
import pandas as pd
from beakerx.object import beakerx
from sklearn.model_selection import train_test_split
import cimcb as cb
beakerx.pandas_display_table() # by default display pandas tables as BeakerX interactive tables
print('All packages successfully loaded')

Using TensorFlow backend.


All packages successfully loaded


In [2]:
# 2. Load Data and Peak Sheet

home = ''  
file = 'MTBLS90.xlsx' 
DataTable,PeakTable = cb.utils.load_dataXL(home + file, DataSheet='Data', PeakSheet='Peak') 

Loadings PeakFile: Peak
Loadings DataFile: Data
Data Table & Peak Table is suitable.
TOTAL SAMPLES: 968 TOTAL PEAKS: 189
Done!


In [3]:
# 3. Get X, and Y

# Select subset of Data for the PLS-DA model
DataTable2 = DataTable[(DataTable.Class == 1) | (DataTable.Class == 0)]

# Create a Binary Y vector for stratifiying the samples
Outcomes = DataTable2['Class']                                  
Y = [1 if outcome == 1 else 0 for outcome in Outcomes]         
Y = np.array(Y)                                                

# Split DataTable2 and Y into train and test (with stratification)
DataTrain, DataTest, Ytrain, Ytest = train_test_split(DataTable2, Y, test_size=0.25, stratify=Y, random_state=40)

# Extract and scale the metabolite data from the DataTable
peaklist = PeakTable['Name']                           
XT = DataTrain[peaklist]                                    
XTlog = np.log(XT)                                          
XTscale = cb.utils.scale(XTlog, method='auto')              
XTknn = cb.utils.knnimpute(XTscale, k=3)    


In [4]:
# 4. Optimise model hyperparameters (LEARNING-RATE)

# Learning Rate: Recommended range 0.0001 - 0.1 
# Momentum: 0 to 1 [steps of 0.1] (default values: 0, 0.9, 0.99)
# Decay: Recommended range 0 - 0.0001                 
# Nodes: ? Maybe 1 - 7
# Epochs: Can vary a lot... a recommended range 50 - 250  
# nesterov:  True / False (default False) ---> Classical momentum vs. delayed momentum (nesterov) ... Keep as false
# loss: standard loss is "binary_crossentropy" for binary, and "categorical_crossentropy" for multi-class... try "mean_squared_error" (issue to solve... if learning_rate / decay is too high, all ypred becomes nans)
 
# param_dict
lr = np.logspace(-5, 1, 7)
param_dict = dict(learning_rate=lr, momentum=0.0, decay=0.0, n_nodes=2, epochs=200, nesterov=False, verbose=0, loss='binary_crossentropy') # "binary_crossentropy" "mean_squared_error"

# Initalise
# kfold (proper kfold), kfold_average (previous kfold), holdout (default split=0.8) [split=0.80 means 80Train/20Ttest] 
cv = cb.cross_val.kfold(model=cb.model.NN_LinearLogit,                      
                                X=XTknn,                                 
                                Y=Ytrain,                              
                                param_dict=param_dict,                   
                                folds=10)                                

# Run and plot
cv.run()  
cv.plot()

Kfold: 100%|██████████| 7/7 [05:13<00:00, 50.39s/it]


metric changed from 'r2q2' to 'auc' as the model is non-parametric.


In [5]:
# 4. Optimise model hyperparameters (MOMENTUM)

# Learning Rate: Recommended range 0.0001 - 0.1 
# Momentum: 0 to 1 [steps of 0.1] (default values: 0, 0.9, 0.99)
# Decay: Recommended range 0 - 0.0001                 
# Nodes: ? Maybe 1 - 7
# Epochs: Can vary a lot... a recommended range 50 - 250  
# nesterov:  True / False (default False) ---> Classical momentum vs. delayed momentum (nesterov) ... Keep as false
# loss: standard loss is "binary_crossentropy" for binary, and "categorical_crossentropy" for multi-class... try "mean_squared_error" (issue to solve... if learning_rate / decay is too high, all ypred becomes nans) 


# param_dict
#m = np.logspace(-5, 0, 7)
#m = [0.9, 0.92, 0.94, 0.96, 0.98, 1] 
m = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_dict = dict(momentum=m, learning_rate=0.01, decay=0.0, n_nodes=2, epochs=200, nesterov=False, verbose=0, loss='binary_crossentropy') # "binary_crossentropy" "mean_squared_error"

# Initalise
# kfold (proper kfold), kfold_average (previous kfold), holdout (default split=0.8) [split=0.80 means 80Train/20Ttest] 
cv = cb.cross_val.kfold(model=cb.model.NN_LinearLogit,                      
                                X=XTknn,                                 
                                Y=Ytrain,                             
                                param_dict=param_dict,                   
                                folds=10)                                

# Run and plot
cv.run()  
cv.plot()

Kfold: 100%|██████████| 11/11 [18:14<00:00, 121.07s/it]


metric changed from 'r2q2' to 'auc' as the model is non-parametric.


In [5]:
# 4. Optimise model hyperparameters (DECAY)

# Learning Rate: Recommended range 0.0001 - 0.1 
# Momentum: 0 to 1 [steps of 0.1] (default values: 0, 0.9, 0.99)
# Decay: Recommended range 0 - 0.0001                 
# Nodes: ? Maybe 1 - 7
# Epochs: Can vary a lot... a recommended range 50 - 250  
# nesterov:  True / False (default False) ---> Classical momentum vs. delayed momentum (nesterov) ... Keep as false
# loss: standard loss is "binary_crossentropy" for binary, and "categorical_crossentropy" for multi-class... try "mean_squared_error" (issue to solve... if learning_rate / decay is too high, all ypred becomes nans)
 

decay = np.logspace(-10, 1, 12)
param_dict = dict(decay=decay, learning_rate=0.1, 0, momentum=0.0, n_nodes=2, epochs=200, nesterov=False, verbose=0, loss='binary_crossentropy') # "binary_crossentropy" "mean_squared_error"

# Initalise
# kfold (proper kfold), kfold_average (previous kfold), holdout (default split=0.8) [split=0.80 means 80Train/20Ttest] 
cv = cb.cross_val.kfold(model=cb.model.NN_LinearLogit,                      
                                X=XTknn,                                 
                                Y=Ytrain,                              
                                param_dict=param_dict,                   
                                folds=10)                                

# Run and plot
cv.run()  
cv.plot()

Kfold: 100%|██████████| 11/11 [18:14<00:00, 121.07s/it]


metric changed from 'r2q2' to 'auc' as the model is non-parametric.


In [6]:
# 4. Optimise model hyperparameters (NODES)

# Learning Rate: Recommended range 0.0001 - 0.1 
# Momentum: 0 to 1 [steps of 0.1] (default values: 0, 0.9, 0.99)
# Decay: Recommended range 0 - 0.0001                 
# Nodes: ? Maybe 1 - 7
# Epochs: Can vary a lot... a recommended range 50 - 250  
# nesterov:  True / False (default False) ---> Classical momentum vs. delayed momentum (nesterov) ... Keep as false
# loss: standard loss is "binary_crossentropy" for binary, and "categorical_crossentropy" for multi-class... try "mean_squared_error" (issue to solve... if learning_rate / decay is too high, all ypred becomes nans)


# param_dict
nodes = [1,2,3,4]
param_dict = dict(n_nodes=nodes, learning_rate=0.1, momentum=0, decay=0, epochs=200, nesterov=False, verbose=0, loss='binary_crossentropy') # "binary_crossentropy" "mean_squared_error"

# Initalise
# kfold (proper kfold), kfold_average (previous kfold), holdout (default split=0.8) [split=0.80 means 80Train/20Ttest] 
cv = cb.cross_val.kfold(model=cb.model.NN_LinearLogit,                      
                                X=XTknn,                                 
                                Y=Ytrain,                               
                                param_dict=param_dict,                   
                                folds=10)                                

# Run and plot
cv.run()  
cv.plot()

Kfold: 100%|██████████| 4/4 [10:03<00:00, 151.17s/it]


metric changed from 'r2q2' to 'auc' as the model is non-parametric.


In [11]:
# 4. Optimise model hyperparameters (EPOCHS)

# Learning Rate: Recommended range 0.0001 - 0.1 
# Momentum: 0 to 1 [steps of 0.1] (default values: 0, 0.9, 0.99)
# Decay: Recommended range 0 - 0.0001                 
# Nodes: ? Maybe 1 - 7
# Epochs: Can vary a lot... a recommended range 50 - 250  
# nesterov:  True / False (default False) ---> Classical momentum vs. delayed momentum (nesterov) ... Keep as false
# loss: standard loss is "binary_crossentropy" for binary, and "categorical_crossentropy" for multi-class... try "mean_squared_error" (issue to solve... if learning_rate / decay is too high, all ypred becomes nans)



# param_dict
#e = [10,20,30,40,50,60,80,90,100,110,120,130,140,150,160,170,190,200,210,220,230,240,250,350,400,450,500,550,600,650,700,750] 
e = list(range(1,1000))
param_dict = dict(epochs=e, learning_rate=0.1, momentum=0.3, decay=0.0, n_nodes=1, nesterov=False, verbose=0, loss='binary_crossentropy') # "binary_crossentropy" "mean_squared_error"

# Initalise
# kfold (proper kfold), kfold_average (previous kfold), holdout (default split=0.8) [split=0.80 means 80Train/20Ttest] 
cv = cb.cross_val.kfold(model=cb.model.NN_LinearLogit,                      
                                X=XTknn,                                 
                                Y=Ytrain,                               
                                param_dict=param_dict,                   
                                folds=10)                                

# Run and plot
cv.run()  
cv.plot(scale=3)

Kfold: 100%|██████████| 10/10 [10:27<00:00, 65.78s/it]


returning stats at 'x' epoch interval during training until epoch=999.
metric changed from 'r2q2' to 'auc' as the model is non-parametric.


In [9]:
# 5. Train and evaluate model

model = cb.model.NN_LinearLogit(learning_rate=0.1, momentum=0.3, decay=0.0, n_nodes=1, epochs=500) 
model.train(XTknn,Ytrain)
model.evaluate(cutoffscore=0.5)  

In [10]:
# 6. Test model

# Get X, Y
mu, sigma  = cb.utils.scale(XTlog, return_mu_sigma=True) 
peaklist = PeakTable.Name 
XV = DataTest[peaklist].values
XVlog = np.log(XV)
XVscale  = cb.utils.scale(XVlog, method='auto', mu=mu, sigma=sigma)
XVknn = cb.utils.knnimpute(XVscale, k=3)
YVpred = model.test(XVknn)

# Evaluate Ypred against Ytest
evals = [Ytest, YVpred] 
model.evaluate(evals, cutoffscore=0.5) 

In [11]:
# 7. Save tables to excel

# CV full / cv 
table = pd.DataFrame(cv.table)
writer = pd.ExcelWriter("MTBLS90_nnlinlog_cv.xlsx")
table.to_excel(writer, index=False)
writer.save()

# Evaluate
table = pd.DataFrame(model.table)
writer = pd.ExcelWriter("MTBLS90_nnlinlog_eval.xlsx")
table.to_excel(writer, index=False)
writer.save()