In [1]:
# 1. Import Packages/Modules

import numpy as np
import pandas as pd
from beakerx.object import beakerx
from sklearn.model_selection import train_test_split
import cimcb as cb
beakerx.pandas_display_table() # by default display pandas tables as BeakerX interactive tables
# Reached only if every import in this cell succeeded; a failed import raises before this line.
print('All packages successfully loaded')

# IPython autoreload extension, mode 2: modules are reloaded before each cell runs,
# so edits to imported source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


All packages successfully loaded


In [2]:
# 2. Load Data and Peak Sheet

# Workbook location, given as folder prefix + file name.
home = 'data/'
file = 'GastricCancer_NMR.xlsx'

# Read the 'Data' and 'Peak' sheets of the workbook into two tables.
DataTable, PeakTable = cb.utils.load_dataXL(
    home + file,
    DataSheet='Data',
    PeakSheet='Peak',
)

Loadings PeakFile: Peak
Loadings DataFile: Data
Data Table & Peak Table is suitable.
TOTAL SAMPLES: 140 TOTAL PEAKS: 149
Done!


In [5]:
# 3. Get X, and Y

# Clean PeakTable: keep peaks with QC_RSD below 20 and Perc_missing below 10.
RSD = PeakTable['QC_RSD']
PercMiss = PeakTable['Perc_missing']
PeakTableClean = PeakTable.loc[(RSD < 20) & (PercMiss < 10)]

# Restrict the data to the two classes being compared ("GC" and "HE").
DataTable2 = DataTable[DataTable['Class'].isin(["GC", "HE"])]

# Binary outcome vector used for stratifying the samples: 1 for 'GC', 0 otherwise.
Outcomes = DataTable2['Class']
Y = (Outcomes == 'GC').astype(int).values

# Stratified split of DataTable2 and Y: 25% held out, fixed seed for repeatability.
DataTrain, DataTest, Ytrain, Ytest = train_test_split(
    DataTable2,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=42,
)

# Training matrix: pick the cleaned peaks, then log-transform, scale, and impute.
peaklist = PeakTableClean['Name']
XT = DataTrain[peaklist]
XTlog = np.log(XT)
XTscale = cb.utils.scale(XTlog, method='auto')
XTknn = cb.utils.knnimpute(XTscale, k=3)


In [6]:
# 4. Optimise model hyperparameters

# Search grid: powers of two to try for 'gamma' and for 'C'.
param_dict_a = {'gamma': [2**power for power in (-7, -5, -3, -1)]}
param_dict_b = {'C': [2**power for power in (-7, -5, -3, -1, 2)]}
param_dict = dict(param_dict_a, **param_dict_b)

# Initialise the 10-fold cross-validation of cb.model.SVM over the grid.
cv = cb.cross_val.kfold(
    model=cb.model.SVM,
    X=XTknn,
    Y=Ytrain,
    param_dict=param_dict,
    folds=10,
)

# Run and plot
cv.run()
cv.plot()

Kfold: 100%|██████████| 20/20 [00:01<00:00, 13.90it/s]


metric changed from 'r2q2' to 'auc' as the model is non-parametric.
something


In [7]:
# 5. Train and evaluate model
# NOTE(review): SVM() is created with its default settings; none of the 'gamma'/'C'
# values explored in step 4 are passed in -- confirm this is intended.
model = cb.model.SVM()
# Fit on the pre-processed training matrix and its binary labels from step 3.
model.train(XTknn, Ytrain)
# NOTE(review): DataTest/Ytest from step 3 are not used in this cell, so this
# evaluation presumably reflects the training data only -- confirm.
model.evaluate(cutoffscore=0.5) 

In [8]:
# 6. Save tables to excel
#
# Each workbook is written inside a `with pd.ExcelWriter(...)` block: leaving the
# block closes the writer, which is what flushes the file to disk, and it does so
# even if `to_excel` raises. The explicit `writer.save()` call used previously was
# deprecated in pandas 1.5 and removed in pandas 2.0.

# CV full / cv
table = pd.DataFrame(cv.table)
with pd.ExcelWriter("gastriccancer_svm_cv.xlsx") as writer:
    table.to_excel(writer, index=False)

# Evaluate
table = pd.DataFrame(model.table)
with pd.ExcelWriter("gastriccancer_svm_eval.xlsx") as writer:
    table.to_excel(writer, index=False)