In [None]:
# Required packages
import random
import pandas as pd
import numpy as np
from sklearn import svm, metrics, neighbors, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Seed reused by every KFold splitter in this notebook
rs = 42

# Read the input table; the first CSV column becomes the row index
s1 = pd.read_csv("cna_s1.csv", index_col=0)

In [None]:
# Optional: uncomment the lines below to inspect or change the working directory
#import os
#print(os.getcwd())
#os.chdir('/home/CNIO.ES/asanchezb/esophageous_cancer/intensities/')
#print(os.getcwd())

In [None]:
# Split the table into features (every column except the last) and target (last column)
x = s1.iloc[:,:-1]
y = s1.iloc[:,-1]

# Scale every feature to the [0, 1] range.
# fit_transform returns a new array instead of modifying x in place, so the result
# must be assigned back; it is re-wrapped as a DataFrame with the original index and
# columns because the cross-validation cells below index x with .iloc.
scaler = MinMaxScaler()
x = pd.DataFrame(scaler.fit_transform(x), index=x.index, columns=x.columns)
# NOTE(review): the scaler is fitted on all rows, including those later held out as
# test folds; consider scaling inside each fold (e.g. a sklearn Pipeline) to avoid leakage.

# Show a preview of the scaled features as the cell output
x.head()

In [None]:
# Outer cross-validation splitter: 5 shuffled folds, seeded with rs
cv_outer = KFold(
    n_splits=5,
    shuffle=True,
    random_state=rs,
)

### LINEAR REGRESSION ###

In [None]:
# Collects one outer-fold score per iteration of the Ridge loop below
outer_results_ridge = list()

In [None]:
# Nested cross-validation for Ridge regression.
# To use another metric, change both `sco` (used by the inner grid search) and the
# metrics.* call that scores the outer test fold.

# Settings that are identical for every outer fold
sco = 'r2'
par = {'alpha': np.arange(0,1,0.01)}
cv_inner = KFold(n_splits=3, shuffle=True, random_state=rs)

for train_ix, test_ix in cv_outer.split(x):

    # rows of this outer fold: training part and held-out part
    x_train, y_train = x.iloc[train_ix,:], y.iloc[train_ix]
    x_test, y_test = x.iloc[test_ix,:], y.iloc[test_ix]

    # inner grid search over alpha, run on the training part only
    model = linear_model.Ridge()
    search = GridSearchCV(model, par, scoring=sco, cv=cv_inner)
    result = search.fit(x_train, y_train)

    # score the best estimator found by the search on the held-out part
    best_model = result.best_estimator_
    yhat = best_model.predict(x_test)
    acc = metrics.r2_score(y_test, yhat)  # despite the name, this holds an R2 score
    outer_results_ridge.append(acc)

    # outer-fold score, best inner-CV score and the selected parameters
    print('>metric=%.3f, train=%.3f, par=%s' % (acc, result.best_score_, result.best_params_))

# mean and standard deviation of the outer-fold scores
print('Metric: %.3f (%.3f)' % (np.mean(outer_results_ridge), np.std(outer_results_ridge)))

### SVM REGRESSION ###

In [None]:
# Collects one outer-fold score per iteration of the SVR loop below
outer_results_svm = list()

In [None]:
# Nested cross-validation for an RBF-kernel SVR.

# Settings that are identical for every outer fold
sco = 'r2'
par = {
    'C': [1,10,100],
    'gamma': ['scale',0.001,0.01,0.1],
}
cv_inner = KFold(n_splits=3, shuffle=True, random_state=rs)

for train_ix, test_ix in cv_outer.split(x):

    # rows of this outer fold: training part and held-out part
    x_train, y_train = x.iloc[train_ix,:], y.iloc[train_ix]
    x_test, y_test = x.iloc[test_ix,:], y.iloc[test_ix]

    # inner grid search over C and gamma, run on the training part only
    model = svm.SVR(kernel='rbf')
    search = GridSearchCV(model, par, scoring=sco, cv=cv_inner)
    result = search.fit(x_train, y_train)

    # score the best estimator found by the search on the held-out part
    best_model = result.best_estimator_
    yhat = best_model.predict(x_test)
    acc = metrics.r2_score(y_test, yhat)  # despite the name, this holds an R2 score
    outer_results_svm.append(acc)

    # outer-fold score, best inner-CV score and the selected parameters
    print('>metric=%.3f, train=%.3f, par=%s' % (acc, result.best_score_, result.best_params_))

# mean and standard deviation of the outer-fold scores
print('Metric: %.3f (%.3f)' % (np.mean(outer_results_svm), np.std(outer_results_svm)))

### KNN REGRESSION ###

In [None]:
# Collects one outer-fold score per iteration of the kNN loop below
outer_results_knn = list()

In [None]:
# Nested cross-validation for k-nearest-neighbours regression.
for train_ix, test_ix in cv_outer.split(x):

    # split in test and training set
    x_train, x_test = x.iloc[train_ix,:], x.iloc[test_ix,:]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # inner cv
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=rs)
    # model
    model = neighbors.KNeighborsRegressor()
    # parameters knn: number of neighbours (1..20) and Minkowski power p (1..5)
    par = dict()
    par['n_neighbors'] = np.arange(1,21,1)
    par['p'] = np.arange(1,6,1)
    # scoring
    sco = 'r2'
    # tuning of hyperparameters on the training part only
    search = GridSearchCV(model, par, scoring=sco, cv=cv_inner)
    # result
    result_knn = search.fit(x_train, y_train)
    # get the best model from THIS search; the previous code read `result`, the
    # stale search object left behind by the SVR cell, so the kNN scores were
    # actually computed with an SVR model fitted on a different fold
    best_model = result_knn.best_estimator_
    # prediction values about x test
    yhat = best_model.predict(x_test)
    # performance metric (R2; the variable keeps its historical name `acc`)
    acc = metrics.r2_score(y_test, yhat)
    outer_results_knn.append(acc)
    # outer-fold score, best inner-CV score and the selected parameters
    print('>metric=%.3f, train=%.3f, par=%s' % (acc, result_knn.best_score_, result_knn.best_params_))

# mean and standard deviation of the outer-fold scores
print('Metric: %.3f (%.3f)' % (np.mean(outer_results_knn), np.std(outer_results_knn)))

### SUMMARY RESULTS R2 ###

In [None]:
# One summary line per model: mean and standard deviation of its outer-fold scores
for fmt, scores in (
    ('Ridge: %.3f (%.3f)', outer_results_ridge),
    ('SVM: %.3f (%.3f)', outer_results_svm),
    ('kNN: %.3f (%.3f)', outer_results_knn),
):
    print(fmt % (np.mean(scores), np.std(scores)))