In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [15]:
# Loading and transposing the dataset

df = pd.read_csv("Train_117.csv")
df = df.T
x = df.iloc[5:,:]

# Loading the labels
labels = pd.read_csv("Train_clinical.csv")
y = labels.iloc[1:,1]

# Splitting the data into train and testset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107,108,109,110,111,112,113,114,115,116
Array.88,0,0,0,-1,-1,0,0,0,0,0,...,-1,0,0,0,0,1,1,1,0,-1
Array.47,-1,-1,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,-1,0,-1,0,1
Array.110,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
Array.56,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,0,-1,1,0,0,-1,-1
Array.82,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,0,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Array.68,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,1,0,0,0,0,0,0,0,0
Array.139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Array.118,0,0,0,0,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Array.30,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1


In [16]:
## Implementing random search to hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [17]:
## Nested CV process

# Outer CV
cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)

# Lists for the inner CV results
outer_results = list()
best_parameters = list()

for train_ix, test_ix in cv_outer.split(x_test):
    # Train and test split in the inner cross-validation set
    x_in_train, x_in_test = x_test.iloc[train_ix, :], x_test.iloc[test_ix, :]
    y_in_train, y_in_test = y_test.iloc[train_ix], y_test.iloc[test_ix]
    
    # Configure the inner cross-validation procedure
    cv_inner = KFold(n_splits=4, shuffle=True, random_state=1)
    
    # Define the model
    model = RandomForestClassifier(random_state=1)
    
    # Define search space
    search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 50, cv = cv_inner, random_state=42, n_jobs = -1)
    result = search.fit(x_in_train, y_in_train)
    best_model = result.best_estimator_
    
    # Evaluate model on the hold out dataset
    yhat = best_model.predict(x_in_test)
    
    # Evaluate the model
    acc = accuracy_score(y_in_test, yhat)
    
    # Store the result
    outer_results.append(acc)
    best_parameters.append(result.best_params_)

    # Report progress
    print('acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))


acc=0.000, est=0.625, cfg={'n_estimators': 1400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 40, 'bootstrap': False}
acc=0.500, est=0.438, cfg={'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': False}
acc=0.250, est=0.562, cfg={'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': True}
acc=0.750, est=0.438, cfg={'n_estimators': 800, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}
acc=0.000, est=0.375, cfg={'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}


In [19]:
print('Average Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))
print("We choose model {} for the final model, as it showed the highest accuracy".format(outer_results.index(max(outer_results))+1))

Average Accuracy: 0.300 (0.292)
We choose model 4 for the final model, as it showed the highest accuracy
