In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Loading and transposing the dataset

df = pd.read_csv("Train_call.csv")
df = df.T
df = df.astype(float)
x = df.iloc[4:,:]


In [3]:
labels = pd.read_csv("Train_clinical.csv")
y = labels.iloc[1:,1]
y.value_counts()

HR+           36
HER2+         32
Triple Neg    32
Name: C2, dtype: int64

In [4]:
## Implementing random search to hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [5]:
## Nested CV process

# Outer CV
cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)

# Lists for the inner CV results
outer_results = list()
best_parameters = list()

for train_ix, test_ix in cv_outer.split(x):
    # Train and test split in the inner cross-validation set
    x_train, x_test = x.iloc[train_ix, :], x.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    
    # Configure the inner cross-validation procedure
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
    
    # Define the model
    model = RandomForestClassifier(random_state=1)
    
    # Define search space
    search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 50, cv = cv_inner, random_state=42, n_jobs = -1)
    result = search.fit(x_train, y_train)
    best_model = result.best_estimator_
    
    # Evaluate model on the hold out dataset
    yhat = best_model.predict(x_test)
    
    # Evaluate the model
    acc = accuracy_score(y_test, yhat)
    
    # Store the result
    outer_results.append(acc)
    best_parameters.append(result.best_params_)

    # Report progress
    print('acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))


KeyboardInterrupt: 

In [6]:
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))
print("We choose model {} for the final model, as it showed the highest accuracy".format(outer_results.index(max(outer_results))+1))

Accuracy: 0.710 (0.037)
We choose model 1 for the final model, as it showed the highest accuracy
