In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Loading and transposing the dataset

df = pd.read_csv("Train_117.csv")
df = df.T
x = df.iloc[5:,:]

# Loading the labels
labels = pd.read_csv("Train_clinical.csv")
y = labels.iloc[1:,1]

# Splitting the data into train and testset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [3]:
## Implementing random search to hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Below takes forever to run. Do not run it if not necessary

In [4]:
## First Nested CV process

# Outer CV
cv_outer = KFold(n_splits=5, shuffle=True, random_state=2)

# Dataframe for the inner CV results

cols = {"n_estimators":[],"min_samples_split":[],"min_samples_leaf":[],"max_features":[],"max_depth":[],"bootstrap":[]}
best_parameters = pd.DataFrame(data=cols)

for i in range(10):
    print("In progress...{}/10".format(i+1))

    for train_ix, test_ix in cv_outer.split(x_train):
        # Train and test split in the inner cross-validation set
        x_in_train, x_in_test = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
        y_in_train, y_in_test = y_train.iloc[train_ix], y_train.iloc[test_ix]

        # Configure the inner cross-validation procedure
        cv_inner = KFold(n_splits=4, shuffle=True, random_state=i)

        # Define the model
        model = RandomForestClassifier(random_state=i)

        # Define search space
        search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 50, cv = cv_inner, random_state=i, n_jobs = -1)
        result = search.fit(x_in_train, y_in_train)
        best_model = result.best_estimator_

        # Evaluate model on the hold out dataset
        yhat = best_model.predict(x_in_test)

        # Evaluate the model
        acc = accuracy_score(y_in_test, yhat)

        # Store the result
        print("   Inner loop running...")
        best_parameters = best_parameters.append(result.best_params_,ignore_index = True)
    if i == 9:
        print("Done")

In progress...1/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...2/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...3/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...4/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...5/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...6/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...7/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop run

In [5]:
best_parameters

Unnamed: 0,n_estimators,min_samples_split,min_samples_leaf,max_features,max_depth,bootstrap
0,600.0,2.0,2.0,sqrt,20.0,1.0
1,200.0,2.0,2.0,sqrt,80.0,1.0
2,200.0,5.0,2.0,sqrt,20.0,0.0
3,1200.0,2.0,1.0,sqrt,90.0,1.0
4,1800.0,10.0,1.0,auto,100.0,0.0
5,1200.0,2.0,2.0,sqrt,90.0,1.0
6,800.0,2.0,1.0,auto,100.0,0.0
7,1200.0,10.0,1.0,sqrt,40.0,0.0
8,1800.0,5.0,2.0,auto,20.0,0.0
9,200.0,2.0,1.0,sqrt,10.0,1.0


In [6]:
best_parameters.to_csv("RandomSearch_Best_Parameters.csv")

In [11]:
# See the frequency of best hyperparameter in Random Search
for col in best_parameters.columns:
    print(best_parameters[col].value_counts(),"\n")

200.0     13
1800.0     8
1200.0     5
1400.0     5
800.0      4
1000.0     4
400.0      4
1600.0     4
600.0      2
2000.0     1
Name: n_estimators, dtype: int64 

2.0     22
5.0     18
10.0    10
Name: min_samples_split, dtype: int64 

1.0    32
2.0    15
4.0     3
Name: min_samples_leaf, dtype: int64 

auto    26
sqrt    24
Name: max_features, dtype: int64 

20.0     13
100.0     7
80.0      6
90.0      5
10.0      4
40.0      3
60.0      3
50.0      3
70.0      1
110.0     1
30.0      1
Name: max_depth, dtype: int64 

1.0    25
0.0    25
Name: bootstrap, dtype: int64 



In [14]:
from sklearn.model_selection import GridSearchCV
## Grid Search

# Outer CV
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# enumerate splits
outer_results = list()

for train_ix, test_ix in cv_outer.split(x_train):
    # Train and test split in the inner cross-validation set
    x_in_train, x_in_test = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
    y_in_train, y_in_test = y_train.iloc[train_ix], y_train.iloc[test_ix]

    # configure the cross-validation procedure
    cv_inner = KFold(n_splits=4, shuffle=True, random_state=42)
    # define the model
    model = RandomForestClassifier(n_estimators = 200, max_depth = 20, random_state=42)
    # define search space
    space = dict()
    space['min_samples_split'] = [2, 5]
    space['min_samples_leaf'] = [1, 2]
    space['max_features'] = ['auto','sqrt']
    space['bootstrap'] = [True,False]
    # define search
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)
    # execute search
    result = search.fit(x_in_train, y_in_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(x_in_test)
    # evaluate the model
    acc = accuracy_score(y_in_test, yhat)
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.875, est=0.766, cfg={'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5}
>acc=0.750, est=0.828, cfg={'bootstrap': False, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
>acc=0.750, est=0.766, cfg={'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5}
>acc=0.812, est=0.656, cfg={'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5}
>acc=0.750, est=0.812, cfg={'bootstrap': True, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 0.787 (0.050)


# I've already tested this on the test set and saw the accuracy, but I'll not leave it here, in case any of you want to change it.

In [16]:
# Final Model

# Hand picking hyperparameters from GridSearch CV
final_model = RandomForestClassifier(n_estimators = 200, max_depth = 20, min_samples_split = 5, min_samples_leaf = 1, max_features = "auto", bootstrap = True)