In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Loading and transposing the dataset

df = pd.read_csv("Train_117.csv")
df = df.T
x = df.iloc[5:,:]

# Loading the labels
labels = pd.read_csv("Train_clinical.csv")
y = labels.iloc[1:,1]

# Splitting the data into train and testset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [3]:
## Implementing random search to hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Below takes forever to run. Do not run it if not necessary

In [4]:
## First Nested CV process

# Outer CV
cv_outer = KFold(n_splits=5, shuffle=True, random_state=2)

# Dataframe for the inner CV results

cols = {"n_estimators":[],"min_samples_split":[],"min_samples_leaf":[],"max_features":[],"max_depth":[],"bootstrap":[]}
best_parameters = pd.DataFrame(data=cols)

for i in range(10):
    print("In progress...{}/10".format(i+1))

    for train_ix, test_ix in cv_outer.split(x_train):
        # Train and test split in the inner cross-validation set
        x_in_train, x_in_test = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
        y_in_train, y_in_test = y_train.iloc[train_ix], y_train.iloc[test_ix]

        # Configure the inner cross-validation procedure
        cv_inner = KFold(n_splits=4, shuffle=True, random_state=i)

        # Define the model
        model = RandomForestClassifier(random_state=i)

        # Define search space
        search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 50, cv = cv_inner, random_state=i, n_jobs = -1)
        result = search.fit(x_in_train, y_in_train)
        best_model = result.best_estimator_

        # Evaluate model on the hold out dataset
        yhat = best_model.predict(x_in_test)

        # Evaluate the model
        acc = accuracy_score(y_in_test, yhat)

        # Store the result
        print("   Inner loop running...")
        best_parameters = best_parameters.append(result.best_params_,ignore_index = True)
    if i == 9:
        print("Done")

In progress...1/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...2/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...3/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...4/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...5/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...6/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
In progress...7/10
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop running...
   Inner loop run

In [5]:
best_parameters

Unnamed: 0,n_estimators,min_samples_split,min_samples_leaf,max_features,max_depth,bootstrap
0,1000.0,2.0,2.0,auto,70.0,1.0
1,400.0,5.0,4.0,,20.0,0.0
2,800.0,10.0,2.0,,110.0,1.0
3,600.0,5.0,2.0,,,1.0
4,1600.0,10.0,1.0,,10.0,0.0
5,1000.0,10.0,4.0,,90.0,0.0
6,600.0,10.0,2.0,,30.0,0.0
7,400.0,5.0,4.0,,100.0,1.0
8,400.0,5.0,4.0,,100.0,1.0
9,1000.0,10.0,4.0,,90.0,0.0


In [6]:
best_parameters.to_csv("RandomSearch_Best_Parameters_hubgenes.csv")

In [7]:
# See the frequency of best hyperparameter in Random Search
for col in best_parameters.columns:
    print(best_parameters[col].value_counts(),"\n")

1200.0    9
1800.0    7
800.0     6
400.0     5
200.0     5
1400.0    5
1000.0    4
600.0     4
1600.0    3
2000.0    2
Name: n_estimators, dtype: int64 

2.0     18
5.0     18
10.0    14
Name: min_samples_split, dtype: int64 

1.0    22
2.0    17
4.0    11
Name: min_samples_leaf, dtype: int64 

auto    10
Name: max_features, dtype: int64 

20.0     14
40.0      5
80.0      4
110.0     3
10.0      3
90.0      3
30.0      3
100.0     3
70.0      2
50.0      2
60.0      2
Name: max_depth, dtype: int64 

1.0    33
0.0    17
Name: bootstrap, dtype: int64 



In [8]:
from sklearn.model_selection import GridSearchCV
## Grid Search

# Outer CV
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# enumerate splits
outer_results = list()

for train_ix, test_ix in cv_outer.split(x_train):
    # Train and test split in the inner cross-validation set
    x_in_train, x_in_test = x_train.iloc[train_ix, :], x_train.iloc[test_ix, :]
    y_in_train, y_in_test = y_train.iloc[train_ix], y_train.iloc[test_ix]

    # configure the cross-validation procedure
    cv_inner = KFold(n_splits=4, shuffle=True, random_state=42)
    # define the model
    # max_depth and bootstrap are fixed from the most frequent one
    model = RandomForestClassifier(max_depth = 20, random_state=42, bootstrap= True)
    
    # define search space based on the frequency above
    # total 12 combination possible
    space = dict()
    space['n_estimators'] = [800, 1200, 1800]
    space['min_samples_split'] = [2,5]
    space['min_samples_leaf'] = [1,2]
    # define search
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)
    # execute search
    result = search.fit(x_in_train, y_in_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(x_in_test)
    # evaluate the model
    acc = accuracy_score(y_in_test, yhat)
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))

# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.750, est=0.812, cfg={'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1800}
>acc=0.688, est=0.797, cfg={'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 800}
>acc=0.812, est=0.750, cfg={'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 800}
>acc=0.812, est=0.672, cfg={'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 800}
>acc=0.750, est=0.844, cfg={'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 800}
Accuracy: 0.762 (0.047)


# I've already tested this on the test set and saw the accuracy, but I'll not leave it here, in case any of you want to change something above.

In [11]:
# Final model with hand-picked hyperparameters from the 5 best models of outer CV
final_model = RandomForestClassifier(n_estimators = 800, min_samples_split = 2, min_samples_leaf = 1, max_depth = 20, bootstrap = True)

In [12]:
final_model.fit(x_train,y_train)
pred = final_model.predict(x_test)
acc = accuracy_score(pred,y_test)
acc

0.8

In [20]:
# Final model on raw data

df_raw = pd.read_csv("Train_call.csv")
df_raw = df_raw.T
x_raw = df_raw.iloc[4:,:]

# Splitting the data into train and testset
x_raw_train, x_raw_test, y_train, y_test = train_test_split(x_raw, y, test_size=0.2, random_state=42)

In [22]:
final_model_raw = final_model
final_model_raw.fit(x_raw_train, y_train)
pred_raw = final_model_raw.predict(x_raw_test)
acc_raw = accuracy_score(pred_raw, y_test)
acc_raw

0.75