## Importing the libraries

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from pprint import pprint

## Importing the dataset

In [5]:
dataset = pd.read_csv(r"dataset/train_data.csv")
dataset = dataset.drop(['Time (sec)','Infant'],axis=1)
X = dataset.iloc[:, -1].values # respiratory rate data
y = dataset.iloc[:, 0:-1].values  # heart rate labels

X = X.reshape(len(X),1)

In [6]:
print(X)

[[40.  ]
 [40.  ]
 [40.  ]
 ...
 [93.75]
 [93.75]
 [93.75]]


In [7]:
print(y)

[[118.57707 ]
 [117.1875  ]
 [118.11024 ]
 ...
 [104.166664]
 [245.90164 ]
 [107.52688 ]]


## Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Random Search Cross Validation in Scikit-Learn

In [6]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train.ravel())


In [78]:
rf_random.best_params_


{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [11]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy


In [12]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train.ravel())
base_accuracy = evaluate(base_model, X_train, y_train.ravel())


Model Performance
Average Error: 24.2413 degrees.
Accuracy = 81.28%.


In [80]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_train, y_train.ravel())


Model Performance
Average Error: 24.2401 degrees.
Accuracy = 81.28%.


In [None]:
print('Improvement of {:0.2f}%.'.format(
    100 * (random_accuracy - base_accuracy) / base_accuracy))


## Training the Random Forest Regression model on the whole dataset

In [7]:
rf_regressor = RandomForestRegressor(n_estimators=1000, min_samples_split=10,
                                     min_samples_leaf=2, max_features='sqrt', max_depth=10, bootstrap=True, random_state=42)
rf_regressor.fit(X_train, y_train.ravel())


## Predicting the Test set results

In [9]:
y_pred = rf_regressor.predict(X_test)
errors = abs(y_pred - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))


Model Performance
Average Error: 24.3971 degrees.
Accuracy = 81.10%.


In [10]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))


[[143.45 178.57]
 [143.31 161.29]
 [141.43 119.05]
 ...
 [141.18 140.85]
 [139.61 111.94]
 [140.58 125.52]]


## Evaluating the Model Performance

In [12]:
from sklearn.metrics import mean_squared_error
print("Heart rate MSE:", mean_squared_error(y_test, y_pred))


Heart rate MSE: 968.7811634840056


## K-fold cross-validation.

In [31]:
from sklearn.model_selection import KFold

# define the number of folds for cross-validation
kf = KFold(n_splits=5)

In [32]:
# initialize a list to store the MSE for each fold
mse_list = []


In [33]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # train the random forest model
    rf = RandomForestRegressor(n_estimators=1000, min_samples_split=10,min_samples_leaf=2, max_features='sqrt', max_depth=10, bootstrap=True, random_state=42)
    rf.fit(X_train, y_train)

    # predict on the test set
    y_pred = rf.predict(X_test)

    # calculate the MSE for this fold
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)


  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  rf.fit(X_train, y_train)


In [34]:
# calculate the average MSE over all the folds
avg_mse = sum(mse_list) / len(mse_list)
print("Average MSE:", avg_mse)

Average MSE: 1025.5122370298309


## Export the model

In [None]:
# save the model to disk
filename = 'models/random_forest.sav'
pickle.dump(model, open(filename, 'wb'))