## Importing the libraries

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn import preprocessing
import pickle
import warnings
warnings.filterwarnings("ignore")

## Importing the dataset

In [26]:
dataset = pd.read_csv(r"dataset/train_data.csv")
#dataset = dataset.astype(int)
dataset = dataset.round(2)
dataset

Unnamed: 0,Infant,Heart Rate,Time (sec),Respiratory Rate
0,2,118.58,411.30,40.00
1,2,117.19,411.52,40.00
2,2,118.11,412.02,40.00
3,2,118.11,412.04,81.08
4,2,118.11,412.56,115.38
...,...,...,...,...
100112,10,106.01,158928.38,93.75
100113,10,102.04,160485.40,93.75
100114,10,104.17,161948.56,93.75
100115,10,245.90,162241.24,93.75


In [27]:
X = dataset[['Time (sec)','Respiratory Rate']] # features
y = dataset['Heart Rate']  # labels

In [28]:
# feature scaling
sc = preprocessing.StandardScaler()
X = sc.fit_transform(X)
y =  sc.fit_transform(y.values.reshape(-1, 1))

## Splitting the dataset into the Training set and Test set

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Grid Search Cross Validation to get the best hyperparameters

In [30]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=3)]
# Number of features to consider at every split
max_features = ['auto']

# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
#max_depth = [100,200,400,600]
max_depth = [200,400,600]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 8]

# Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True]

criterion = ["friedman_mse"]

# Create the random grid
grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion':criterion
       }
pprint(grid)


{'bootstrap': [True],
 'criterion': ['friedman_mse'],
 'max_depth': [200, 400, 600],
 'max_features': ['auto'],
 'min_samples_split': [2, 5, 8],
 'n_estimators': [200, 1100, 2000]}


In [31]:

rf = RandomForestRegressor()

rf_random = GridSearchCV(estimator=rf, param_grid=grid,cv=2,
                               verbose=20, n_jobs=1,scoring='neg_mean_squared_error')

rf_random.fit(X_train, y_train.ravel())


Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2; 1/27] START bootstrap=True, criterion=friedman_mse, max_depth=200, max_features=auto, min_samples_split=2, n_estimators=200
[CV 1/2; 1/27] END bootstrap=True, criterion=friedman_mse, max_depth=200, max_features=auto, min_samples_split=2, n_estimators=200;, score=-0.558 total time=  25.0s
[CV 2/2; 1/27] START bootstrap=True, criterion=friedman_mse, max_depth=200, max_features=auto, min_samples_split=2, n_estimators=200
[CV 2/2; 1/27] END bootstrap=True, criterion=friedman_mse, max_depth=200, max_features=auto, min_samples_split=2, n_estimators=200;, score=-0.545 total time=  26.4s
[CV 1/2; 2/27] START bootstrap=True, criterion=friedman_mse, max_depth=200, max_features=auto, min_samples_split=2, n_estimators=1100
[CV 1/2; 2/27] END bootstrap=True, criterion=friedman_mse, max_depth=200, max_features=auto, min_samples_split=2, n_estimators=1100;, score=-0.557 total time= 2.4min
[CV 2/2; 2/27] START bootstrap=True, criter

[CV 2/2; 14/27] END bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, min_samples_split=5, n_estimators=1100;, score=-0.541 total time= 2.0min
[CV 1/2; 15/27] START bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, min_samples_split=5, n_estimators=2000
[CV 1/2; 15/27] END bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, min_samples_split=5, n_estimators=2000;, score=-0.553 total time= 3.5min
[CV 2/2; 15/27] START bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, min_samples_split=5, n_estimators=2000
[CV 2/2; 15/27] END bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, min_samples_split=5, n_estimators=2000;, score=-0.541 total time= 3.5min
[CV 1/2; 16/27] START bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, min_samples_split=8, n_estimators=200
[CV 1/2; 16/27] END bootstrap=True, criterion=friedman_mse, max_depth=400, max_features=auto, m

In [32]:
print(rf_random.best_params_)
print(rf_random.best_score_)

{'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': 400, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 2000}
-0.5469618654053874


## Training the Random Forest Regression model with optimal hyperparameters

In [33]:
rf_regressor = RandomForestRegressor(n_estimators=2000, 
                                     bootstrap=True,
                                     criterion='friedman_mse',
                                     max_depth = 400,
                                     max_features = 'auto',
                                     min_samples_split=5)
                                    
rf_regressor.fit(X_train, y_train.ravel())


## Validation

In [35]:
y_pred = rf_regressor.predict(X_test)

In [36]:
print("Heart rate MSE:", mean_squared_error(y_test, y_pred))
print("Heart rate RMSE:", mean_squared_error(y_test, y_pred,squared=False))
print("Heart rate MAE:", mean_absolute_error(y_test, y_pred))


Heart rate MSE: 0.4496177125776639
Heart rate RMSE: 0.6705353924869767
Heart rate MAE: 0.39697090348394487


## Export the model

In [37]:
# save the model to disk
filename = 'models/random_forest.sav'
pickle.dump(rf_regressor, open(filename, 'wb'))