In [None]:
## Import required libraries ##
import sys, os, pickle
sys.path.append(os.path.abspath('../tools'))
# Third party imports
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# Local imports
import preprocess as pre

In [None]:
## Select route to train ##
# Route identifier as a string; it is interpolated into the input CSV path
# and into the model output directory in the filepath cell.
route = '4017105'

In [None]:
## Set filepaths ##
# Input: the per-route CSV under ../../data/ (resolved to an absolute path).
inputPath = os.path.abspath(f'../../data/{route}.csv')
# makedirs creates any missing parent (e.g. '../models' itself), and
# exist_ok=True makes the call a no-op when the directory is already there,
# which removes the check-then-create race of isdir() followed by mkdir().
os.makedirs(f'../models/{route}/', exist_ok=True)
# Output: file the trained model is pickled to (no extension).
outputPath = f'../models/{route}/randomforest'

In [None]:
## Read and process data ##
# Load the route CSV, then feed it through the preprocessing helpers in order;
# each helper receives the previous helper's result.
df = pd.read_csv(inputPath)
for step in (pre.convertTime, pre.calculateETA, pre.splitTime):
    df = step(df)
# Remove the 'Heading' column before the feature matrix is built.
df = df.drop(columns=['Heading'])
# Report the number of missing values in each remaining column.
print(df.isnull().sum())

In [None]:
# Rich (HTML) rendering of the processed frame for a visual sanity check.
display(df)

In [None]:
## Split data into dependent and independent variables ##
# Features: every column except the target, as a 2-D NumPy array.
X = df.drop(columns='ETA').to_numpy()
# Target: selected by name so it is a 1-D array of shape (n_samples,).
# A boolean-mask iloc selection yields an (n_samples, 1) column vector, which
# makes scikit-learn emit a DataConversionWarning during fit, and would
# silently produce an empty target if the 'ETA' column were missing.
y = df['ETA'].to_numpy()

In [None]:
## Split data into training and test sets ##
# shuffle=False keeps the original row order, so the final 20% of the rows
# become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=False,
)

In [None]:
## Train RFR model on the training set ##
# Hyperparameters are gathered in one mapping so they are easy to scan and tune;
# random_state pins the forest's randomness so retraining is repeatable.
rf_params = {
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'random_state': 42,
}
regressor = RandomForestRegressor(**rf_params)
# Kept as the cell's last expression so the fitted estimator is still displayed.
regressor.fit(X_train, y_train)

In [None]:
## Predicting the test set results ##
# Predict the target for the held-out rows; y_pred is reused by the
# metrics cell further down.
y_pred = regressor.predict(X_test)
print(y_pred)

In [None]:
## Determine feature importance ##
# Score the importances on the held-out split only: evaluating on the full
# X, y would be dominated by the 80% of rows the model was trained on and
# would not reflect which features matter for generalisation.
# random_state fixes the permutation shuffles so the scores are reproducible.
results = permutation_importance(
    regressor,
    X_test,
    y_test,
    scoring='neg_mean_squared_error',
    random_state=42,
)
importance = results.importances_mean
# The feature index is the column position in X (df's columns minus 'ETA').
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

In [None]:
## Calculate accuracy ##
# Regression metrics on the held-out predictions (these are goodness-of-fit
# and error measures rather than a classification accuracy).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Report the three scores in a fixed order, one per line.
for label, value in (
    ('R-squared score:', r2),
    ('Mean absolute error:', mae),
    ('Mean squared error:', mse),
):
    print(label, value)

In [None]:
## Save trained model ##
# The context manager guarantees the file handle is flushed and closed even
# if pickling raises; a bare open() inside the call leaves that to the
# garbage collector.
# NOTE: only unpickle this file from a trusted location, since pickle.load
# can execute arbitrary code.
with open(outputPath, 'wb') as model_file:
    pickle.dump(regressor, model_file)