# ML Model

In [210]:
# Split the prepared `train` frame into train and test partitions.
# 20% of the rows are held out; random_state pins the shuffle.
# Features and target are both selected by column name, so the split no longer
# relies on 'fare_amount' being the first column of `train`.

x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns='fare_amount'),
    train['fare_amount'],
    test_size=0.20,
    random_state=1,
)

In [211]:
# (rows, columns) of the training feature matrix.
train_rows, train_cols = x_train.shape
print((train_rows, train_cols))

(12314, 7)


In [212]:
# (rows, columns) of the held-out feature matrix.
test_rows, test_cols = x_test.shape
print((test_rows, test_cols))

(3079, 7)


**Linear Regression Model**

In [213]:
# Fit a linear regression with default settings on the training split.
# `model` is reassigned by each later model section, so the evaluation cells
# below score whichever estimator was fitted most recently.
model = LinearRegression()
model.fit(x_train, y_train)

In [214]:
# Summary statistics of the training target, printed under a short heading.
fare_summary = y_train.describe()
print("Fare amount stats in training set:", fare_summary, sep="\n")

Fare amount stats in training set:
count    12314.000000
mean         2.333152
std          0.545348
min          0.760806
25%          1.945910
50%          2.251292
75%          2.602690
max          6.118097
Name: fare_amount, dtype: float64


In [215]:
# Score the current `model` on the training split.
y_pred_train = model.predict(x_train)

# RMSE is the square root of the mean squared error.
mse_train_LR = mean_squared_error(y_train, y_pred_train)
RMSE_train_LR = np.sqrt(mse_train_LR)
print("Root Mean Squared Error for Train data = ", RMSE_train_LR, sep="")

# Coefficient of determination for the same predictions.
R2_train_LR = r2_score(y_train, y_pred_train)
print("R² score for Train data = ", R2_train_LR)

Root Mean Squared Error for Train data = 0.2574863274165948
R² score for Train data =  0.777055946176191


In [216]:
# Score the current `model` on the held-out split.
y_pred_test = model.predict(x_test)

# RMSE is the square root of the mean squared error.
mse_test_LR = mean_squared_error(y_test, y_pred_test)
RMSE_test_LR = np.sqrt(mse_test_LR)
print("Root Mean Squared Error for Test data = ", RMSE_test_LR, sep="")

# Coefficient of determination for the same predictions.
R2_test_LR = r2_score(y_test, y_pred_test)
print("R² score for Test data = ", R2_test_LR)

Root Mean Squared Error for Test data = 0.24635441508894804
R² score for Test data =  0.7915949185124902


**Decision Tree Model**

In [217]:
# Depth-2 regression tree fitted on the training split.
# random_state is pinned: scikit-learn permutes the candidate features at every
# split, so equally good splits could otherwise be chosen differently per run.
model = DecisionTreeRegressor(max_depth=2, random_state=42)
model.fit(x_train, y_train)

In [218]:
# Predict on Train data with the current `model`
y_pred_train = model.predict(x_train)

# Compute RMSE
RMSE_train_DT = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Root Mean Squared Error for Train data = " + str(RMSE_train_DT))

# Compute R² score and keep it in a named variable, as the linear regression
# cells do, so it stays available for a later model comparison.
R2_train_DT = r2_score(y_train, y_pred_train)
print("R²_score for Train data = ", R2_train_DT)

Root Mean Squared Error for Train data = 0.29407567706468896
R²_score for Train data =  0.7091923953523228


In [219]:
# Predict on Test data with the current `model`
y_pred_test = model.predict(x_test)

# Compute RMSE
RMSE_test_DT = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Root Mean Squared Error for Test Data = " + str(RMSE_test_DT))

# Compute R² score and keep it in a named variable, as the linear regression
# cells do, so it stays available for a later model comparison.
R2_test_DT = r2_score(y_test, y_pred_test)
print("R²_score for Test data = ", R2_test_DT)

Root Mean Squared Error for Test Data = 0.28399409889905275
R²_score for Test data =  0.72304689659482


**Random Forest Model**

In [220]:
# Random forest of 200 trees fitted on the training split.
# Bootstrap sampling makes the forest stochastic; without a fixed random_state
# the metrics printed below change on every Restart & Run All.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(x_train, y_train)

In [221]:
# Predict on Train data with the current `model`
y_pred_train = model.predict(x_train)

# Compute RMSE
RMSE_train_RF = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Root Mean Squared Error for Train Data = " + str(RMSE_train_RF))

# Compute R² score and keep it in a named variable, as the linear regression
# cells do, so it stays available for a later model comparison.
R2_train_RF = r2_score(y_train, y_pred_train)
print("R²_score for Train Data = ", R2_train_RF)

Root Mean Squared Error for Train Data = 0.09374582122890777
R²_score for Train Data =  0.9704476934009677


In [222]:
# Predict on Test data with the current `model`
y_pred_test = model.predict(x_test)

# Compute RMSE
RMSE_test_RF = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Root Mean Squared Error for Test Data = " + str(RMSE_test_RF))

# Compute R² score and keep it in a named variable, as the linear regression
# cells do, so it stays available for a later model comparison.
R2_test_RF = r2_score(y_test, y_pred_test)
print("R²_score for Test data = ", R2_test_RF)

Root Mean Squared Error for Test Data = 0.24569661549789898
R²_score for Test data =  0.7927063721435674


**Gradient Boosting Model**

In [223]:
# Gradient boosting regressor with default hyperparameters, fitted on the
# training split. random_state is pinned so the seed handed to each boosted
# tree (and the feature permutation at each split) is the same on every run.
model = GradientBoostingRegressor(random_state=42)
model.fit(x_train, y_train)

In [224]:
# Predict on Train data with the current `model`
y_pred_train = model.predict(x_train)

# Compute RMSE
RMSE_train_GB = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Root Mean Squared Error for Train Data = " + str(RMSE_train_GB))

# Compute R² score and keep it in a named variable, as the linear regression
# cells do, so it stays available for a later model comparison.
R2_train_GB = r2_score(y_train, y_pred_train)
print("R²_score for Train data = ", R2_train_GB)

Root Mean Squared Error for Train Data = 0.22335519610240684
R²_score for Train data =  0.8322433794987887


In [225]:
# Predict on Test data with the current `model`
y_pred_test = model.predict(x_test)

# Compute RMSE
RMSE_test_GB = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Root Mean Squared Error for Test Data = " + str(RMSE_test_GB))

# Compute R² score and keep it in a named variable, as the linear regression
# cells do, so it stays available for a later model comparison.
R2_test_GB = r2_score(y_test, y_pred_test)
print("R²_score for Test data = ", R2_test_GB)

Root Mean Squared Error for Test Data = 0.23291257079079958
R²_score for Test data =  0.8137168976496285


# Hyperparameter Tuning

**Random Forest**

In [226]:
# Build a seeded forest with otherwise default settings and list the
# hyperparameters it exposes, as a starting point for tuning.
rf = RandomForestRegressor(random_state=42)
rf_current_params = rf.get_params()

print('Parameters currently in use:\n')
pprint(rf_current_params)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [227]:
# Randomized search setup for the random forest: base estimator plus the
# candidate values for each hyperparameter.

RRF = RandomForestRegressor(random_state=0)
n_estimator = [n_trees for n_trees in range(1, 20, 2)]   # odd values 1..19
depth = [max_d for max_d in range(1, 100, 2)]            # odd values 1..99

In [228]:
# Create the random grid

rand_grid = {'n_estimators': n_estimator,
               'max_depth': depth}

# 5 sampled parameter combinations, each scored with 5-fold cross-validation.
randomcv_rf = RandomizedSearchCV(RRF, param_distributions = rand_grid, n_iter = 5, cv = 5, random_state=0)

randomcv_rf = randomcv_rf.fit(x_train,y_train)

view_best_params_RRF = randomcv_rf.best_params_

best_model = randomcv_rf.best_estimator_

# Predict the test split once. With the default refit behaviour the search
# object's own predict() delegates to best_estimator_, so the second
# prediction pass in the original was redundant.
predictions_RRF = best_model.predict(x_test)

# Alias kept so any cell that still reads the singular name keeps working.
prediction_RRF = predictions_RRF

In [229]:
# Test-split metrics for the random-search forest predictions.
mse_RRF = mean_squared_error(y_test, predictions_RRF)

# RMSE
RRF_rmse = np.sqrt(mse_RRF)

# R-squared
RRF_r2 = r2_score(y_test, predictions_RRF)

In [230]:
# Report the random-search result for the random forest.
print('Random Search CV Random Forest Regressor Model Performance:')
print('Best Parameters = ',view_best_params_RRF)
# '{:.2f}' gives two decimal places. The previous '{:0.2}' had no type code,
# which means two significant digits (0.80 printed as "0.8", 0.081 as "0.081").
print('R-squared = {:.2f}.'.format(RRF_r2))
print('RMSE = ',RRF_rmse)

Random Search CV Random Forest Regressor Model Performance:
Best Parameters =  {'n_estimators': 15, 'max_depth': 9}
R-squared = 0.81.
RMSE =  0.23798160010688338


**Gradient Boosting**

In [231]:
# Build a seeded gradient boosting regressor with otherwise default settings.
gb = GradientBoostingRegressor(random_state = 42)

# Look at the parameters used by the current gradient boosting model

print('Parameters currently in use:\n')
pprint(gb.get_params())

Parameters currently in use:

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [232]:
# Randomized search setup for gradient boosting: base estimator plus the
# candidate values for each hyperparameter.

gb = GradientBoostingRegressor(random_state=0)
n_estimator = [n_trees for n_trees in range(1, 20, 2)]   # odd values 1..19
depth = [max_d for max_d in range(1, 100, 2)]            # odd values 1..99

In [233]:
# Create the random grid

rand_grid = {'n_estimators': n_estimator,
               'max_depth': depth}

# 5 sampled parameter combinations, each scored with 5-fold cross-validation.
randomcv_gb = RandomizedSearchCV(gb, param_distributions = rand_grid, n_iter = 5, cv = 5, random_state=0)

randomcv_gb = randomcv_gb.fit(x_train,y_train)

view_best_params_gb = randomcv_gb.best_params_

best_model = randomcv_gb.best_estimator_

# Predict the test split once. With the default refit behaviour the search
# object's own predict() delegates to best_estimator_, so the second
# prediction pass in the original was redundant.
predictions_gb = best_model.predict(x_test)

# Alias kept so any cell that still reads the singular name keeps working.
prediction_gb = predictions_gb

In [234]:
# Test-split metrics for the random-search gradient boosting predictions.
mse_gb = mean_squared_error(y_test, predictions_gb)

# RMSE
gb_rmse = np.sqrt(mse_gb)

# R-squared
gb_r2 = r2_score(y_test, predictions_gb)

In [235]:
# Report the random-search result for gradient boosting.
print('Random Search CV Gradient Boosting Model Performance:')
print('Best Parameters = ',view_best_params_gb)
# '{:.2f}' gives two decimal places. The previous '{:0.2}' had no type code,
# which means two significant digits (0.80 printed as "0.8", 0.081 as "0.081").
print('R-squared = {:.2f}.'.format(gb_r2))
print('RMSE = ',gb_rmse)

Random Search CV Gradient Boosting Model Performance:
Best Parameters =  {'n_estimators': 15, 'max_depth': 9}
R-squared = 0.76.
RMSE =  0.26353421222668355


**Grid Search CV**

In [239]:
# Exhaustive grid search for the random forest.

# Seeded base estimator whose hyperparameters are searched.
rf = RandomForestRegressor(random_state=42)

# Candidate values: n_estimators 11..19, max_depth 5, 7, 9, 11, 13.
param_grid = dict(
    n_estimators=list(range(11, 20)),
    max_depth=list(range(5, 15, 2)),
)

# 5-fold cross-validation for every combination, using all available cores.
gridcv_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
gridcv_rf.fit(x_train, y_train)

# Predictions on the held-out split from the fitted search object.
predictions_GRF = gridcv_rf.predict(x_test)


In [240]:
# Create the grid
# NOTE(review): gridcv_rf was built from `param_grid`, not from this dict, and
# `n_estimator` / `depth` still hold whatever an earlier cell assigned. This
# looks like a leftover cell; confirm before relying on `grid_search` here.

grid_search = dict(n_estimators=n_estimator, max_depth=depth)

In [241]:
# Predict on the held-out split with the fitted grid search object.
predictions_GRF = gridcv_rf.predict(x_test)

# Best parameter combination and test-split metrics.
view_best_params_GRF = gridcv_rf.best_params_
mse_GRF = mean_squared_error(y_test, predictions_GRF)
GRF_rmse = np.sqrt(mse_GRF)
GRF_r2 = r2_score(y_test, predictions_GRF)

# Report, with both metrics rounded to two decimals.
print('Grid Search CV Random Forest Regressor Model Performance:')
print('Best Parameters =', view_best_params_GRF)
print(f'R-squared = {GRF_r2:.2f}')
print(f'RMSE = {GRF_rmse:.2f}')

Grid Search CV Random Forest Regressor Model Performance:
Best Parameters = {'max_depth': 7, 'n_estimators': 19}
R-squared = 0.81
RMSE = 0.23


In [242]:
# Grid search setup for gradient boosting: base estimator plus the candidate
# values for each hyperparameter.

gb = GradientBoostingRegressor(random_state=0)
n_estimator = [n_trees for n_trees in range(11, 20)]   # 11..19
depth = [max_d for max_d in range(5, 15, 2)]           # 5, 7, 9, 11, 13

In [243]:
# Parameter grid for the gradient boosting grid search, built from the
# current `n_estimator` and `depth` candidate lists.
grid_search = dict(n_estimators=n_estimator, max_depth=depth)

In [244]:
# Exhaustive search over `grid_search` for gradient boosting, scoring every
# combination with 5-fold cross-validation. fit() returns the search object.

gridcv_gb = GridSearchCV(estimator=gb, param_grid=grid_search, cv=5).fit(x_train, y_train)
view_best_params_Ggb = gridcv_gb.best_params_

In [245]:
# Evaluate Metrics on the held-out split using the fitted grid search object
predictions_Ggb = gridcv_gb.predict(x_test)
Ggb_r2 = r2_score(y_test, predictions_Ggb)
Ggb_rmse = np.sqrt(mean_squared_error(y_test,predictions_Ggb))

# Print Results
print('Grid Search CV Gradient Boosting regression Model Performance:')
print('Best Parameters = ',view_best_params_Ggb)
# '{:.2f}' gives two decimal places. The previous '{:0.2}' had no type code,
# which means two significant digits, so 0.80 was printed as "0.8".
print('R-squared = {:.2f}.'.format(Ggb_r2))
print('RMSE = ',(Ggb_rmse))

Grid Search CV Gradient Boosting regression Model Performance:
Best Parameters =  {'max_depth': 5, 'n_estimators': 19}
R-squared = 0.8.
RMSE =  0.24260695324695197
