# Module 5: Kaitlyn Jesmonth

In [21]:
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

## Problem 1

(1) Split the data into a 70-30 split for training and testing data.

In [2]:
# read in the data
radar_df = pd.read_csv('homework/radar_parameters.csv')
radar_df

Unnamed: 0.1,Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...,...
18964,18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [3]:
# subset features (usually represented by X) and target data (usually represented by y) from dataset
X = radar_df.drop('R (mm/hr)', axis=1)
y = radar_df['R (mm/hr)']

In [4]:
# separate training and testing data with a 70/30 split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0, train_size=0.7)

## Problem 2

Using the split created in (1), train a multiple linear regression dataset using the training dataset, and validate it using the testing dataset. Compare the $R^2$ and root mean square errors of model on the training and testing sets to a baseline prediction of rain rate using the formula $Z = 200 R^{1.6}$.

In [5]:
# set up linear regression
model = LinearRegression(fit_intercept=True)

# train model
model.fit(Xtrain, ytrain)

# validate model using the test data
ypredict = model.predict(Xtest)

In [6]:
# get r2 and rmse values for model
r2 = r2_score(ytest, ypredict)
rmse = np.sqrt(mean_squared_error(ytest, ypredict))

print(f'R-Squared: {r2}')
print(f'Root Mean Square Error: {rmse}')

R-Squared: 0.9868605147786396
Root Mean Square Error: 0.9583373917841854


In [7]:
# get r2 and rmse values for baseline
ybase = ((10**(Xtest['Zh (dBZ)']/10))/200)**(1/1.6)

r2_base = r2_score(ytest, ybase)
rmse_base = np.sqrt(mean_squared_error(ytest, ybase))

print(f'R-Squared: {r2_base}')
print(f'Root Mean Square Error: {rmse_base}')

R-Squared: 0.22661047398943468
Root Mean Square Error: 7.3523877227693095


The multiple linear regression model performed better than the baseline shown by the higher r-squared value and lower root mean square error.

## Problem 3

(3) Repeat 1 doing a grid search over polynomial orders, using a grid search over orders 0-21, and use cross-validation of 7 folds.  For the best polynomial model in terms of $R^2$, does it outperform the baseline and the linear regression model in terms of $R^2$ and root mean square error?

In [8]:
# define polynomial regression function
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

In [9]:
# grid search over polynomial orders 0-6, for computational purposes, with a cross-validation of 7 to find the best polynomial model
param_grid = {'polynomialfeatures__degree': np.arange(7)}

grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)

In [10]:
# fit polynomial regression
grid.fit(Xtrain, ytrain)

In [19]:
# find best parameters
#grid.best_params_

In [17]:
# find best polynomial model
best_model = grid.best_estimator_

# train best polynomial model
best_model.fit(Xtrain, ytrain)

# validate best polynomial model using the test data
ybest = best_model.predict(Xtest)

In [20]:
# get r2 and rmse values for best polynomial model 
r2_best = r2_score(ytest, ybest)
rmse_best = np.sqrt(mean_squared_error(ytest, ybest))

print(f'R-Squared: {r2_best}')
print(f'Root Mean Square Error: {rmse_best}')

R-Squared: 0.9994617255568535
Root Mean Square Error: 0.19396846916697366


The polynomial model does greatly outperform both the multiple linear regression and the baseline models in terms of r-squared and root mean square error values.

## Problem 4

(4) Repeat 1 with a Random Forest Regressor, and perform a grid_search on the following parameters:

```python
   {'bootstrap': [True, False],  
   'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],  
   'max_features': ['auto', 'sqrt'],  
   'min_samples_leaf': [1, 2, 4],  
   'min_samples_split': [2, 5, 10],  
   'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
   ```
  Can you beat the baseline, or the linear regression, or best polynomial model with the best optimized Random Forest Regressor in terms of $R^2$ and root mean square error?

In [36]:
# set parameters and grid search with a Random Forest Regressor
#param_grid = {'bootstrap': [True, False],  
#              'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],  
#              'max_features': ['auto', 'sqrt'],  
#              'min_samples_leaf': [1, 2, 4],  
#              'min_samples_split': [2, 5, 10],  
#              'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

param_grid = {'bootstrap': [True, False],  
              'max_depth': [10, 50, None],  
              'max_features': ['auto', 'sqrt'],  
              'min_samples_leaf': [1, 4],  
              'min_samples_split': [2, 5],  
              'n_estimators': [200, 1000]}

grid = GridSearchCV(RandomForestRegressor(), param_grid)

In [32]:
# fit random forest regressor
grid.fit(Xtrain, ytrain)

In [33]:
# find best parameters
grid.best_params_

{'bootstrap': False,
 'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [34]:
# find best random forest regressor model
best_model = grid.best_estimator_

# train best random forest regressor model
best_model.fit(Xtrain, ytrain)

# validate best random forest regressor model using the test data
ybest = best_model.predict(Xtest)

In [35]:
# get r2 and rmse values for best random forest regressor model 
r2_best = r2_score(ytest, ybest)
rmse_best = np.sqrt(mean_squared_error(ytest, ybest))

print(f'R-Squared: {r2_best}')
print(f'Root Mean Square Error: {rmse_best}')

R-Squared: 0.9724255942546174
Root Mean Square Error: 1.3882968523303036


In [None]:
# try with a randomized grid search to see if scores can improve
random_grid = RandomizedSearchCV(RandomForestRegressor(), param_grid, n_iter=100)

In [37]:
# fit random forest regressor
random_grid.fit(Xtrain, ytrain)

In [39]:
# find best parameters
random_grid.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': False}

In [40]:
# find best random forest regressor model
best_model_randomized = random_grid.best_estimator_

# train best random forest regressor model
best_model_randomized.fit(Xtrain, ytrain)

# validate best random forest regressor model using the test data
ybest_randomized = best_model_randomized.predict(Xtest)

In [41]:
# get r2 and rmse values for best random forest regressor model 
r2_best = r2_score(ytest, ybest_randomized)
rmse_best = np.sqrt(mean_squared_error(ytest, ybest_randomized))

print(f'R-Squared: {r2_best}')
print(f'Root Mean Square Error: {rmse_best}')

R-Squared: 0.9696036529519164
Root Mean Square Error: 1.457605355505851


Interestingly, the random forest regressor model (both with the traditional grid search and the randomized grid search) did worse based on the r-squared and root mean square error scores when compared to the multiple linear regression and the polynomial models. The randomized search also proved to be less accurate than the regular grid search. However, the random forest regressor model still significantly outperformed the baseline model for predicting rainfall rate.