In [20]:
%matplotlib inline
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Problem 1

Split the data into a 70-30 split for training and testing data.

In [8]:
df = pd.read_csv('homework/radar_parameters.csv')

In [9]:
df

Unnamed: 0.1,Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...,...
18964,18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [10]:
X = df.drop('R (mm/hr)', axis=1)
X.shape

(18969, 7)

In [12]:
y = df['R (mm/hr)']
y.shape

(18969,)

In [39]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0, train_size=0.7)

# Problem 2

Using the split created in (1), train a multiple linear regression dataset using the training dataset, and validate it using the testing dataset. Compare the R^2 and root mean square errors of model on the training and testing sets to a baseline prediction of rain rate using the formula Z=200(R^1.6).

In [44]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)

model.fit(Xtrain, ytrain)

test_model = model.predict(Xtest)

In [56]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

R2 = r2_score(ytest, test_model)
RMSE = mean_squared_error(ytest, test_model)

print(f'r-squared: {R2}')
print(f'RMSE: {RMSE}')

r-squared: 0.9868605147786396
RMSE: 0.9184105564917148


In [57]:
# Using the baseline prediction:

rain_base = ((10**(Xtest['Zh (dBZ)']/10))/200)**0.625

R2_base = r2_score(ytest, rain_base)
RMSE_base = mean_squared_error(ytest, rain_base)

print(f'r-squared: {R2_base}')
print(f'RMSE: {RMSE_base}')

r-squared: 0.22661047398943468
RMSE: 54.057605225928874


Clearly the multiple linear regression did a much better job than the baseline prediction with a higher r-squared value and much lower RMSE.

# Problem 3

Repeat 1 doing a grid search over polynomial orders, using a grid search over orders 0-21, and use cross-validation of 7 folds. For the best polynomial model in terms of R^2, does it outperform the baseline and the linear regression model in terms of R^2 and root mean square error?

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

In [72]:
param_grid = {'polynomialfeatures__degree': np.arange(7)}

grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)

In [73]:
grid.fit(X, y)

In [74]:
grid.best_params_

{'polynomialfeatures__degree': 2}

In [75]:
model = grid.best_estimator_

test_model = model.fit(X, y).predict(Xtest)

R2 = r2_score(ytest, test_model)
RMSE = mean_squared_error(ytest, test_model)

print(f'r-squared: {R2}')
print(f'RMSE: {RMSE}')

r-squared: 0.9995464485904482
RMSE: 0.03170188142279072


Yes, the polynomial model significantly outperforms both the baseline model and the multiple linear regression model in terms of r-squared and RMSE.

# Problem 4

Repeat 1 with a Random Forest Regressor, and perform a grid_search on the following parameters:

{'bootstrap': [True, False],  
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],  
'max_features': ['auto', 'sqrt'],  
'min_samples_leaf': [1, 2, 4],  
'min_samples_split': [2, 5, 10],  
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

Can you beat the baseline, or the linear regression, or best polynomial model with the best optimized Random Forest Regressor in terms of R^2 and root mean square error?