In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Load the data
radar_data = pd.read_csv(r'/data/keeling/a/melinda3/Homework/ATMS523/atms523-repo/Module5/machine-learning-1-mberman99/homework/radar_parameters.csv')

radar_data

Unnamed: 0.1,Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...,...
18964,18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [3]:
#Split into input features and prediction data
rain_rate = radar_data['R (mm/hr)']
features = radar_data.drop('R (mm/hr)', axis =1)

In [4]:
#Split the data into training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(features, rain_rate, test_size=0.3)

In [5]:
#Train a linear regression model on the data 
#Validate the model with the testing data

from sklearn.linear_model import LinearRegression
rain_linear_model = LinearRegression().fit(X_train, Y_train)
predicted_rain_rate = rain_linear_model.predict(X_test)

In [6]:
#Compare the R^2 and RMSE to a formula

zh = radar_data['Zh (dBZ)']
z = 10 ** (zh/10)
r = (z/200) ** (1/1.6)


In [7]:
#Calculate the R^2 and RMSE for the model and compare it to the R^2 and RMSE for the equation predicted values
from sklearn.metrics import r2_score, mean_squared_error
#Training Data
r2_train_ml = r2_score(Y_test, predicted_rain_rate)
rmse_ml = np.sqrt(mean_squared_error(Y_test,  predicted_rain_rate))

r2_equation = r2_score(r, radar_data['R (mm/hr)'])
rmse_equation = np.sqrt(mean_squared_error(r, radar_data['R (mm/hr)']))

print(r2_train_ml, rmse_ml)
print(r2_equation, rmse_equation)


0.9890293530817221 0.9253498015342462
0.2407320902415907 7.157590840042378


In [8]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {'polynomialfeatures__degree': np.arange(7)}

grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)

In [10]:
#Repeat 2 for a grid search over polynomials from 0-21
grid.fit(X_train, Y_train);

grid.best_params_


{'polynomialfeatures__degree': 3}

In [11]:
poly_model = grid.best_estimator_

In [12]:
#Evaluate the model by printing the R2 and the RMSE
poly_predicted = poly_model.predict(X_test)
poly_r2 = r2_score(poly_predicted, Y_test)
rmse_equation = np.sqrt(mean_squared_error(poly_predicted, Y_test))

print(poly_r2, rmse_equation)

0.9995409081944144 0.1899206750694736


# Repeat the first part with a random forest regression

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

param_grid = {'bootstrap': [True, False],  
'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],  
'max_features': ['auto', 'sqrt'],  
'min_samples_leaf': [1, 2, 4],  
'min_samples_split': [2, 5, 10],  
'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

#I decreased some of the options for the model fit because it was taking a very long time to run. 
#It was taking the RFR hours to fit the data so I wanted to make it run faster
#by reducing the number of fit options

In [17]:
rain_random_forest = RandomForestRegressor

random_forest_grid = RandomizedSearchCV(rain_random_forest(), param_grid, n_iter = 100)

#Decreased the interations to 40, 75 and 100 took a long time

In [23]:
random_forest_grid.fit(X_train, Y_train)

KeyboardInterrupt: 

In [20]:
#Perform a grid search for the Random Forest Regressor
rf_model = random_forest_grid.best_estimator_

In [22]:
from sklearn.metrics import r2_score, mean_squared_error
rf_predicted = rf_model.predict(X_test)
rf_r2 = r2_score(rf_predicted, Y_test)
rmse_rf = np.sqrt(mean_squared_error(rf_predicted, Y_test))

print(rf_r2, rmse_rf)

0.9856526559405405 0.992166938464581
