In [10]:
# Import necessary libraries and modules
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
# Load the dataset
file_path = '/content/radar_parameters.csv'
data = pd.read_csv(file_path)

In [4]:
# Prepare the dataset; rename/remove columns
data = data.rename(columns={
    'Zh (dBZ)': 'Zh', 'Zdr (dB)': 'Zdr', 'Ldr (dB)': 'Ldr',
    'Kdp (deg km-1)': 'Kdp', 'Ah (dBZ/km)': 'Ah', 'Adr (dB/km)': 'Adp',
    'R (mm/hr)': 'R'
}).drop(columns=['Unnamed: 0'], errors='ignore')


In [5]:
# Display a few rows of dataset to verify
data.head()

Unnamed: 0,Zh,Zdr,Ldr,Kdp,Ah,Adp,R
0,23.144878,0.418637,-41.757733,0.005395,0.00029,1.2e-05,2.39352
1,22.737156,0.32285,-43.772069,0.005194,0.00036,1.2e-05,3.502699
2,26.869826,0.330948,-43.577399,0.013385,0.000903,3e-05,8.627561
3,28.540561,0.39948,-42.139731,0.018872,0.001036,4.3e-05,8.424447
4,30.500127,0.543758,-39.763087,0.027438,0.001157,6.4e-05,8.189291


In [6]:
# (1) Split the data into training and testing sets (70-30 split)
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

In [7]:
# Define the features and target variable
features = ['Zh', 'Zdr', 'Ldr', 'Kdp', 'Ah', 'Adp']
target = 'R'

In [8]:
# Split features and target datasets for training and testing
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]


In [9]:
# (2) Train and validate a multiple linear regression model

# Setup regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Set predictions on training and testing sets
linear_train_pred = linear_model.predict(X_train)
linear_test_pred = linear_model.predict(X_test)

# Calculating R^2 and RMSE for training and testing datasets for the linear model
linear_train_r2 = r2_score(y_train, linear_train_pred)
linear_test_r2 = r2_score(y_test, linear_test_pred)
linear_train_rmse = np.sqrt(mean_squared_error(y_train, linear_train_pred))
linear_test_rmse = np.sqrt(mean_squared_error(y_test, linear_test_pred))



In [10]:
# Calculate baseline values for training and testing sets
baseline_train_pred = (train_df['Zh'] / 200) ** (1 / 1.6)
baseline_test_pred = (test_df['Zh'] / 200) ** (1 / 1.6)

# Calculate R^2 for baseline
baseline_train_r2 = r2_score(y_train, baseline_train_pred)
baseline_test_r2 = r2_score(y_test, baseline_test_pred)

# Calculate RMSE for baseline
baseline_train_rmse = np.sqrt(mean_squared_error(y_train, baseline_train_pred))
baseline_test_rmse = np.sqrt(mean_squared_error(y_test, baseline_test_pred))


In [11]:
# Display values for regression vs baseline
print("Multiple Linear Regression vs Baseline")
print("Training set:")
print(f"Linear Regression R^2: {linear_train_r2}, Baseline R^2: {baseline_train_r2}")
print(f"Linear Regression RMSE: {linear_train_rmse}, Baseline RMSE: {baseline_train_rmse}")
print("\nTesting set:")
print(f"Linear Regression R^2: {linear_test_r2}, Baseline R^2: {baseline_test_r2}")
print(f"Linear Regression RMSE: {linear_test_rmse}, Baseline RMSE: {baseline_test_rmse}")


Multiple Linear Regression vs Baseline
Training set:
Linear Regression R^2: 0.9879085512445995, Baseline R^2: -0.7843154346178256
Linear Regression RMSE: 0.9229401590287888, Baseline RMSE: 11.21166698776975

Testing set:
Linear Regression R^2: 0.9890992951689396, Baseline R^2: -0.7389185953481798
Linear Regression RMSE: 0.9358124742086974, Baseline RMSE: 11.819556086271271


In [12]:
# (3) Split the data into training and testing sets (70-30 split) & search over polynomial orders,
# using a grid search over orders 0-21, and use cross-validation of 7 folds

# Load the dataset
file_path = '/content/radar_parameters.csv'
data = pd.read_csv(file_path)

In [13]:
# Prepare the dataset; rename/remove columns
data = data.rename(columns={
    'Zh (dBZ)': 'Zh', 'Zdr (dB)': 'Zdr', 'Ldr (dB)': 'Ldr',
    'Kdp (deg km-1)': 'Kdp', 'Ah (dBZ/km)': 'Ah', 'Adr (dB/km)': 'Adp',
    'R (mm/hr)': 'R'
}).drop(columns=['Unnamed: 0'], errors='ignore')


In [14]:
# Split the data into training and testing sets (70-30 split)
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

In [15]:
# Define the features and target variable
features = ['Zh', 'Zdr', 'Ldr', 'Kdp', 'Ah', 'Adp']
target = 'R'

In [16]:
# Split features and target datasets for training and testing
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]


In [17]:
# Search over polynomial orders using a grid search over orders 0-10, and use
# cross-validation of 7 folds

# Set 7-fold cross-validation
cv_folds = 7

# Set the polynomial orders
poly_orders = range(1, 10)

# Setup pipeline with PolynomialFeatures and LinearRegression
pipeline = make_pipeline(PolynomialFeatures(), LinearRegression())

# Define parameter grid to search over
param_grid = {
    'polynomialfeatures__degree': poly_orders
}

# Setup grid search with 7-fold cross-validation
poly_grid_search = GridSearchCV(pipeline, param_grid, cv=cv_folds, scoring='r2', verbose=1)

# Perform grid search
poly_grid_search.fit(X_train, y_train)

# Extract best model info
best_poly_model = poly_grid_search.best_estimator_
best_poly_params = poly_grid_search.best_params_
best_poly_score = poly_grid_search.best_score_

# Predictions/performance on test set for best polynomial model
poly_test_pred = best_poly_model.predict(X_test)
poly_test_r2 = r2_score(y_test, poly_test_pred)

Fitting 7 folds for each of 9 candidates, totalling 63 fits


In [18]:
# Display values for regression vs baseline
print("Best Polynomial Order vs Baseline")
print("Training set:")
print(f"Polynomial R^2: {best_poly_score}, Baseline R^2: {baseline_train_r2}")
print("\nTesting set:")
print(f"Polynomial R^2: {poly_test_r2}, Baseline R^2: {baseline_test_r2}")


Best Polynomial Order vs Baseline
Training set:
Polynomial R^2: 0.9969985736506703, Baseline R^2: -0.7843154346178256

Testing set:
Polynomial R^2: 0.9995805761885587, Baseline R^2: -0.7389185953481798


In [3]:
# (4) Split the data into training and testing sets (70-30 split) & use a Random Forest
# to perform a grid_search

# Load the dataset
file_path = '/content/radar_parameters.csv'
data = pd.read_csv(file_path)

In [4]:
# Prepare the dataset; rename/remove columns
data = data.rename(columns={
    'Zh (dBZ)': 'Zh', 'Zdr (dB)': 'Zdr', 'Ldr (dB)': 'Ldr',
    'Kdp (deg km-1)': 'Kdp', 'Ah (dBZ/km)': 'Ah', 'Adr (dB/km)': 'Adp',
    'R (mm/hr)': 'R'
}).drop(columns=['Unnamed: 0'], errors='ignore')


In [5]:
# Split the data into training and testing sets (70-30 split)
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

In [6]:
# Define the features and target variable
features = ['Zh', 'Zdr', 'Ldr', 'Kdp', 'Ah', 'Adp']
target = 'R'

In [7]:
# Split features and target datasets for training and testing
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]


In [11]:
# Define the parameter grid

param_grid_rf = {
    'bootstrap': [True, False],
    'max_depth': [10, 50],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 4],
    'min_samples_split': [2, 10],
    'n_estimators': [200, 1000]
}

# Setup the model and grid search
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='r2', n_jobs=-1, verbose=2)

# Run the grid search
grid_search.fit(X_train, y_train)

# Display best parameters and scores
print("Best parameters:", grid_search.best_params_)
print("Best R^2 on CV:", grid_search.best_score_)

# Evaluate the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
print("R^2 on test set:", r2_score(y_test, predictions))
rmse = mean_squared_error(y_test, predictions, squared=False)
print("RMSE on test set:", rmse)


Fitting 5 folds for each of 64 candidates, totalling 320 fits


  warn(


Best parameters: {'bootstrap': True, 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Best R^2 on CV: 0.9795924819809978
R^2 on test set: 0.9880534083870811
RMSE on test set: 0.9796784362798733
