In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor

# Loading the data
# A forward slash is used as the path separator because it is accepted on
# Windows as well as on Linux/macOS, whereas a literal backslash only resolves
# on Windows.
df_raw = pd.read_csv('homework/radar_parameters.csv')
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV --
# confirm against the file) into a new frame so df_raw keeps the data as read.
df = df_raw.drop(columns=['Unnamed: 0'])

In [8]:
# Preview the first rows of the cleaned frame to check the loaded columns.
df.head()

Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,23.144878,0.418637,-41.757733,0.005395,0.00029,1.2e-05,2.39352
1,22.737156,0.32285,-43.772069,0.005194,0.00036,1.2e-05,3.502699
2,26.869826,0.330948,-43.577399,0.013385,0.000903,3e-05,8.627561
3,28.540561,0.39948,-42.139731,0.018872,0.001036,4.3e-05,8.424447
4,30.500127,0.543758,-39.763087,0.027438,0.001157,6.4e-05,8.189291


In [9]:
#Splitting the data into training and testing sets:

from sklearn.model_selection import train_test_split

# The 'R (mm/hr)' column is the regression target; every other column of df
# is kept as a predictor.
y = df['R (mm/hr)']
X = df.drop(columns=['R (mm/hr)'])

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
#Training a multiple linear regression model:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# fit() returns the fitted estimator, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, used for every metric below.
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Goodness of fit (R^2) and error magnitude (RMSE = sqrt of the MSE) per split.
lr_train_r2 = r2_score(y_train, y_train_pred)
lr_test_r2 = r2_score(y_test, y_test_pred)
lr_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
lr_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Training R^2:", lr_train_r2)
print("Testing R^2:", lr_test_r2)
print("Training RMSE:", lr_train_rmse)
print("Testing RMSE:", lr_test_rmse)

Training R^2: 0.9879085512445995
Testing R^2: 0.9890992951689395
Training RMSE: 0.9229401590287889
Testing RMSE: 0.9358124742087004


In [5]:
# Baseline prediction
# The 'Zh (dBZ)' column is on the logarithmic dBZ scale, so baseline_prediction
# first converts it to linear reflectivity and only then applies the Z-R relationship.

def baseline_prediction(Zh_dBZ, a=200, b=1.6):
    """Estimate R from reflectivity by inverting a power-law Z-R relationship.

    The reflectivity is converted from dBZ to linear units with
    ``Z = 10 ** (Zh_dBZ / 10)`` and the relation ``Z = a * R ** b`` is then
    solved for ``R``. With the default coefficients (``a=200``, ``b=1.6``)
    the result is identical to the previously hard-coded formula.

    Parameters
    ----------
    Zh_dBZ : float, numpy.ndarray or pandas.Series
        Reflectivity in dBZ. Only arithmetic operators are applied, so
        array-like inputs are processed element-wise.
    a : float, optional
        Multiplicative coefficient of the Z-R relationship. Must be positive.
    b : float, optional
        Exponent of the Z-R relationship. Must be positive.

    Returns
    -------
    Same kind of object as ``Zh_dBZ``
        The value(s) of ``(Z / a) ** (1 / b)``.

    Raises
    ------
    ValueError
        If ``a`` or ``b`` is not strictly positive.
    """
    # Guard the new parameters: a == 0 or b == 0 would divide by zero, and
    # negative values do not describe a valid power law.
    if a <= 0 or b <= 0:
        raise ValueError("Z-R coefficients a and b must be positive")

    # Convert Zh from dBZ to linear Z
    Z_linear = 10 ** (Zh_dBZ / 10)
    # Invert Z = a * R**b to obtain R
    R = (Z_linear / a) ** (1 / b)
    return R


# Apply the reflectivity-only baseline to the 'Zh (dBZ)' column of each split.
zh_train = X_train['Zh (dBZ)']
zh_test = X_test['Zh (dBZ)']
baseline_train_pred = baseline_prediction(zh_train)
baseline_test_pred = baseline_prediction(zh_test)

# Score the baseline with R^2 and RMSE (square root of the MSE) on both splits.
baseline_train_r2 = r2_score(y_train, baseline_train_pred)
baseline_test_r2 = r2_score(y_test, baseline_test_pred)
baseline_train_rmse = np.sqrt(mean_squared_error(y_train, baseline_train_pred))
baseline_test_rmse = np.sqrt(mean_squared_error(y_test, baseline_test_pred))

print("Baseline Training R^2:", baseline_train_r2)
print("Baseline Testing R^2:", baseline_test_r2)
print("Baseline Training RMSE:", baseline_train_rmse)
print("Baseline Testing RMSE:", baseline_test_rmse)

Baseline Training R^2: 0.27555056242697507
Baseline Testing R^2: 0.35664291868109677
Baseline Training RMSE: 7.143950117300888
Baseline Testing RMSE: 7.189316160047872


In [None]:
#Grid search over polynomial orders

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

# Candidate polynomial degrees 0..21, cut into consecutive batches of five
# (the last batch is shorter). Each batch gets its own grid search.
total_degrees = range(22)
degree_batches = [
    total_degrees[start:start + 5]
    for start in range(0, len(total_degrees), 5)
]

# Running winner across all batches. Starting from -inf means the first
# batch that yields a real score always replaces it.
best_degree, best_score = None, -np.inf

for batch in degree_batches:
    try:
        # Pipeline: polynomial feature expansion followed by ordinary least squares.
        poly_model = make_pipeline(PolynomialFeatures(), LinearRegression())

        # Only the expansion degree is tuned, restricted to this batch.
        poly_params = {'polynomialfeatures__degree': batch}

        # 7-fold cross-validated search scored by R^2, run in a single process.
        poly_grid = GridSearchCV(poly_model, poly_params, cv=7, scoring='r2', n_jobs=1)
        poly_grid.fit(X_train, y_train)

        # Keep this batch's winner only if it beats everything seen so far.
        if poly_grid.best_score_ > best_score:
            best_degree = poly_grid.best_params_['polynomialfeatures__degree']
            best_score = poly_grid.best_score_

    except MemoryError:
        # Report the failed batch and carry on with the next one.
        print(f"Memory error occurred with batch: {batch}")
    except Exception as e:
        # Any other failure is reported the same way; the loop continues.
        print(f"An error occurred with batch: {batch}: {e}")

print("Best polynomial degree overall:", best_degree)
print("Best polynomial R^2 overall:", best_score)

In [None]:
# Training a Random Forest Regressor with grid search

from sklearn.ensemble import RandomForestRegressor

# NOTE: this is an exhaustive grid of 2 * 11 * 2 * 3 * 3 * 10 = 3,960 parameter
# combinations, each fitted once per fold (cv=3), so expect a long run time.
rf_params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # makes every candidate that uses it fail. For a regressor it meant
    # "consider all features", which the fraction 1.0 expresses in both old
    # and new releases.
    'max_features': [1.0, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# Fixed random_state so the forest is reproducible across runs.
rf = RandomForestRegressor(random_state=42)
# 3-fold cross-validated search scored by R^2, using all available cores.
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='r2', n_jobs=-1)
rf_grid.fit(X_train, y_train)

print("Best Random Forest parameters:", rf_grid.best_params_)
print("Best Random Forest R^2:", rf_grid.best_score_)