In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler


# Load the merged dataset from disk.
df = pd.read_csv("merged_df.csv")

# Columns used as model inputs.
feature_cols = [
    "UNIXTime",
    "Temperature",
    "Pressure",
    "Humidity",
    "WindDirection(Degrees)",
    "Speed",
    "latitude",
    "longitude",
    "ws10m",
    "t2m",
]
# Target columns; later cells in this file model the first one.
target_cols = ["Radiation", "allsky_sfc_sw_dwn"]

X, Y = df[feature_cols], df[target_cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# PCA: find how many principal components are needed to retain 95% of the
# variance of the standardized training features.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)

n_original = X_train_raw.shape[1]
n_pca = X_train_pca.shape[1]

print(f"Original features: {n_original}")
print(f"Components for 95% variance: {n_pca}")

# Derive the conclusion from the numbers computed above instead of asserting it
# unconditionally, so the printed statement stays true if the data changes.
if n_pca == n_original:
    print("Based on the results, we will use the entire dataset for modeling as all features contribute to 95 percent variance.")
else:
    print(
        f"Based on the results, {n_pca} of {n_original} principal components already capture "
        "95 percent variance; the models below still use all scaled features."
    )


In [None]:
# The regression models below work on the standardized features; the target is
# the first column of Y_train / Y_test.
X_train = X_train_scaled
X_test = X_test_scaled
y_train = Y_train.iloc[:, 0]
y_test = Y_test.iloc[:, 0]

# Baseline: ordinary least squares on the degree-1 (unexpanded) features.
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

linear_regression_prediction_training = linear_regression_model.predict(X_train)
linear_regression_prediction_testing = linear_regression_model.predict(X_test)

# mean_squared_error yields the MSE; its square root (RMSE) is what is reported.
error_linear_train = mean_squared_error(y_train, linear_regression_prediction_training)
error_linear_test = mean_squared_error(y_test, linear_regression_prediction_testing)
print("Below are the outputs for  Linear Regression")
# Label each value with the split it was actually computed on.
print("Train error: ", np.sqrt(error_linear_train))
print("Test error: ", np.sqrt(error_linear_test))

# Degree-2 polynomial regression: expand the scaled features, then fit ordinary
# least squares on the expanded design matrix.
poly_model = PolynomialFeatures(degree=2)
poly_model.fit(X_train)
X_train_polynomial = poly_model.transform(X_train)
X_test_polynomial = poly_model.transform(X_test)

polynomial_model = LinearRegression().fit(X_train_polynomial, y_train)

polynomial_regression_prediction_training = polynomial_model.predict(X_train_polynomial)
polynomial_regression_prediction_testing = polynomial_model.predict(X_test_polynomial)

# Mean squared error per split; the printed figures are the square roots (RMSE).
error_polynomial_train = mean_squared_error(
    y_true=y_train,
    y_pred=polynomial_regression_prediction_training,
)
error_polynomial_test = mean_squared_error(
    y_true=y_test,
    y_pred=polynomial_regression_prediction_testing,
)

print("Below are the outputs for polynomial Linear Regression")
print("Train error: ", np.sqrt(error_polynomial_train))
print("Test error: ", np.sqrt(error_polynomial_test))

# Ridge (L2-penalised) regression fitted on the degree-2 polynomial expansion.
ridge_model = Ridge(alpha=100.0).fit(X_train_polynomial, y_train)

ridge_regression_prediction_training = ridge_model.predict(X_train_polynomial)
ridge_regression_prediction_testing = ridge_model.predict(X_test_polynomial)

# Mean squared error per split; the printed figures are the square roots (RMSE).
error_ridge_train = mean_squared_error(
    y_true=y_train,
    y_pred=ridge_regression_prediction_training,
)
error_ridge_test = mean_squared_error(
    y_true=y_test,
    y_pred=ridge_regression_prediction_testing,
)

print("Below are the outputs for Ridge-regularized Linear Regression")
print("Train error: ", np.sqrt(error_ridge_train))
print("Test error: ", np.sqrt(error_ridge_test))

# Lasso (L1-penalised) regression fitted on the degree-2 polynomial expansion.
# The large max_iter gives coordinate descent room to converge at this small alpha.
lasso_model = Lasso(alpha=.0001, max_iter=100000)
lasso_model.fit(X_train_polynomial, y_train)

# Predict with the lasso model itself so the reported errors describe lasso,
# not the ridge model fitted earlier.
lasso_regression_prediction_training = lasso_model.predict(X_train_polynomial)
lasso_regression_prediction_testing = lasso_model.predict(X_test_polynomial)

# Mean squared error per split, computed from the lasso predictions; the
# printed figures are the square roots (RMSE).
error_lasso_train = mean_squared_error(y_train, lasso_regression_prediction_training)
error_lasso_test = mean_squared_error(y_test, lasso_regression_prediction_testing)
print("Below are the outputs for lasso-regularized Linear Regression")
print("Train error: ", np.sqrt(error_lasso_train))
print("Test error: ", np.sqrt(error_lasso_test))

# Diagnostic plots for the degree-2 polynomial model (the one we deemed best).
fig, (ax_fit, ax_resid, ax_coef) = plt.subplots(1, 3, figsize=(20, 5))

# Panel 1: training actual vs. predicted values with the ideal y = x line.
ax_fit.scatter(y_train, polynomial_regression_prediction_training, alpha=.5)
# The reference line spans the range of the plotted (training) targets.
fit_limits = [y_train.min(), y_train.max()]
ax_fit.plot(fit_limits, fit_limits, 'r--', lw=2)
ax_fit.set_xlabel("Actual values")
ax_fit.set_ylabel("Predicted values")
ax_fit.set_title("Actual vs Predicted Values for Polynomial Model")

# Panel 2: test-set residuals against the predictions.
residuals = y_test - polynomial_regression_prediction_testing
ax_resid.scatter(polynomial_regression_prediction_testing, residuals, alpha=.5)
# Residuals of a perfect model are all zero, so the reference is the
# horizontal line y = 0 (a y = x diagonal has no meaning on this panel).
ax_resid.axhline(0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel("Predicted values")
ax_resid.set_ylabel("Residuals")
ax_resid.set_title("Residual Plot for Polynomial Model")

# Panel 3: coefficients of the polynomial model, labelled by expanded feature name.
coef_values = polynomial_model.coef_
coef_series = pd.Series(coef_values, index=poly_model.get_feature_names_out(X.columns))
# Just to make it more viewable in the Colab UI, only 8 terms are shown.
# The constant bias column ("1") is not a feature, so it is dropped, and the 8
# terms with the largest absolute coefficient are kept instead of the first 8
# in column order. The order is reversed so the largest bar is drawn on top.
top_terms = coef_series.drop(labels="1", errors="ignore").abs().nlargest(8)
coef_series_shortened = coef_series.loc[top_terms.index[::-1]]
coef_series_shortened.plot(kind='barh', ax=ax_coef)
ax_coef.set_title("Feature Importance of Polynomial Model")
ax_coef.set_xlabel("Coefficient Value")

# Lay the figure out only after every title and label exists, so none overlap.
fig.tight_layout()
plt.show()

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Neural-network inputs: the standardized features as ndarrays and the first
# target column as plain NumPy arrays.
X_train_nn, X_test_nn = np.array(X_train_scaled), np.array(X_test_scaled)
y_train_nn = Y_train.iloc[:, 0].to_numpy()
y_test_nn = Y_test.iloc[:, 0].to_numpy()

# Hidden-layer widths compared in the cross-validation search.
node_options = [
    16, 64, 256,
    1024, 2048, 4096,
    8192, 16384, 32768,
]

# One entry is appended to each list per hidden-layer width evaluated.
results = {
    column: []
    for column in ("nodes", "avg_val_rmse", "avg_train_rmse")
}

# Shuffled 2-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(
    n_splits=2,
    shuffle=True,
    random_state=41,
)

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling, and therefore the table printed below, are repeatable across runs.
# The value matches the seed already used for the KFold splitter.
# NOTE(review): some GPU kernels can still be non-deterministic.
tf.keras.utils.set_random_seed(41)

# Cross-validate a single-hidden-layer network for every candidate width and
# record the fold-averaged train / validation RMSE.
for node_count in node_options:
    fold_val_rmse = []
    fold_train_rmse = []

    for train_index, val_index in kf.split(X_train_nn):
        X_fold_train = X_train_nn[train_index]
        X_fold_val = X_train_nn[val_index]
        y_fold_train = y_train_nn[train_index]
        y_fold_val = y_train_nn[val_index]

        # Release the state Keras keeps for previously built models; without
        # this, memory grows with every model created inside the loop.
        tf.keras.backend.clear_session()

        model = Sequential()
        model.add(Input(shape=(X_train_nn.shape[1],)))
        model.add(Dense(node_count, activation="relu"))
        model.add(Dense(1, activation="linear"))

        model.compile(optimizer="adam", loss="mse")
        model.fit(
            X_fold_train,
            y_fold_train,
            validation_data=(X_fold_val, y_fold_val),
            epochs=37,
            batch_size=3,
            callbacks=[early_stop],
            verbose=0
        )

        # predict() returns a column vector; flatten it to match the 1-D targets.
        y_pred_val = model.predict(X_fold_val, verbose=0).flatten()
        val_rmse = np.sqrt(mean_squared_error(y_fold_val, y_pred_val))
        fold_val_rmse.append(val_rmse)

        y_pred_train = model.predict(X_fold_train, verbose=0).flatten()
        train_rmse = np.sqrt(mean_squared_error(y_fold_train, y_pred_train))
        fold_train_rmse.append(train_rmse)

    # Average over the folds to get one train / validation score per width.
    avg_val_rmse = np.mean(fold_val_rmse)
    avg_train_rmse = np.mean(fold_train_rmse)

    results["nodes"].append(node_count)
    results["avg_val_rmse"].append(avg_val_rmse)
    results["avg_train_rmse"].append(avg_train_rmse)

    # Printed columns: width | average train RMSE | average validation RMSE.
    print(node_count, "|", round(avg_train_rmse, 4), "|", round(avg_val_rmse, 4))


# Plot the fold-averaged train / validation RMSE against hidden-layer width.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(results["nodes"], results["avg_val_rmse"], marker="o", label="CV Validation Error (Avg)")
ax.plot(results["nodes"], results["avg_train_rmse"], marker="o", linestyle="--", label="CV Training Error (Avg)")

# Take the fold count from the splitter itself so the title cannot disagree
# with the cross-validation that was actually run.
ax.set_title(f"{kf.get_n_splits()}-Fold Cross Validation: Model Complexity vs. Error")
ax.set_xlabel("Number of Nodes")
ax.set_ylabel("Average RMSE")
# The candidate widths span several orders of magnitude, hence the log axis.
ax.set_xscale("log")
ax.legend()
plt.show()


# Pick the hidden-layer width with the lowest fold-averaged validation RMSE
# rather than hard-coding one, so the final model always follows the search
# recorded in `results`.
best_index = int(np.argmin(results["avg_val_rmse"]))
best_node_count = results["nodes"][best_index]
print("Selected hidden nodes:", best_node_count)

# Same single-hidden-layer architecture as in the cross-validation search.
final_model = Sequential()
final_model.add(Input(shape=(X_train_nn.shape[1],)))
final_model.add(Dense(best_node_count, activation="relu"))
final_model.add(Dense(1, activation="linear"))
final_model.compile(optimizer="adam", loss="mse")

# Train on the training set, with 20% of it held back by Keras as the
# validation data that early stopping monitors.
final_model.fit(
    X_train_nn,
    y_train_nn,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

# Evaluate once on the untouched test split.
y_final_pred = final_model.predict(X_test_nn, verbose=0).flatten()
final_test_rmse = np.sqrt(mean_squared_error(y_test_nn, y_final_pred))
final_r2 = r2_score(y_test_nn, y_final_pred)

print("Final Test error:", final_test_rmse)
print("Final Test r2:  ", final_r2)
