## Read Unseen Watershed Data

In [2]:
import pandas as pd

# Location of the Study Area 3 validation workbook.
# NOTE(review): the directory is spelled 'Samdeday' here but 'Sameday' in the
# model-evaluation cell — confirm which spelling is intended.
file_path = 'Data/Samdeday/Validation_StudyArea3.xlsx'

# Read the workbook into a DataFrame.
data = pd.read_excel(io=file_path)

# Preview the leading rows to inspect the column layout.
data.head(n=5)

Unnamed: 0,Date,Upstream Tributary Streamflow,Water Temperature,Cos(day of Year),E.coli,Turbidity,Precipitation,Solar Radiation,Agricultural Area,Slope,entropy,month,time_diff_days_global
0,01/31/2019,10.198862,7.8,0.860961,590,17.023714,0.0,140.6,0.018144,5.33,1.122054,1,42
1,02/14/2019,16.946472,9.4,0.714673,1500,40.665983,0.0,167.4,0.018144,5.33,1.122054,2,7
2,02/21/2019,146.27043,8.1,0.625411,2600,85.295922,8.1,75.3,0.018144,5.33,1.122054,2,7
3,02/28/2019,18.293785,10.1,0.527078,54,12.830474,8.8,105.0,0.018144,5.33,1.122054,2,7
4,03/07/2019,13.18232,9.0,0.421101,220,10.793657,0.0,220.8,0.018144,5.33,1.122054,3,7


In [None]:
import pandas as pd
import numpy as np

# Load the Study Area 3 validation data
new_data = pd.read_excel('Data/Samdeday/Validation_StudyArea3.xlsx')

# Create the 'Log10_E.coli' column that the evaluation cell reads as its
# observed values.
# NOTE(review): np.log10 yields -inf for 0 and NaN for negative inputs; this
# assumes every E.coli value is strictly positive — confirm for new datasets.
new_data['Log10_E.coli'] = np.log10(new_data['E.coli'])

# Save the augmented table to the path the evaluation cell reads from
# ('Data/Sameday/...'). Writing it under 'Data/Samdeday/' meant a fresh
# Restart & Run All never handed this file to the evaluation step.
new_data.to_excel('Data/Sameday/Validation_(StudyArea3)_with_log10.xlsx', index=False)


## Load the saved model pipeline (from testing) and run a zero-shot test on the validation data

In [8]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# === Load model and feature list ===
# NOTE: joblib.load unpickles arbitrary Python objects, so these files must
# come from a trusted source (e.g. your own training run).
model = joblib.load('Data/Sameday/ETR_ecoli_model.pkl')
final_features = joblib.load('Data/Sameday/final_features.pkl')

# === Load new dataset ===
new_data = pd.read_excel('Data/Sameday/Validation_(StudyArea3)_with_log10.xlsx')

# Ensure all needed features are present
missing_features = [feat for feat in final_features if feat not in new_data.columns]
if missing_features:
    raise ValueError(f"Missing features in new data: {missing_features}")

# Also require the columns used after prediction, so a bad input file fails
# here rather than with a KeyError after the predictions CSV has been written.
missing_columns = [col for col in ('Date', 'Log10_E.coli') if col not in new_data.columns]
if missing_columns:
    raise ValueError(f"Missing required columns in new data: {missing_columns}")

# === Apply same transformation (consistent with training and testing)
# Non-positive values are masked to NaN before log1p, and any NaN/inf left
# afterwards is replaced with 0.
# NOTE(review): this must mirror the preprocessing used when the model was
# fit — confirm against the training notebook.
feature_values = new_data[final_features]
X_new_log1p = np.log1p(feature_values.where(feature_values > 0))
X_new_log1p = X_new_log1p.replace([np.inf, -np.inf], np.nan).fillna(0)

# === Run prediction ===
y_pred = model.predict(X_new_log1p)

# === Save predictions ===
new_data['Predicted_Value'] = y_pred
new_data.to_csv('Data/Sameday/new_data_with_predictions.csv', index=False)

# === Compute metrics ===
# Metrics are computed on the log10 scale, the scale of both the observed
# column and the stored predictions.
y_true = new_data['Log10_E.coli'].values

r2 = r2_score(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

print("\nValidation Metrics:")
print(f"R²:    {r2:.4f}")
print(f"MSE:   {mse:.4f}")
print(f"MAE:   {mae:.4f}")
print(f"MAPE:  {mape:.4f}")

# === Save observed vs predicted to Excel ===
# The last two columns undo the log10 transform (10 ** value).
output_df = pd.DataFrame({
    'Date': new_data['Date'].values,
    'Observed_Log10_E.coli': y_true,
    'Predicted_Log10_E.coli': y_pred,
    'Observed_E.coli': 10 ** y_true,
    'Predicted_E.coli': 10 ** y_pred
})

# Write next to the other outputs using a project-relative path; the previous
# user-specific absolute path ('C:/Users/...') only worked on one machine.
# NOTE(review): datetime_format only affects datetime-typed cells; if 'Date'
# is read as text it is written verbatim — confirm the column dtype.
output_path = 'Data/Sameday/observed_vs_predicted_custom_interval.xlsx'
with pd.ExcelWriter(output_path, engine='xlsxwriter', datetime_format='yyyy-mm-dd') as writer:
    output_df.to_excel(writer, sheet_name='Results', index=False)

print(f"\nObserved vs. Predicted values saved to '{output_path}'.")



Validation Metrics:
R²:    0.6305
MSE:   0.1457
MAE:   0.3023
MAPE:  0.1373

Observed vs. Predicted values saved to 'C:/Users/as22dt/OneDrive - observed_vs_predicted_custom_interval.xlsx'.
