In [2]:
# ----------------------------------
# 1. Mount Google Drive and Imports
# ----------------------------------

# Google Colab helper module that exposes the Drive mounting API.
from google.colab import drive
# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive (e.g. the API key file read below) can be opened
# with ordinary file I/O.
drive.mount('/content/drive')

import os
import asyncio
import nest_asyncio
import openai

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    median_absolute_error,
    explained_variance_score
)

# Patch asyncio so the asyncio.run() call at the end of this script works
# inside a notebook (presumably because the kernel already runs its own
# event loop -- see the nest_asyncio documentation).
nest_asyncio.apply()  # allows asyncio.run() in notebooks

# ----------------------------------
# 2. Configure OpenAI Client
# ----------------------------------

# Read the API key from a file on the mounted Google Drive instead of
# hard-coding it in the notebook. 'utf-8-sig' decodes plain UTF-8/ASCII and
# also discards a leading byte-order mark, which .strip() would not remove.
with open('/content/drive/MyDrive/key.txt', 'r', encoding='utf-8-sig') as file:
    api_key = file.read().strip()

# Fail fast: an empty key would still construct a client and only surface
# later as an authentication error on the first request.
if not api_key:
    raise ValueError(
        "OpenAI API key file is empty: /content/drive/MyDrive/key.txt"
    )

# Async client; it is awaited inside get_chatgpt_comment() further down.
client = openai.AsyncOpenAI(api_key=api_key)

# ----------------------------------
# 3. Load and Preprocess the Dataset
# ----------------------------------

url = "https://raw.githubusercontent.com/apownukepcc/spring-2025-datathon/main/009-Dataset-For-Predictions-With-Specific-Emissions.csv"
df = pd.read_csv(url)

# Convert 'date' column to datetime so it can be compared with a Timestamp
df['date'] = pd.to_datetime(df['date'])

# Define the specific date to be included in the test set later
specific_test_date = pd.to_datetime("2022-07-15")

# Define feature columns (weather parameters)
feature_cols = ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']

# Define target columns (specific emissions metrics)
target_cols = [
    'SO2TONS_per_LOADMWBA', 'SO2TONS_per_LOADMWBT',
    'NH3TONS_per_LOADMWBA', 'NH3TONS_per_LOADMWBT',
    'NOXTONS_per_LOADMWBA', 'NOXTONS_per_LOADMWBT',
    'COTONS_per_LOADMWBA', 'COTONS_per_LOADMWBT',
]

# Ensure the specific test date is in the dataset. An explicit boolean mask
# is used instead of `Timestamp in ndarray`, whose result depends on how
# NumPy coerces the Timestamp for comparison.
if not (df['date'] == specific_test_date).any():
    raise ValueError(f"No data available for the date: {specific_test_date}")

# Ensure no missing values in features and targets. This runs BEFORE the
# held-out row is carved out so that row is subject to the same filter as
# every other row; otherwise NaNs in it would reach the model and metrics.
df = df.dropna(subset=feature_cols + target_cols)

# Extract the row for the specific test date and remove it from the main dataset
is_specific_date = df['date'] == specific_test_date
specific_test_row = df[is_specific_date]
if specific_test_row.empty:
    raise ValueError(
        f"Data for the date {specific_test_date} has missing feature/target "
        "values and cannot be used for evaluation"
    )
df = df[~is_specific_date]

# ----------------------------------
# 4. Train-Test Split and Model Training
# ----------------------------------

# Inputs are the weather columns; outputs are all emission columns at once.
X, y = df[feature_cols], df[target_cols]

# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tack the held-out date onto the end of both test frames so that it is
# always the final row of the evaluation set.
X_test, y_test = (
    pd.concat([held_out, specific_test_row[columns]])
    for held_out, columns in ((X_test, feature_cols), (y_test, target_cols))
)

# Build and fit the forest in a single expression (fit() returns the estimator).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score every test row, including the appended held-out date.
predictions = rf.predict(X_test)

# ----------------------------------
# 5. Compute Accuracy Metrics
# ----------------------------------

# Every metric is reported per target column instead of as one average.
_per_target = {'multioutput': 'raw_values'}

mse = mean_squared_error(y_test, predictions, **_per_target)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions, **_per_target)
mae = mean_absolute_error(y_test, predictions, **_per_target)
med_ae = median_absolute_error(y_test, predictions, **_per_target)
explained_var = explained_variance_score(y_test, predictions, **_per_target)

# One row per target; column order follows the header list below.
results = pd.DataFrame(dict(zip(
    ['Parameter', 'MSE', 'RMSE', 'R2 Score', 'MAE', 'Median AE', 'Explained Variance'],
    [target_cols, mse, rmse, r2, mae, med_ae, explained_var],
)))

print("Prediction Results with Additional Accuracy Metrics:", results, sep="\n")

# The held-out date was appended last, so the final row of each frame is
# the one to compare.
pred_df = pd.DataFrame(predictions, index=y_test.index, columns=target_cols)
specific_pred, specific_actual = pred_df.iloc[-1], y_test.iloc[-1]

comparison_df = pd.DataFrame(dict(Actual=specific_actual, Predicted=specific_pred))

print(
    f"\nComparison for specific test date ({specific_test_date.date()}):",
    comparison_df,
    sep="\n",
)

# ----------------------------------
# 6. Prepare Summary for ChatGPT
# ----------------------------------

# Prompt text sent to the chat model. The method description is filled in
# from the live objects (fitted forest, feature list, held-out date) rather
# than hard-coded, so it cannot drift from what was actually run. The 80%
# figure is still literal: it mirrors test_size=0.2 in the split above.
summary = f"""
Prediction Results with Additional Accuracy Metrics:
{results.to_string(index=False)}

Comparison for specific test date ({specific_test_date.date()}):
{comparison_df.to_string()}

Predictive Method Description:
A Random Forest Regressor with {rf.n_estimators} estimators (random_state={rf.random_state}) was trained on 80% of the dataset using weather features
({', '.join(feature_cols)}) to predict emissions metrics (SO2, NH3, NOX, COTONS per LOADMWBA/BT).
A specific date ({specific_test_date.date()}) was held out from the training set and then appended to the test set for a focused prediction
comparison. Performance was evaluated using multiple metrics: MSE, RMSE, R² Score, MAE, Median AE, and Explained Variance.

Question:
Based on the above results and methodology, please provide a comment on the accuracy of these predictions
and any recommendations for improvement.
"""

# ----------------------------------
# 7. Async Function to Query ChatGPT
# ----------------------------------
async def get_chatgpt_comment(user_message, *, model="gpt-3.5-turbo",
                              temperature=0.2, api_client=None):
    """Send one user message to the chat-completions API and return the reply.

    Args:
        user_message: Prompt text, sent as a single ``user`` message.
        model: Chat model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        api_client: Object exposing an awaitable
            ``chat.completions.create(...)``. When omitted, the module-level
            ``client`` is used.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when that content is ``None``.
    """
    active_client = client if api_client is None else api_client
    response = await active_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": user_message}],
        temperature=temperature,
    )
    # The API may return a message whose content is null; calling .strip()
    # on it directly would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

# ----------------------------------
# 8. Send Query and Print Response
# ----------------------------------
# Run the coroutine to completion and keep the reply in comment_on_accuracy.
comment_on_accuracy = asyncio.run(get_chatgpt_comment(summary))

# Banner and reply are emitted by a single print call, one per line.
print("\n--- ChatGPT Comment on Accuracy ---\n", comment_on_accuracy, sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Prediction Results with Additional Accuracy Metrics:
              Parameter           MSE      RMSE   R2 Score       MAE  \
0  SO2TONS_per_LOADMWBA  4.197568e-10  0.000020  -0.630971  0.000014   
1  SO2TONS_per_LOADMWBT  1.461227e-10  0.000012  -2.246682  0.000001   
2  NH3TONS_per_LOADMWBA  1.787414e-07  0.000423  -0.067237  0.000175   
3  NH3TONS_per_LOADMWBT  1.528978e-07  0.000391  -0.054744  0.000033   
4  NOXTONS_per_LOADMWBA  5.148829e-06  0.002269 -10.141955  0.000408   
5  NOXTONS_per_LOADMWBT  5.088503e-06  0.002256 -10.581662  0.000176   
6   COTONS_per_LOADMWBA  3.623751e-06  0.001904  -7.736545  0.000257   
7   COTONS_per_LOADMWBT  3.623631e-06  0.001904  -7.832085  0.000161   

      Median AE  Explained Variance  
0  1.028400e-05           -0.621141  
1  6.888574e-08           -2.246039  
2  1.347781e-04           -0.064674  
3  8.121663e-06  