In [12]:
# Install necessary libraries
!pip install openai nest_asyncio scikit-learn --upgrade

# Mount Google Drive
# Every input and output path used below lives under /content/drive/MyDrive,
# so the Drive has to be mounted before any of those files are opened.
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import openai
from openai import AsyncOpenAI
import nest_asyncio
import asyncio
import traceback

# The notebook kernel already runs an event loop; nest_asyncio patches asyncio
# so the asyncio.run(main()) call at the bottom of this cell can run inside it.
nest_asyncio.apply()

########################################################
# 1) Set up the OpenAI client (read your API key)
########################################################
# The key is kept in a text file on the mounted Drive; leading/trailing
# whitespace (such as a final newline) is trimmed before use.
with open("/content/drive/MyDrive/key.txt", "r") as key_file:
    api_key = key_file.read().strip()

# Shared asynchronous client used by the generation and correction helpers.
client = AsyncOpenAI(api_key=api_key)

########################################################
# 2) Read the template notebook from file
#    (File name: RandomForestRegressorTemplate.ipynb.txt)
########################################################
# The full template text is embedded verbatim into the prompts built below.
template_path = "/content/drive/MyDrive/RandomForestRegressorTemplate.ipynb.txt"
with open(template_path, "r", encoding="utf-8") as template_file:
    template_notebook_content = template_file.read()

########################################################
# 3) Functions to Generate, Correct, and Test Notebooks
########################################################

async def generate_notebook_from_template(regressor_name, regressor_class):
    """
    Ask the chat model for a new .ipynb notebook for the given regressor.

    The template text read from RandomForestRegressorTemplate.ipynb.txt is sent
    as the system message, and the user message asks for an equivalent notebook
    built around ``regressor_class``.
    IMPORTANT: The prompt requires the generated notebook to use exactly the same
    feature_cols and target_cols as defined in the template.

    Args:
        regressor_name: Human-readable regressor name used in the prompt.
        regressor_class: scikit-learn class name used in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        NOT validated here; it may be invalid JSON. An empty string is returned
        when the model sends no text content.
    """
    system_message = (
        "Below is an example Google Colab notebook for predicting emission load using "
        "RandomForestRegressor. Use it as a reference to produce a similar .ipynb notebook for another regressor.\n\n"
        f"{template_notebook_content}\n\n"
        "Ensure the output is a complete and valid JSON object in .ipynb format with code cells only."
    )

    user_prompt = (
        f"Create a Google Colab .ipynb notebook to predict emission load using {regressor_name} "
        f"(i.e., scikit-learn's {regressor_class}). Structure the notebook similarly to the reference above, "
        "but change only the parts needed for the new regressor. IMPORTANT: Do not modify the definitions of "
        "feature_cols and target_cols; use exactly the same values as in the template. Also, explicitly calculate "
        "RMSE using:\n\n  rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n\n"
        "If the regressor does not support multi-output regression, either select one target column (for example, the first one) "
        "or wrap it using MultiOutputRegressor.\n\n"
        "Return ONLY valid JSON in .ipynb format with code cells and no extra text."
    )

    completion = await client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2
    )
    # message.content is optional in the API response; treating a missing reply
    # as an empty string lets the caller's validate/correct loop handle it
    # instead of crashing here with AttributeError on None.strip().
    reply_text = completion.choices[0].message.content
    return (reply_text or "").strip()

async def correct_notebook_code(bad_notebook_json, error_message):
    """
    Ask the chat model to repair a notebook that failed validation or execution.

    A single user message is sent containing the reference template, the error
    text, and the failing notebook JSON. The prompt instructs that feature_cols
    and target_cols must remain identical to the template.

    Args:
        bad_notebook_json: The notebook text (intended to be .ipynb JSON) that failed.
        error_message: The error text produced when the notebook was tested.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        NOT validated here. An empty string is returned when the model sends
        no text content.
    """
    correction_prompt = (
        "Below is the reference template from RandomForestRegressorTemplate:\n\n"
        f"{template_notebook_content}\n\n"
        "The previously generated notebook (in .ipynb JSON format) produced the following error when executed:\n\n"
        f"ERROR:\n{error_message}\n\n"
        "Notebook JSON:\n"
        f"{bad_notebook_json}\n\n"
        "Please correct the notebook so that it is valid JSON and runs successfully. "
        "IMPORTANT: Do not change the definitions of feature_cols and target_cols; they must be identical to those in the template. "
        "If there is an error regarding y being multi-dimensional, modify the code appropriately (e.g. select a single target column or wrap the regressor with MultiOutputRegressor).\n\n"
        "Return ONLY valid JSON in .ipynb format with code cells and no extra text."
    )
    completion = await client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": correction_prompt}],
        temperature=0.2
    )
    # message.content is optional in the API response; an empty string keeps the
    # caller's retry loop going instead of raising AttributeError on None.strip().
    reply_text = completion.choices[0].message.content
    return (reply_text or "").strip()

def validate_notebook_json(nb_content):
    """
    Check whether nb_content can be parsed as JSON.

    Only JSON syntax is checked; the parsed value is discarded and its
    structure (e.g. presence of notebook cells) is not inspected.

    Returns a tuple (is_valid, error_message): (True, "") on success, or
    (False, "JSONDecodeError: <details>") when parsing fails.
    """
    is_valid, error_message = True, ""
    try:
        json.loads(nb_content)
    except json.JSONDecodeError as decode_error:
        is_valid = False
        error_message = f"JSONDecodeError: {decode_error}"
    return is_valid, error_message

def test_notebook(ipynb_path):
    """
    Execute the code cells of a notebook file in-process and report the outcome.

    The file is parsed as JSON; every cell whose "cell_type" is "code" is run
    with exec() in one shared namespace, in order, so later cells can use names
    defined by earlier ones. Execution stops at the first cell that raises.

    NOTE(security): the cells are executed with exec() inside this process, with
    this process's privileges. The notebooks run here are model-generated, so
    only use this in a disposable environment.

    Args:
        ipynb_path: Path of the .ipynb (JSON) file to run.

    Returns:
        (True, "Notebook ran successfully.") if all code cells run without error;
        otherwise (False, error message). The message describes either the
        parse/structure problem or the failing cell index with its traceback.
    """
    try:
        with open(ipynb_path, "r", encoding="utf-8") as f:
            notebook_json = json.load(f)
    except Exception as e:
        return False, f"Failed to parse notebook JSON: {str(e)}"

    # json.load can return any JSON value (number, string, list, ...); only an
    # object can carry a "cells" key, and `in` on a non-container would raise.
    if not isinstance(notebook_json, dict) or "cells" not in notebook_json:
        return False, "Invalid notebook JSON: missing 'cells' key."

    cells = notebook_json["cells"]
    if not isinstance(cells, list):
        return False, "Invalid notebook JSON: 'cells' must be a list."

    # Without an explicit __name__, lookups fall through to the builtins module
    # ('builtins'), so `if __name__ == "__main__":` blocks would be skipped
    # silently and the notebook reported as successful without running them.
    global_env = {"__name__": "__main__"}
    for idx, cell in enumerate(cells):
        if not isinstance(cell, dict):
            return False, f"Invalid notebook JSON: cell {idx} is not an object."
        if cell.get("cell_type") != "code":
            continue
        source_lines = cell.get("source", [])
        try:
            # Joining happens inside the try so a malformed "source" (e.g. a
            # list holding non-strings) is reported as a cell error too.
            code_str = "".join(source_lines) if isinstance(source_lines, list) else source_lines
            exec(code_str, global_env)
        except Exception as e:
            tb_str = traceback.format_exc()
            return False, f"Error in cell {idx}: {str(e)}\nTraceback:\n{tb_str}"

    return True, "Notebook ran successfully."


# The name matches pytest's test_* discovery pattern; mark the helper so pytest
# does not try to collect it as a test case (it would fail on the missing
# "ipynb_path" fixture).
test_notebook.__test__ = False

########################################################
# 4) Main Logic: Generate, Test, and Correct Notebooks
########################################################
async def main():
    """
    Generate, test, and iteratively correct one notebook per listed regressor.

    Each non-empty line of regressor-list.txt is read as either
    "Display Name (Identifier)" or a bare name (spaces removed to form the
    identifier). For every entry the notebook text is requested from the model,
    written to <identifier>.ipynb and <identifier>.ipynb.txt in the output
    directory, and executed with test_notebook(). On failure the error is sent
    back to the model for a corrected version, up to max_tries test runs.

    A correction is requested only when another test run will follow, so the
    files on disk always hold the notebook that was last tested.
    """
    output_dir = "/content/drive/MyDrive/GeneratedNotebooksTest"
    os.makedirs(output_dir, exist_ok=True)

    # Read your regressor list from regressor-list.txt
    regressor_file = "/content/drive/MyDrive/regressor-list.txt"
    with open(regressor_file, "r", encoding="utf-8") as f:
        lines = [ln.strip() for ln in f if ln.strip()]

    max_tries = 5

    for line in lines:
        if "(" in line and ")" in line:
            display_name = line.split("(")[0].strip()
            identifier = line.split("(")[1].split(")")[0].strip()
        else:
            display_name = line
            identifier = line.replace(" ", "")

        print(f"\n=== Generating .ipynb for {display_name} ({identifier}) ===")
        # Prepare filenames: one as .ipynb and one as .ipynb.txt
        notebook_filename = f"{identifier}.ipynb"
        notebook_path = os.path.join(output_dir, notebook_filename)
        notebook_txt_path = os.path.join(output_dir, f"{identifier}.ipynb.txt")

        # Generate initial notebook JSON from template
        nb_content = await generate_notebook_from_template(display_name, identifier)

        # Validate JSON before proceeding. Invalid JSON is only reported here:
        # the test run below fails on it and feeds the parse error to the model.
        valid, err_msg = validate_notebook_json(nb_content)
        if not valid:
            print(f"Initial generated notebook JSON is invalid: {err_msg}")

        succeeded = False
        for attempt in range(1, max_tries + 1):
            # Save the notebook JSON to the .ipynb file
            with open(notebook_path, "w", encoding="utf-8") as f:
                f.write(nb_content)
            # Also save as a text file for reference
            with open(notebook_txt_path, "w", encoding="utf-8") as f:
                f.write(nb_content)

            print(f"\n[Attempt {attempt}] Testing {notebook_filename} ...")
            success, test_msg = test_notebook(notebook_path)
            if success:
                print(f"✔️  {notebook_filename} ran successfully. Done.")
                succeeded = True
                break

            print(f"❌  Error on attempt {attempt}: {test_msg}")
            if attempt == max_tries:
                # No further test run follows, so a correction requested now
                # would cost an API call and never be saved or executed.
                break

            nb_content = await correct_notebook_code(nb_content, test_msg)
            # Validate the corrected notebook JSON
            valid, err_msg = validate_notebook_json(nb_content)
            if not valid:
                print(f"Corrected notebook JSON is still invalid: {err_msg}")

        if not succeeded:
            print(f"❌ Failed to produce a working notebook for {identifier} after {max_tries} attempts.")

# Run the asynchronous main function.
# This executes at cell level inside the notebook; nest_asyncio.apply() (called
# near the imports) is what allows asyncio.run() to be used here.
asyncio.run(main())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

=== Generating .ipynb for k-NN Regressor (KNeighborsRegressor) ===

[Attempt 1] Testing KNeighborsRegressor.ipynb ...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
❌  Error on attempt 1: Error in cell 2: name 'feature_cols' is not defined
Traceback:
Traceback (most recent call last):
  File "<ipython-input-12-23f921285168>", line 130, in test_notebook
    exec(code_str, global_env)
  File "<string>", line 14, in <module>
NameError: name 'feature_cols' is not defined


[Attempt 2] Testing KNeighborsRegressor.ipynb ...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Prediction Results with Additional Accuracy Metrics:
              Parameter           MSE      RMSE   R2 Score       MAE  \
0  SO2TONS



{'RMSE': np.float64(0.20277825312579467)}
✔️  MLPRegressor.ipynb ran successfully. Done.

=== Generating .ipynb for Linear Regression (LinearRegression) ===

[Attempt 1] Testing LinearRegression.ipynb ...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
❌  Error on attempt 1: Error in cell 2: name 'feature_cols' is not defined
Traceback:
Traceback (most recent call last):
  File "<ipython-input-12-23f921285168>", line 130, in test_notebook
    exec(code_str, global_env)
  File "<string>", line 14, in <module>
NameError: name 'feature_cols' is not defined


[Attempt 2] Testing LinearRegression.ipynb ...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✔️  LinearRegression.ipynb ran successfully. Done.

=== Generating .ipynb for CART Regressor (DecisionTreeRegressor) ===

[Attempt 1] Testing DecisionTreeRegressor.ipynb ...
Drive a

KeyboardInterrupt: 