In [None]:
import json
import os

# Show the working directory: the data paths used below ("./stats/...") are relative to it.
print(os.getcwd())

# import logging

import matplotlib.pyplot as plt
import numpy as np
from autogen import OpenAIWrapper


def load_and_structure_data(jsonl_file_paths):
    """Group code-execution attempts from several JSONL result files by task.

    Each file is identified by its base name, which selects the model label
    stored with every attempt read from that file. Every non-blank line must
    be a JSON object with a "task_description" string and an "exe_feedback"
    list; each feedback entry contributes one attempt.

    Args:
        jsonl_file_paths: Iterable of paths to JSONL files. The base name of
            each path must be one of the known result-file names.

    Returns:
        dict mapping task description -> {"attempts": [...]}, where every
        attempt is {"model": str, "code": ..., "exit_code": ...}. "code" and
        "exit_code" are None when the feedback entry lacks them.

    Raises:
        KeyError: If a file's base name has no known model label, or a record
            lacks "task_description" / "exe_feedback".
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    model_map = {
        "combined_stats_a2a.jsonl": "a2a_gpt4",
        "combined_stats_a2a_turbo.jsonl": "a2a_turbo",
        "combined_stats_baseGPT.jsonl": "base_gpt4",
        "combined_stats_base_turbo.jsonl": "base_turbo",
    }
    structured_data = {}

    for file_path in jsonl_file_paths:
        file_name = os.path.basename(file_path)
        if file_name not in model_map:
            # Same exception type as a plain dict lookup, but with an actionable message.
            raise KeyError(
                f"No model label is known for results file {file_name!r}; "
                f"expected one of {sorted(model_map)}"
            )
        model = model_map[file_name]

        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                # Tolerate blank lines (e.g. trailing newlines), which json.loads rejects.
                if not line.strip():
                    continue
                data = json.loads(line)
                # The task entry is created even when the record has no feedback items.
                attempts = structured_data.setdefault(
                    data["task_description"], {"attempts": []}
                )["attempts"]
                for feedback in data["exe_feedback"]:
                    attempts.append(
                        {
                            "model": model,
                            "code": feedback.get("code"),
                            "exit_code": feedback.get("exit_code"),
                        }
                    )
    return structured_data


# Usage: pass a list of JSONL paths to load_and_structure_data, e.g.
# structured_data = load_and_structure_data(["first.jsonl", "second.jsonl"])


# One results file per model/configuration; load_and_structure_data derives the
# model label from each file's base name.
file_paths = [
    "./stats/tasks_and_code_exe_results/combined_stats_a2a.jsonl",
    "./stats/tasks_and_code_exe_results/combined_stats_a2a_turbo.jsonl",
    "./stats/tasks_and_code_exe_results/combined_stats_baseGPT.jsonl",
    "./stats/tasks_and_code_exe_results/combined_stats_base_turbo.jsonl",
]


# `x` maps task description -> {"attempts": [...]}; later cells read and annotate it.
x = load_and_structure_data(file_paths)

In [None]:
# Reviewer models to register with the client. Every entry reads the same API key
# from the OPENAI_APIKEY2 environment variable (None when it is unset).
# Further models (e.g. gpt-3.5-turbo variants) can be added to the tuple.
config_list = [
    {"model": model_name, "api_key": os.environ.get("OPENAI_APIKEY2")}
    for model_name in (
        "gpt-4",
        "gpt-4-0125-preview",
        "gpt-4-1106-preview",
    )
]

# Base settings for the reviewer; caching and seed options are left at their defaults.
base_cfg = {"config_list": config_list, "temperature": 1.0}

In [None]:
# Number of recorded attempts (from all loaded files) for the ERM task.
len(x['Create a python file to help me understand why empirical risk minimization is NP hard.']['attempts'])

In [None]:
import json
import os

# Shared LLM client used by semantic_code_analysis.
# NOTE(review): only base_cfg["config_list"] is forwarded here; the "temperature"
# entry of base_cfg is not passed to the wrapper — confirm that is intended.
client = OpenAIWrapper(config_list=base_cfg["config_list"])


def semantic_code_analysis(task_description, code):
    """Request an LLM review of ``code`` with respect to ``task_description``.

    Sends a two-message chat (system instructions plus a user message that
    embeds the task and the code) through the module-level ``client``.

    Args:
        task_description: The task the code was generated for.
        code: The generated code to review; it is interpolated into the prompt.

    Returns:
        The message content of the first choice in the response, which the
        prompt asks to be a short review followed by a 1-10 score.
    """
    reviewer_instructions = "You are tasked with providing a review of the over all robustness and quality of the generated code. Use criteria such as correctness, complexity, and MOST IMPORTANTLY adherence to the desire of the task description from the user. Ensure you provide concise step by step reasoning, and to do your best, this is important. After providing a brief analysis of the code quality I want you to give it a rating from 1-10 as an evaluation score in the following way:\n<Your REVIEW HERE>\n<Your SCORE (between 1-10) HERE, for example 5>"
    review_request = f"Given the task description: {task_description}\nThe given code:\n{code}\n Please provide a review of the code quality and a score based on the given task."

    conversation = [
        {"role": "system", "content": reviewer_instructions},
        {"role": "user", "content": review_request},
    ]
    completion = client.create(messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content


def integrate_semantic_analysis_with_data(
    structured_data,
    skip_tasks=(
        "Create a python file to help me understand why empirical risk minimization is NP hard.",
    ),
    review_fn=None,
):
    """Attach an LLM code review to every attempt of every non-skipped task.

    The input is annotated IN PLACE: each processed attempt gains a
    "code_review" key. The same object is returned for convenience, so the
    return value and the argument alias each other.

    Args:
        structured_data: dict mapping task description -> {"attempts": [...]},
            where each attempt is a dict with at least a "code" key.
        skip_tasks: Task descriptions to leave untouched (their attempts get no
            "code_review" key). Defaults to the single task that was previously
            hard-coded here. A lone string is treated as one task description.
        review_fn: Callable ``(task_description, code) -> review``. Defaults to
            ``semantic_code_analysis``; pass a stub to exercise this function
            without calling the LLM.

    Returns:
        ``structured_data`` itself, after annotation.
    """
    if review_fn is None:
        # Resolved at call time so the default always tracks the notebook's current definition.
        review_fn = semantic_code_analysis
    if isinstance(skip_tasks, str):
        # Guard against set("...") splitting a single description into characters.
        skip_tasks = (skip_tasks,)
    excluded = set(skip_tasks)

    for task_desc, task_data in structured_data.items():
        if task_desc in excluded:
            continue
        for attempt in task_data["attempts"]:
            attempt["code_review"] = review_fn(task_desc, attempt["code"])
        # Progress indicator: one line per completed task.
        print(task_desc)
    return structured_data


# Runs the review pass: one client.create call per attempt of every non-skipped task.
# The function annotates `x` in place and returns it, so `structured_data_with_reviews`
# and `x` refer to the same object.
structured_data_with_reviews = integrate_semantic_analysis_with_data(x)

In [None]:
# Persist the reviewed data so later cells can reload it without repeating the
# review pass. The directory is created on demand so a fresh checkout does not fail
# with FileNotFoundError, and the path is spelled the same way as where it is read back.
os.makedirs("./stats/gpt_code_review", exist_ok=True)
with open("./stats/gpt_code_review/t.json", "w", encoding="utf-8") as f:
    json.dump(
        structured_data_with_reviews,
        f,
        indent=4,
    )

In [None]:
# Attempt count for the ERM task after the review pass (the pass only adds a
# "code_review" key to attempts; it does not add or remove attempts).
len(structured_data_with_reviews['Create a python file to help me understand why empirical risk minimization is NP hard.']['attempts'])

In [None]:
def verify_code_reviews(structured_data):
    """Count reviewed attempts and list the tasks that still miss reviews.

    Args:
        structured_data: dict mapping task description -> {"attempts": [...]},
            where a reviewed attempt carries a "code_review" key.

    Returns:
        (total_reviews, tasks_without_review):
            total_reviews is the number of attempts that have a "code_review"
            key; tasks_without_review lists, once each and in iteration order,
            the task descriptions with at least one attempt lacking that key.
    """
    total_reviews = 0
    tasks_without_review = []

    for task_desc, task_data in structured_data.items():
        attempts = task_data["attempts"]
        reviewed = sum("code_review" in attempt for attempt in attempts)
        total_reviews += reviewed
        # Report each task once, not once per unreviewed attempt, so the
        # resulting list stays readable when a whole task was skipped.
        if reviewed < len(attempts):
            tasks_without_review.append(task_desc)

    return total_reviews, tasks_without_review


# Summarise review coverage for the data produced by the review pass.
total_reviews, tasks_without_review = verify_code_reviews(structured_data_with_reviews)

print(f"Total code reviews: {total_reviews}")
print(
    f"Tasks without code review: {tasks_without_review}"
    if tasks_without_review
    else "All attempts have code reviews."
)

In [None]:
# LaTeX reserved characters and their text-mode replacements.
_LATEX_CELL_ESCAPES = {
    "\\": "\\textbackslash{}",
    "&": "\\&",
    "%": "\\%",
    "$": "\\$",
    "#": "\\#",
    "_": "\\_",
    "{": "\\{",
    "}": "\\}",
    "~": "\\textasciitilde{}",
    "^": "\\textasciicircum{}",
}


def _escape_latex_cell(text):
    """Return ``text`` flattened to one line with LaTeX reserved characters escaped."""
    # Collapse all whitespace runs: a blank line inside an `l` column is a LaTeX error.
    flattened = " ".join(str(text).split())
    return "".join(_LATEX_CELL_ESCAPES.get(char, char) for char in flattened)


def generate_latex_table(structured_data):
    """Render a compact LaTeX table with one row per attempt.

    Columns are: model, task description (first 50 characters), code (first
    100 characters) and the first line of the review. Every truncated field
    is followed by "...". All cell text is escaped so that characters such as
    ``_``, ``&`` or ``%`` (present in model names and in code) yield valid
    LaTeX, and whitespace is collapsed so a snippet never breaks a cell.

    Args:
        structured_data: dict mapping task description -> {"attempts": [...]}.
            Each attempt needs a "model" key; "code" and "code_review" may be
            missing or None, in which case the cell shows only "...".

    Returns:
        The complete ``table`` environment as a string (requires the ``float``
        package for the ``[H]`` placement specifier).
    """
    header = (
        "\\begin{table}[H]\n\\centering\n\\begin{tabular}{|l|l|l|p{5cm}|}\n\\hline\n"
        "Model & Task & Code (Truncated) & Review (Truncated) \\\\\\hline\n"
    )
    footer = "\\end{tabular}\n\\caption{Summary of Code Reviews}\n\\label{tab:code_reviews}\n\\end{table}"

    rows = []
    for task_desc, task_data in structured_data.items():
        # Truncate first, then escape, so an escape sequence is never cut in half.
        task_cell = _escape_latex_cell(task_desc[:50])
        for attempt in task_data["attempts"]:
            model = _escape_latex_cell(attempt["model"])
            code = _escape_latex_cell((attempt.get("code") or "")[:100]) + "..."
            first_review_line = (attempt.get("code_review") or "").split("\n")[0]
            review = _escape_latex_cell(first_review_line) + "..."
            rows.append(
                f"{model} & \\parbox[t]{{5cm}}{{{task_cell}...}} & {code} & {review} \\\\\\hline\n"
            )

    return header + "".join(rows) + footer


# Reload the reviewed data from disk. This rebinds `x`, so the cells below work from
# the saved JSON rather than from the in-memory result of the review pass.
with open("./stats/gpt_code_review/t.json", "r") as f:
    x = json.load(f)
# Optional: render the compact per-attempt table.
# latex_code = generate_latex_table(x)

# Optional: print the LaTeX source or save it to a file.
# print(latex_code)
# with open('code_reviews_table.tex', 'w') as f:
#     f.write(latex_code)

In [None]:
# code initially generated by GPT-4 :)

# LaTeX reserved characters and their text-mode replacements.
_LATEX_TEXT_ESCAPES = {
    "\\": "\\textbackslash{}",
    "&": "\\&",
    "%": "\\%",
    "$": "\\$",
    "#": "\\#",
    "_": "\\_",
    "{": "\\{",
    "}": "\\}",
    "~": "\\textasciitilde{}",
    "^": "\\textasciicircum{}",
}


def _escape_latex_text(text):
    """Return ``text`` with LaTeX reserved characters escaped (newlines are kept)."""
    return "".join(_LATEX_TEXT_ESCAPES.get(char, char) for char in str(text))


def generate_latex_table_and_appendix_single_attempt_per_model(structured_data):
    """Build a LaTeX summary table plus an appendix with code and reviews.

    For every task only the FIRST attempt of each model is used. The table
    gets one row per (task, model) that references the matching appendix
    subsubsection; the appendix shows the generated code verbatim followed by
    the review text.

    Text placed in normal LaTeX mode (model names, task descriptions, reviews)
    is escaped so characters such as ``_``, ``%`` or ``#`` yield valid LaTeX.
    Code goes into a ``verbatim`` environment and is therefore left unescaped;
    label/ref keys also keep the raw model name.

    Args:
        structured_data: dict mapping task description -> {"attempts": [...]}.
            Each attempt needs a "model" key. A missing or None "code" is
            rendered as an empty listing, and a missing or None "code_review"
            as "No review available." instead of raising.

    Returns:
        (table_code, appendix_section) as two strings.
    """
    appendix_parts = [
        "\n\\begin{appendix}\n",
        "    \\section{Appendix: Detailed Code Reviews and Generated Code}\n",
    ]
    table_parts = [
        "\\begin{table}[H]\n\\centering\n"
        "\\begin{tabular}{|l|l|p{6cm}|}\n\\hline\n"
        "Model & Task Description & Code and Review Reference \\\\\\hline\n"
    ]

    for task_counter, (task_desc, task_data) in enumerate(structured_data.items(), start=1):
        task_title = _escape_latex_text(task_desc)
        # Truncate first, then escape, so an escape sequence is never cut in half.
        task_short = _escape_latex_text(task_desc[:50])

        appendix_parts.append(f"    \\subsection{{Task {task_counter}: {task_title}}}\n")
        appendix_parts.append(f"    \\label{{sec:code{task_counter}}}\n\n")

        seen_models = set()  # models already rendered for this task
        for attempt in task_data["attempts"]:
            model = attempt["model"]
            if model in seen_models:
                continue
            seen_models.add(model)

            model_text = _escape_latex_text(model)
            section_label = f"sec:code{task_counter}-{model}"

            # Indent continuation lines so the listing lines up inside the appendix source.
            # NOTE: code that itself contains "\end{verbatim}" would close the listing early.
            code = (attempt.get("code") or "").replace("\n", "\n    ")
            review_raw = attempt.get("code_review") or "No review available."
            review = _escape_latex_text(review_raw).replace("\n", "\n    ")

            table_parts.append(
                f"{model_text} & {task_short}... & See Appendix, Section \\ref{{{section_label}}} \\\\\\hline\n"
            )
            appendix_parts.append(f"    \\subsubsection{{Model: {model_text}}}\n")
            appendix_parts.append(f"    \\label{{{section_label}}}\n\n")
            appendix_parts.append(
                "    \\textbf{Generated Code:}\n\n    \\begin{verbatim}\n"
            )
            appendix_parts.append(f"    {code}\n " + "\\end{verbatim}\n\n")
            appendix_parts.append("    \\textbf{Review:}\n\n" + f"{review}\n\n")

    table_parts.append(
        "\\end{tabular}\n\\caption{Summary of Code Reviews and Generated Code}\n\\label{tab:code-reviews}\n\\end{table}\n"
    )
    appendix_parts.append("\\end{appendix}\n")

    return "".join(table_parts), "".join(appendix_parts)


# Build both LaTeX fragments from `x`, the structured review data.
latex_table_code, latex_appendix_section = (
    generate_latex_table_and_appendix_single_attempt_per_model(x)
)

# Print the appendix here; uncomment the next line to print the summary table too.
# print(latex_table_code)
print(latex_appendix_section)
# Both strings can be written to a .tex file as needed.

In [None]:
# Display the summary-table LaTeX source as the cell output.
latex_table_code