In [1]:
import os
import getpass

import langsmith

from human_eval.data import read_problems, write_jsonl

from codechain.generation import HumanEvalChain, CompleteCodeChain
from codechain.evaluation import HumanEvalEvaluator

from langchain.chat_models import ChatOpenAI
from langchain.smith import arun_on_dataset, RunEvalConfig

import datetime

In [3]:
"""Setup."""


# Set environment variables
def _set_if_undefined(var: str) -> None:
    """Prompt for an environment variable when it is missing or empty.

    Args:
        var: Name of the environment variable to check.

    If ``os.environ`` holds no non-empty value for ``var``, the value is read
    interactively through ``getpass.getpass`` and stored in ``os.environ``.
    An existing non-empty value is left untouched.
    """
    if not os.environ.get(var):
        # `getpass` is imported as a module, so the prompt function is
        # `getpass.getpass`; calling the module object raises TypeError.
        os.environ[var] = getpass.getpass(f"Please provide your {var}")
# Prompt for each API key that is not already present in the environment.
for _required_key in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY", "TAVILY_API_KEY"):
    _set_if_undefined(_required_key)


# Turn on tracing in LangSmith.
os.environ["LANGCHAIN_TRACING_V2"] = "true"


# Version of the coding agent under test; it is embedded in the
# LANGCHAIN_PROJECT name set below.
coding_agent_version = "v1"

# Only the known versions get a project name; any other value leaves
# LANGCHAIN_PROJECT untouched. The name previously read "Coding Agetn"
# (typo), which is corrected to "Coding Agent" here.
if coding_agent_version in ("v1", "v2", "v3"):
    os.environ["LANGCHAIN_PROJECT"] = (
        f"Coding Agent {coding_agent_version}: HumanEval Coding Benchmark"
    )


# Dataset settings
dataset_name = "humaneval-small"  # alternative: "humaneval-all"
description = "HumanEval dataset"
max_problems = 3  # alternative: False
repetitions_per_problem = 5


# LLM settings
provider = "openai"
model_name = "gpt-3.5-turbo-0125"  # alternative: "gpt-4"
temperature = 0.2


# Create the LangSmith client. The bare `client` expression on the last line
# of the cell makes the notebook display the object as the cell output.
client = langsmith.Client()
client

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Minimal demo pipeline: prompt template -> chat model -> plain-string output.
llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"])
output_parser = StrOutputParser()

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are world class technical documentation writer."),
        ("user", "{input}"),
    ]
)

# Compose the three stages with the pipe operator.
chain = prompt | llm | output_parser

# chain.invoke({"input": "how can langsmith help with testing?"})

In [None]:


def generate_one_completion(problem):
    """Placeholder for generating a completion for a single problem.

    Args:
        problem: The problem input to complete (currently unused).

    Returns:
        None. TODO: generation is not implemented yet, so every call
        yields None.
    """
    return None

In [4]:
"""Dataset creation."""

# Load the HumanEval problems, keeping only the first `max_problems` entries
# when a limit is configured (a falsy value keeps all of them).
problems = read_problems()
if max_problems:
    problems = dict(list(problems.items())[:max_problems])


# Produce `repetitions_per_problem` completions per task and write them all
# to a JSONL file.
samples = [
    {"task_id": task_id, "completion": generate_one_completion(problem["prompt"])}
    for task_id, problem in problems.items()
    for _ in range(repetitions_per_problem)
]
write_jsonl("samples.jsonl", samples)




In [5]:
# """Dataset creation."""

# # Get the HumanEval dataset up to max_problems.
# problems = read_problems()
# if max_problems:
#     problems = {key: problems[key] for key in list(problems.keys())[:max_problems]}


# # If the dataset is new, upload it to the LangSmith server.
# if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):
#     dataset = client.create_dataset(dataset_name, description=description)
#     for key, value in problems.items():
#         client.create_example(
#             inputs={
#                 "prompt": value["prompt"],
#                 "task_id": key
#                 },
#             outputs={
#                 "canonical_solution": value["canonical_solution"],
#                 },
#             dataset_id=dataset.id
#         )

# """Coding agent creation."""

# # Factory for the generation chain
# def chain_factory():
#     """Create a code generation chain."""

#     llm_args = {
#         "model_name": model_name,
#         "temperature": temperature
#     }
#     if provider == "openai":
#       llm = ChatOpenAI(**llm_args)

#     return HumanEvalChain.from_llm(llm)

# """Evaluation."""


# # Evaluator configuration
# evaluation = RunEvalConfig(
#     custom_evaluators=[HumanEvalEvaluator()],
#     input_key="task_id"
#     )

# timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# # Run all generations and evaluations
# for index in range(repetitions_per_problem):

#     chain_results = await arun_on_dataset(
#         client=client,
#         dataset_name=dataset_name,
#         project_name=f"HumanEval {timestamp} — {index}",
#         concurrency_level=10,
#         llm_or_chain_factory=chain_factory,
#         evaluation=evaluation,
#         tags=["HumanEval"],
#         verbose=True
#     )

# import pandas as pd
# from google.colab import data_table

# project_names = [f"HumanEval {timestamp} — {index}" for index in range(0, repetitions_per_problem)]

# # Retrieve a project from LangSmith
# def get_project_as_df(project_name):
#     print(f"Downloading {project_name}")
#     runs = list(client.list_runs(project_name=project_name, execution_order=1))
#     return pd.DataFrame(item.dict() for item in runs)

# # Retrieve all projects and combine
# dfs = [get_project_as_df(project_name) for project_name in project_names]
# df = pd.concat(dfs, ignore_index=True)

# # Display the results
# data_table.enable_dataframe_formatter()
# df