## Context preparation

Fill the context with issue information, including the title, output, PR, etc.

To process a different group, set `group` to one of the values listed in the trailing comment.

The output is under the `llm_label/{group}` directory.

In [None]:
import os
import json
import requests

TOKEN = "ghp_YOUR_TOKEN"


def get_pr(timeline: str):
    """Return the title of the first pytorch/pytorch PR that cross-references an issue.

    Args:
        timeline: URL of the issue's timeline endpoint; it is requested with
            the module-level ``TOKEN`` as a bearer token.

    Returns:
        The title of the first "cross-referenced" timeline event whose source
        is an issue object in ``pytorch/pytorch`` carrying a ``pull_request``
        key, or ``""`` when no such event is found.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(
        timeline,
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {TOKEN}",
        },
    )
    # Surface HTTP errors (e.g. bad token, rate limit) here instead of failing
    # later while iterating an unexpected payload.
    response.raise_for_status()

    # NOTE(review): only this single response is scanned; if the endpoint
    # paginates, events on later pages are not inspected.
    for event in response.json():
        if event["event"] != "cross-referenced":
            continue
        source = event["source"]
        if source["type"] != "issue":
            continue
        issue = source["issue"]
        if issue["repository"]["full_name"] != "pytorch/pytorch":
            continue
        # Only entries that carry a "pull_request" key are pull requests.
        if "pull_request" not in issue:
            continue
        # Return the referencing PR's own title (previously an undefined
        # local `title` was returned, which resolved to an unrelated global).
        return issue["title"]

    return ""


# Build one context file per code sample of the selected group. Each file
# holds the issue title, the recorded program output, the PyTorch version,
# the issue labels, the title of the linked PR and the sample code itself.
group = "gpu"  # "operator", "syntax", "typing"

group_path = os.path.join("grouped_code", group)
out_path = os.path.join("llm_label", group)

# Make sure the destination exists before any context file is written.
os.makedirs(out_path, exist_ok=True)

with open("full_data.json", "r") as f:
    data = json.load(f)

with open("releases.json", "r") as f:
    releases = json.load(f)

# Index the issues by number once instead of scanning `data` for every file.
# `setdefault` keeps the first record per number, like a first-match scan.
issues_by_number = {}
for d in data:
    issues_by_number.setdefault(d["number"], d)

for file in os.listdir(group_path):
    if not file.endswith(".py"):
        continue

    # The part of the file name before the first "_" is the issue number.
    number = int(file.split("_")[0])

    issue = issues_by_number.get(number)
    if issue is None:
        continue

    title = issue["title"]

    # Recorded program output is optional; fall back to "N/A".
    output = "N/A"
    output_file = os.path.join(group_path, f"{number}_output.txt")
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            output = f.read()

    pr_title = get_pr(issue["timeline_url"])

    # Prefer the version line from the environment dump when one exists.
    # NOTE(review): the env file is looked up in `out_path`, not `group_path`
    # — confirm this is the intended location.
    version = None
    env_file = os.path.join(out_path, f"{number}_env.txt")
    if os.path.exists(env_file):
        with open(env_file, "r") as f:
            for line in f:
                if "PyTorch version:" in line:
                    version = line.strip()
                    break

    # Otherwise use the first release published before the issue was created.
    # NOTE(review): assumes `releases` is ordered newest-first and that the
    # timestamps compare correctly as strings — confirm.
    if version is None:
        create = issue["created_at"]
        for release in releases:
            if release["published_at"] < create:
                tag_name = release["tag_name"]
                # Drop the tag's first character (presumably a leading "v").
                version = f"PyTorch version: {tag_name[1:]}"
                break

    labels = ", ".join(str(label["name"]) for label in issue["labels"])

    with open(os.path.join(group_path, file), "r") as f:
        code = f.read()

    with open(os.path.join(out_path, f"{number}.txt"), "w") as f:
        f.write(f"# Title: {title}\n")
        f.write(f'"""\nOutput:\n{output}\n"""\n')
        f.write(f"# Version: {version}\n")
        f.write(f"# Labels: {labels}\n")
        f.write(f"# PR Title: {pr_title}\n")
        f.write(f"{code}\n")

## Text summarization

LLM generation for bug description summarization.

The following cell uses [llama.cpp](https://github.com/ggerganov/llama.cpp) and Code Llama Python 7B.

To process a different category, set `input_category` to one of the values listed in the trailing comment.

In [None]:
import os
import subprocess

# For every prepared context file of the selected category, build the full
# prompt (shared context + snippet + "# API:" cue), run llama-cli on it and
# store the captured stdout under `llm_label/llm_output`.
base_path = "llm_label"
input_category = "gpu"  # "operator", "syntax", "typing"

context = os.path.join(base_path, "context.txt")
input_path = os.path.join(base_path, input_category)

intermidiate_path = os.path.join(base_path, "full_context")
output_path = os.path.join(base_path, "llm_output")

# Both directories are written to below; create them up front so a fresh
# checkout does not fail on the first write.
for directory in (intermidiate_path, output_path):
    os.makedirs(directory, exist_ok=True)

# From here on `context` holds the shared prompt text, not its path.
with open(context, "r") as f:
    context = f.read()

for file in os.listdir(input_path):
    if not file.endswith(".txt"):
        continue

    # `{number}_env.txt` files in this directory are environment dumps read
    # by the context-preparation step, not prompts; do not send them to the LLM.
    if file.endswith("_env.txt"):
        continue

    # Skip inputs that already have a generated output.
    if os.path.exists(os.path.join(output_path, file)):
        continue

    with open(os.path.join(input_path, file), "r") as f:
        snippet = f.read()

    # Collapse every run of blank lines so the snippet contains none.
    while "\n\n" in snippet:
        snippet = snippet.replace("\n\n", "\n")

    prompt_file = os.path.join(intermidiate_path, file)
    with open(prompt_file, "w") as f:
        # The trailing "# API:" cues the model to continue with the summary.
        f.write(f"{context}\n{snippet}# API:")

    # check=True raises CalledProcessError when llama-cli exits non-zero.
    result = subprocess.run(
        [
            "llama-cli",
            "-m",
            "/path/to/model",
            "-c",
            "16000",
            "-n",
            "2048",
            "-ngl",
            "1",
            "-f",
            prompt_file,
        ],
        check=True,
        capture_output=True,
    )

    with open(os.path.join(output_path, file), "w") as f:
        f.write(result.stdout.decode("utf-8"))

## Format the output

From the generated text in `llm_label/llm_output`, we will format the output to match the concatenation component of the final context.

The formatted output for each issue will be saved in `llm_label/labeled/{group}`.

In [None]:
# Combine each generated summary with its code sample into a labeled file:
# a "# API: <summary>" header followed by the code.
# Relies on `os`, `base_path` and `input_category` from the previous cell.
group_path = os.path.join("grouped_code", input_category)
group = input_category

summary_path = os.path.join(base_path, "llm_output")
out_path = os.path.join(base_path, "labeled", group)

# Make sure the destination exists before any labeled file is written.
os.makedirs(out_path, exist_ok=True)

for file in os.listdir(group_path):
    if not file.endswith(".py"):
        continue

    # The part of the file name before the first "_" is the issue number.
    number = int(file.split("_")[0])

    labeled_file = os.path.join(out_path, f"{number}.txt")
    summary_file = os.path.join(summary_path, f"{number}.txt")

    # Skip samples that are already labeled or have no LLM output yet.
    if os.path.exists(labeled_file) or not os.path.exists(summary_file):
        continue

    with open(os.path.join(group_path, file), "r") as f:
        code = f.read()

    # The summary is the first block of non-blank lines in the LLM output;
    # it ends at the first blank line after it or at a "# API:" line.
    summaries = []
    with open(summary_file, "r") as f:
        for line in f:
            if line.startswith("# API:"):
                break
            stripped = line.strip()
            if not stripped:
                if summaries:
                    break
                # Leading blank lines are skipped rather than collected, so
                # they neither end the summary early nor appear in it.
                continue
            summaries.append(stripped)
    summary = "\n".join(summaries)

    with open(labeled_file, "w") as f:
        f.write(f"# API: {summary}\n")
        f.write(f"{code}\n")