# This code is for reproducing the step 1 (generating miuse rules): 1_build_misuse_rules.py

In [18]:
# imports

from dotenv import load_dotenv
import os
import json
from openai import OpenAI #older code removed
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# load environment variables
load_dotenv()
#openai.api_key = os.getenv("OPENAI_API_KEY") # get the API key from the environment

True

In [20]:
# NEW - Use `openai.ChatCompletion.create()` format for `openai>=1.0.0`
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(3))
def completion_with_backoff(**kwargs):
    #client = openai.OpenAI()  # Create a client instance
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
    )

    return client.chat.completions.create(**kwargs)  # Updated API Call

In [21]:
#input_path = "/home/akazad/test_tool_perfapim/perfapim/dataset/perf_api_misuse_cpp_final_test.jsonl"
input_path = "/home/akazad/test_tool_perfapim/perfapim/dataset/output_794_context_enhanced_merged_latest.jsonl"
output_path = "data/cpp_fix_rules_794_context_all_latest.json"

In [None]:
# import json
# import random

# # Define input and output file paths
# input_path = "data/200_proj_891_600.json"
# output_path = "data/sample.json"

# # Load the JSON data
# with open(input_path, "r") as f:
#     data = json.load(f)

# # Sample 50 random datapoints
# sampled_data = random.sample(data, 50)

# # Write the sampled data to a new file
# with open(output_path, "w") as f:
#     json.dump(sampled_data, f, indent=4)

# print(f"Sampled 50 data points and saved to {output_path}")

Sampled 50 data points and saved to data/sample.json


In [None]:
# input_path = "data/sample.json"
# output_path = "data_azad/200_proj_891_50_output.json"

In [22]:
import json

data_dict = {}
with open(input_path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue
        data_dict[record["number"]] = record

# Now data_dict contains the records keyed by the "number" field.

In [None]:
# # read the sampled data and convert it into a dictionary
# data_dict = {}
# with open(input_path, encoding="utf-8") as f:
#     data_manual = json.load(f) # reading 50 data points from the input file
#     for line in data_manual:
#         data_dict[line["number"]] = line # creating a dictionary with the number as the key

In [23]:
print(len(data_dict))

697


In [24]:
# what the following code is doing?
data = []
for key in data_dict.keys():
    data.append(data_dict[key]) # appending the data dictionary to the data list
    # if data_dict[key]["label"] == "yes":
    #     data.append(data_dict[key])

# only keep these keys in data: number, change, commit_message, code_change_explaination
for i in range(0, len(data)): # simply reordering the keys from the list of dictionaries
    data[i] = {
        "change": data[i]["change"],
        "number": data[i]["number"],
        "context": data[i]["enhanced_context"],
        "label": "yes",
    }


print(len(data))
print(data[0].keys())

697
dict_keys(['change', 'number', 'context', 'label'])


In [25]:
template_1 = """
You are an experienced software developer focusing on performance-related API misuse fixes. Analyze the code changes below for performance improvements, possibly with additional context (if available) and identify the pattern for fixing the API method.
If there are no clear patterns, please  return "NA" in <pattern>. Make sure to follow this answer format as shown in the examples below.

EXAMPLE:
code removed:
```
while (getline(std::cin, line, '\n')) {{
```
code added:
```
std::ifstream ifs_stdin("/dev/stdin");
std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin;
while (getline(*istream_p, line, '\n')) {{
```
<pattern>: if a loop is detected that reads input using std::cin directly with getline, then refactor the code to first attempt opening "/dev/stdin" via an std::ifstream and use that stream if available; otherwise, fall back to std::cin.


EXAMPLE:
code removed:
'''
for (int i = 0; i < (render_width * render_height * 3); i++) {{
    this->buffer[i] = (float) 0;
}}
'''
code added:
'''
memset(this->buffer, (float)0, (render_width*render_height*3) * sizeof(float));
'''
<pattern>: if a loop manually zeroes out a buffer, replace it with memset(...) for better performance

EXAMPLE:
code removed:
'''
return std::regex_replace(
    typestr,
    std::regex("^(const volatile\\s+)|^(const\\s+)|^(volatile\\s+)|\\*(\\s*restrict)$"),
    ""
);
'''
code added:
'''
static std::regex re("^(const volatile\\s+)|^(const\\s+)|^(volatile\\s+)|\\*(\\s*restrict)$");
return std::regex_replace(typestr, re, "");
'''
<pattern>: if std::regex is re-constructed in every call, move std::regex object to a static variable to avoid repeated compilation overhead

QUESTION:
code removed:
'''
{removed_code}
'''
code added:
'''
{added_code}
'''

additional context:
{context}
<pattern>:
"""

In [None]:
# # # the prompt template to extract misuse rules


# template_1 = """
# You are an experienced software developer. Please identify the pattern for Fixing API method problem in the following code change.
# If there are no clear patterns, please  return "NA" in <pattern>.

# EXAMPLE:
# code removed:
# '''
# attention_scores = F.normalize(query_layer, dim=-1) @ F.normalize(key_layer, dim=-1).transpose(-2, -1)
# '''
# code added:
# '''
# attention_scores = nn.functional.normalize(query_layer, dim=-1) @ nn.functional.normalize(
# key_layer, dim=-1
# ).transpose(-2, -1)
# '''
# <pattern>: if deprecated API F.normalize( detected, replace with nn.functional.normalize(

# EXAMPLE:
# code removed:
# '''
# bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens)",
# cand_offsets = torch.arange(0, cand_size).type_as(tokens)",
# '''
# code added:
# '''
# bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens).to(src_tokens.device)",
# cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device)",,
# '''
# <pattern>: if an offset tensor is detected without .to(), add .to(device) to the end of the API call


# QUESTION:
# code removed:
# '''
# {removed_code}
# '''
# code added:
# '''
# {added_code}
# '''
# <pattern>:"""



In [26]:
print(len(data))

697


In [27]:
# 48
for i in range(0, len(data)):
    print("current_index:", i, "/", len(data))
    change = ""
    code_before_change = ""
    code_after_change = ""
    context = ""
    added_code = ""
    removed_code = ""

    # you can check one input and output
    for j in range(0, len(data[i]["change"])):
        line = data[i]["change"][j]
        change += "{}\n".format(data[i]["change"][j])
        if line.startswith("+"):
            code_after_change += "{}\n".format(data[i]["change"][j][1:])
            added_code += "{}\n".format(data[i]["change"][j][1:])
        elif line.startswith("-"):
            code_before_change += "{}\n".format(data[i]["change"][j][1:])
            removed_code += "{}\n".format(data[i]["change"][j][1:])
        else:
            context += "{}\n".format(data[i]["change"][j])
            code_before_change += "{}\n".format(data[i]["change"][j])
            code_after_change += "{}\n".format(data[i]["change"][j])


    number = data[i]["number"]
    label = data[i]["label"]
    add_context = data[i]["context"]
    if label == "yes":
        print("number", number)
        print("change", len(change))

        
        # pass the code change to the template
        prompt_1 = template_1.format(
            added_code=added_code, removed_code=removed_code, context=add_context
        )
        print("prompt_1", len(prompt_1))
        print(prompt_1)

        # apply llm
        fix_pattern = completion_with_backoff(
            model="gpt-4o-mini",
            messages=[
                {"role": "user", "content": prompt_1}
            ]
        )

        fix_pattern = fix_pattern.choices[0].message.content

        output = {
            "number": number,
            "change": change,
            "fix_pattern": fix_pattern,
        }

        with open(output_path, 'a') as f:
            json.dump(output, f)
            f.write(os.linesep)


current_index: 0 / 697
number 1
change 661
prompt_1 2278

You are an experienced software developer focusing on performance-related API misuse fixes. Analyze the code changes below for performance improvements, possibly with additional context (if available) and identify the pattern for fixing the API method.
If there are no clear patterns, please  return "NA" in <pattern>. Make sure to follow this answer format as shown in the examples below.

EXAMPLE:
code removed:
```
while (getline(std::cin, line, '
')) {
```
code added:
```
std::ifstream ifs_stdin("/dev/stdin");
std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin;
while (getline(*istream_p, line, '
')) {
```
<pattern>: if a loop is detected that reads input using std::cin directly with getline, then refactor the code to first attempt opening "/dev/stdin" via an std::ifstream and use that stream if available; otherwise, fall back to std::cin.


EXAMPLE:
code removed:
'''
for (int i = 0; i < (render_width * render

# Verification Step

In [10]:
import pandas as pd
pd_test = pd.read_json("data/cpp_fix_rules_test_146.json", lines=True)

In [11]:
pd_test


Unnamed: 0,number,change,fix_pattern
0,6,"@@ -1058,7 +1058,7 @@ Bool_t TXMLPlayer::Produ...",<pattern>: if using strncat for string concate...
1,7,"@@ -60,10 +60,11 @@ void TIndexTable::Dictiona...",<pattern>: if dynamically allocating a buffer ...
2,15,"@@ -46,7 +46,7 @@ void uThread::destory(bool f...",<pattern>: if memory was previously freed usin...
3,23,"@@ -463,7 +463,7 @@ bool XdgDesktopFileData::s...",<pattern>: NA
4,38,"@@ -2006,7 +2006,7 @@ void MainWindow::saveCam...",<pattern>: if a QFileInfo object is created ju...
...,...,...,...
141,1094,"@@ -117,6 +117,11 @@ x_client_thumbnail::updat...",<pattern>: if OpenGL scissor test setup is per...
142,1099,"@@ -2207,7 +2207,10 @@ GLDrawContext_New(TQ3Vi...",<pattern>: if a call to glClear is made with p...
143,1105,"@@ -359,6 +359,22 @@ int main(int argc, char *...",<pattern>: if the code checks for a specific c...
144,1118,"@@ -95,9 +95,9 @@ KAboutPerson KAboutPerson::f...",<pattern>: if accessing QJsonObject values usi...


In [13]:
na_count = pd_test['fix_pattern'].str.contains("NA").sum()

In [14]:
na_count

93