In [1]:
import json
import re

In [2]:
# Load the API-usage dump. The top-level JSON object is keyed by API name,
# and each value is the list of recorded usages for that API.
with open("java_spring_api.json", encoding='utf-8') as f:
    raw_data = json.load(f)

len(raw_data) # Number of APIs

176

In [3]:
# Flatten the per-API usage lists once so both statistics read from the same data.
all_usage_lists = list(raw_data.values())

# One benchmark per recorded usage, summed over every API.
total_num = sum(len(usage_list) for usage_list in all_usage_lists)

# 'git_name' of every usage (duplicates kept); presumably the source
# repository of the snippet -- TODO confirm against the dataset schema.
related_files = [
    single_use['git_name']
    for usage_list in all_usage_lists
    for single_use in usage_list
]

print(total_num) # Total number of benchmarks 
print(len(set(related_files))) # Total number of unique projects

4146
853


In [4]:
def construct_how_to_use(api, code):
    """Split ``code`` around the argument list of every ``api(`` call.

    For each literal occurrence of ``api`` immediately followed by ``(``,
    one dict is produced with three keys:

    * ``context_pre``  -- everything up to and including the opening ``(``
    * ``content``      -- the text between the call's parentheses
    * ``context_post`` -- the closing ``)`` and everything after it

    The three pieces always concatenate back to ``code``. If the call's
    parentheses are never closed, ``content`` runs to the end of ``code``
    and ``context_post`` is empty.

    Limitations: parentheses are counted character by character, so ones
    inside string literals or comments are counted too. The pattern has no
    word boundary, so ``api`` may match as the tail of a longer identifier.

    Args:
        api: Literal call prefix to look for (e.g. ``"Class.method"``).
        code: Source text to scan.

    Returns:
        A list of dicts, one per occurrence, in order of appearance.
    """
    # re.escape makes special characters in ``api`` (such as '.') match literally.
    pattern = re.escape(api) + r'\('

    contents = []

    for match in re.finditer(pattern, code):
        args_start = match.end()

        # Walk forward to the parenthesis that closes this call, tracking nesting.
        depth = 1
        close_pos = None
        for pos in range(args_start, len(code)):
            char = code[pos]
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth == 0:
                    close_pos = pos
                    break

        if close_pos is None:
            # Unbalanced call: without this, the last character of ``code``
            # would be mistaken for the closing parenthesis.
            close_pos = len(code)

        contents.append({"context_pre": code[:args_start],
                         "content": code[args_start:close_pos],
                         "context_post": code[close_pos:]})

    return contents

def find_matching_parenthesis(code, start_pos):
    """Scan forward from ``start_pos`` for the ``)`` that closes an already-open ``(``.

    ``start_pos`` is expected to be the index just after an opening
    parenthesis, so the scan starts at nesting depth 1.

    Returns:
        The index one past the matching ``)`` (suitable as an exclusive
        slice end). If no matching ``)`` exists, the scan stops at the end
        of ``code`` and ``len(code)`` is returned (or ``start_pos`` itself
        when it is already beyond the end).
    """
    # How each character changes the nesting depth; anything else leaves it alone.
    depth_change = {'(': 1, ')': -1}

    depth = 1
    index = start_pos
    limit = len(code)
    while index < limit and depth > 0:
        depth += depth_change.get(code[index], 0)
        index += 1
    return index


def construct_when_to_use(api, code):
    """Split ``code`` around every complete ``api(...)`` call expression.

    For each literal occurrence of ``api`` immediately followed by ``(``,
    one dict is produced with three keys:

    * ``context_pre``  -- the code before the call, whitespace-stripped
    * ``content``      -- the call itself, from ``api`` through its closing
      ``)`` (or to the end of ``code`` if the call is never closed)
    * ``context_post`` -- the code after the call, whitespace-stripped

    Args:
        api: Literal call prefix to look for.
        code: Source text to scan.

    Returns:
        A list of dicts, one per occurrence, in order of appearance.
    """
    # re.escape makes special characters in ``api`` match literally.
    escaped_api = re.escape(api)
    pattern = rf'({escaped_api}\()'

    def _split_at(match):
        call_start = match.start(1)
        # First character after the call's opening parenthesis.
        args_start = call_start + len(api) + 1
        # Exclusive end of the call expression.
        call_end = find_matching_parenthesis(code, args_start)
        return {"context_pre": code[:call_start].strip(),
                "content": code[call_start:call_end],
                "context_post": code[call_end:].strip()}

    return [_split_at(match) for match in re.finditer(pattern, code)]



## Setting Parameters

In [7]:
# Switches read by the how-to-use construction cell when it assembles the
# text passed to construct_how_to_use.

# When True, each usage's 'comment' text is prepended to the code.
USE_COMMENT = False
# When True, trailing lines of the usage's 'left_context' are prepended.
USE_FILE_CONTEXT = False
# Number of trailing 'left_context' lines taken when USE_FILE_CONTEXT is on.
LINE_BEFORE = 10
# NOTE(review): not read by any cell visible in this notebook -- confirm whether it is still needed.
FILL_IN_THE_MIDDLE = False
# When True, "import ..." lines built from the usage's 'import_text' are prepended.
USE_IMPORT_MESSAGE = True


## Construct how to use

In [9]:
# Build the how-to-use dataset: for every API, split each recorded usage
# around the argument list of the API call, then dump everything to JSON.
total_data = []

for api_name, usage_list in raw_data.items():
    # Only the last two dotted components of the API name are searched for
    # in the code (presumably "Class.method" -- TODO confirm).
    api = '.'.join(api_name.split('.')[-2:])
    current_data = []
    for single_use in usage_list:
        comment = single_use['comment']
        file_left_context = single_use['left_context']
        # Remove the comment from the snippet so it only appears when USE_COMMENT asks for it.
        pure_code = single_use['code'].replace(comment, '').strip()
        import_list = ["import "+i for i in single_use['import_text']]
        input_data = pure_code
        if USE_COMMENT:
            input_data = comment + '\n' + input_data

        # [-0:] would select every line, so a non-positive LINE_BEFORE means "no file context".
        if USE_FILE_CONTEXT and LINE_BEFORE > 0:
            # split() returns a list; join the kept lines back into text
            # before concatenating (list + str raises TypeError).
            left_lines = file_left_context.split('\n')[-LINE_BEFORE:]
            input_data = '\n'.join(left_lines) + '\n' + input_data

        if USE_IMPORT_MESSAGE:
            input_data = '\n'.join(import_list) + "\n" + input_data
        current_data += construct_how_to_use(api, input_data)
    total_data.append({api_name:current_data})

print(len(total_data))


with open("how_to_use_function_import.json", 'w', encoding='utf-8') as f:
    json.dump(total_data, f, ensure_ascii=False, indent=4)

176


In [5]:


# Build the when-to-use dataset: for every API, split each recorded usage
# (the raw 'code' field, as stored) around the complete API call expression.
when_to_use = []

for api_name, usage_list in raw_data.items():
    # Only the last two dotted components of the API name are searched for in the code.
    api = '.'.join(api_name.split('.')[-2:])
    current_data = [
        usage
        for single_use in usage_list
        for usage in construct_when_to_use(api, single_use['code'])
    ]
    when_to_use.append({api_name:current_data})


with open("when_to_use.json", 'w', encoding='utf-8') as f:
    json.dump(when_to_use, f, ensure_ascii=False, indent=4)
