# Setup OpenAI

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import gpt_interface
import random
import bz2
import json
from tqdm import auto as tqdm
import logging
logging.basicConfig(level=logging.CRITICAL)  # only CRITICAL records get through, silencing routine log output
mode = 'openai'  # forwarded to gpt_interface.setup_openai and, later, to every query_openai call
k_shot = 5  # number of in-context examples; also used to name the output folders
secrets = gpt_interface.setup_openai('secrets.json', mode=mode)
# Load the annotated inquiries. Each record is read below through the keys
# 'query', 'query_id', 'api_name' and 'Docstring'.
with bz2.open('API_inquiry_annotate.json.bz2', 'rt') as f:
    data = json.load(f)

# Load the query ids that define the held-out splits (keys 'test' and 'val').
with open("API_instruction_testval_query_ids.json", 'r') as file:
    files_ids = json.load(file)

# Each split is a list of {'query', 'gold'} dicts; 'gold' is the expected API name.
test = [dict(query=row['query'], gold=row['api_name']) for row in [i for i in data if i['query_id'] in files_ids['test']]]
val = [dict(query=row['query'], gold=row['api_name']) for row in [i for i in data if i['query_id'] in files_ids['val']]]
print(len(test))
print(len(val))
# Pool of few-shot examples: every record that is in neither the val nor the
# test split, shuffled with a fixed seed so the order is reproducible.
shuffled = [dict(query=row['query'], gold=row['api_name']) for row in [i for i in data if i['query_id'] not in files_ids['val'] and i['query_id'] not in files_ids['test']]]
random.Random(0).shuffle(shuffled)
print(len(shuffled))
print(len(data))
# Sanity check: the three splits partition the data (no record lost or counted twice).
assert len(data)==len(test)+len(val)+len(shuffled)
# First k_shot examples of the shuffled pool.
# NOTE(review): `train` does not appear to be used by the cells visible here -- confirm before removing.
train = shuffled[:k_shot]
# One short description per API name.
import re, os
from string import punctuation
# Splits on the first run of punctuation characters or newlines, so the "summary"
# is the docstring text up to the first punctuation mark of any kind (not only a
# sentence-ending period) or the first line break.
end_of_docstring_summary = re.compile(r'[{}\n]+'.format(re.escape(punctuation)))
# Keyed by api_name, so records sharing an API collapse to one entry (the last one wins).
all_apis = {x['api_name']: end_of_docstring_summary.split(x['Docstring'])[0].strip() for x in data}
all_apis = list(all_apis.items())  # list of (api_name, summary) tuples
all_apis_json = {i[0]:i[1] for i in all_apis}  # same content again as a dict: api_name -> summary
print(len(all_apis), len(all_apis_json))

182
361
1443
1986
182 182


In [151]:
from sentence_transformers import SentenceTransformer, util
class ToolRetriever:
    """Embedding-based retriever over an API corpus and a pool of example queries.

    On construction it reads a TSV corpus, embeds every compressed API
    document with a SentenceTransformer model, and embeds the queries of
    `shuffled_data` so that similar few-shot examples can be looked up later.

    Relies on three module-level names that are resolved at call time and must
    exist before the class is instantiated: `pd` (pandas), `device` (the
    device string passed to SentenceTransformer) and
    `process_retrieval_document_query_version`.

    Args:
        corpus_tsv_path: Path to a tab-separated corpus file.
        model_path: Model name or path handed to SentenceTransformer.
        shuffled_data: List of dicts with 'query' and 'gold' keys used as the
            few-shot example pool. Defaults to an empty pool.
    """
    def __init__(self, corpus_tsv_path = "", model_path="",shuffled_data=None):
        # Use a fresh list per instance: the former `[]` default was a single
        # list object shared by every instance created without the argument.
        if shuffled_data is None:
            shuffled_data = []
        self.build_retrieval_corpus(corpus_tsv_path, model_path,shuffled_data)
    def build_retrieval_corpus(self, corpus_tsv_path, model_path,shuffled_data):
        """Load the corpus, build the embedder and pre-compute all embeddings."""
        print("Building corpus...")
        self.corpus_tsv_path = corpus_tsv_path
        self.model_path = model_path
        documents_df = pd.read_csv(self.corpus_tsv_path, sep='\t')
        corpus, self.corpus2tool = process_retrieval_document_query_version(documents_df)
        # Flatten {docid: text} to a list; a hit's 'corpus_id' is a position in this list.
        corpus_ids = list(corpus.keys())
        self.corpus = [corpus[cid] for cid in corpus_ids]
        self.embedder = SentenceTransformer(self.model_path, device=device)
        self.corpus_embeddings = self.embedder.encode(self.corpus, convert_to_tensor=True)
        # Keep the example pool next to its embeddings: positions in
        # `shuffled_query_embeddings` index into `shuffled_data`.
        self.shuffled_data = shuffled_data
        self.shuffled_queries = [item['query'] for item in shuffled_data]
        self.shuffled_query_embeddings = self.embedder.encode(self.shuffled_queries, convert_to_tensor=True)
    def retrieving(self, query, top_k):
        """Return the API names of the `top_k` corpus documents closest to `query`."""
        query_embedding = self.embedder.encode(query, convert_to_tensor=True)
        hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=top_k, score_function=util.cos_sim)
        # hits[0] holds the results for the single query; map each hit back
        # from its compressed document text to the API name.
        retrieved_apis = [self.corpus2tool[self.corpus[hit['corpus_id']]] for hit in hits[0]]
        return retrieved_apis[:top_k]
    def retrieve_similar_queries(self, query, shot_k=5):
        """Return the `shot_k` most similar pool examples as one prompt fragment.

        Each example is rendered as "\\nInstruction: <query>\\nFunction: <gold>"
        and the fragments are concatenated without a separator.
        """
        query_embedding = self.embedder.encode(query, convert_to_tensor=True)
        hits = util.semantic_search(query_embedding, self.shuffled_query_embeddings, top_k=shot_k, score_function=util.cos_sim)
        similar_queries = []
        for hit in hits[0]:
            example = self.shuffled_data[hit['corpus_id']]
            similar_queries.append("\nInstruction: " + example['query'] + "\nFunction: " + example['gold'])
        return ''.join(similar_queries)

def process_retrieval_document_query_version(documents_df):
    """Build the retrieval corpus and its text-to-API lookup from a corpus table.

    Args:
        documents_df: DataFrame with a `docid` column and a `document_content`
            column holding one JSON-encoded API description per row.

    Returns:
        A tuple `(ir_corpus, corpus2tool)`:
        `ir_corpus` maps docid to the compressed API text;
        `corpus2tool` maps that compressed text to the API name (the part of
        the first `api_calling` entry before the opening parenthesis).

    Note:
        `corpus2tool` is keyed by the compressed text, so two documents that
        compress to the same string share one entry (the later row wins).
    """
    ir_corpus = {}
    corpus2tool = {}
    for row in documents_df.itertuples():
        doc = json.loads(row.document_content)
        # Compress once and reuse the result for both mappings.
        compressed = compress_api_str_from_list_query_version(doc)
        ir_corpus[row.docid] = compressed
        corpus2tool[compressed] = doc['api_calling'][0].split('(')[0]
    return ir_corpus, corpus2tool

def compress_api_str_from_list_query_version(api):
    """Render one API description dict as a single-line retrieval document.

    The result contains, in order: the API name (the first `api_calling`
    entry up to the opening parenthesis), the first line of
    `api_description`, and the JSON dumps of `required_parameters`,
    `optional_parameters` and `Returns`.
    """
    # Text before the first '(' of the first call signature.
    name, _, _ = api['api_calling'][0].partition('(')
    # Only the first line of the description is kept.
    summary, _, _ = api['api_description'].partition('\n')
    required, optional, returns = (
        json.dumps(api[field])
        for field in ('required_parameters', 'optional_parameters', 'Returns')
    )
    return f"{name}, {summary}, required_params: {required}, optional_params: {optional}, return_schema: {returns}"

def load_errors(fname):
    """Return the incorrectly answered examples of a saved run, in a fixed shuffled order.

    Reads the JSON list stored at `fname`, keeps the entries whose 'correct'
    value is falsy, and shuffles them with a seed of 0 so repeated calls
    return the same order.
    """
    with open(fname, 'rt') as handle:
        records = json.load(handle)
    mistakes = list(filter(lambda record: not record['correct'], records))
    random.Random(0).shuffle(mistakes)
    return mistakes

#load_errors('5-shot-generate/gpt-4.json')[:5]

import pandas as pd
# Device string read by ToolRetriever when it builds the SentenceTransformer.
# NOTE(review): 'cuda:0' presumably requires a CUDA-capable GPU -- confirm before running elsewhere.
device = 'cuda:0'
# NOTE(review): both ToolRetriever calls below use absolute, machine-specific
# paths; the commented-out one is the same call for a different machine.
# Consider deriving them from a configurable base directory.
#retriever = ToolRetriever(corpus_tsv_path="/Users/doradong/Repo/2023_yanglu_biochatbot/src/data/standard_process/scanpy/retriever_train_data/corpus.tsv", model_path="/Users/doradong/hugging_models/retrievers/",shuffled_data=shuffled)
retriever = ToolRetriever(corpus_tsv_path="/home/z6dong/BioChat/refer/src/2023_yanglu_biochatbot/src/data/standard_process/scanpy/retriever_train_data/corpus.tsv", model_path="/home/z6dong/BioChat/hugging_models/retriever_model_finetuned/scanpy/assigned/",shuffled_data=shuffled)

Building corpus...


In [None]:
mode_index = 'similarseed' # 'similarseed' or 'randomseed'
# How the few-shot examples are chosen. 'similarseed' retrieves the example
# queries most similar to the input query; 'randomseed' samples them at random.
# Author's note: the similar examples tend to all belong to one API, because
# queries written for the same API are very similar to each other.
oracle_index = 'noncorrected' # 'noncorrected' or 'corrected'
# Whether to correct the retrieved API list. (The retrieved API list is the
# candidate list shown to GPT; it is separate from the retrieved example queries.)
# In 'corrected' mode, if the ground-truth API is missing from the list, it is
# inserted at the front and the last retrieved API is dropped.
retrieved_index = 'retrieved' # 'nonretrieved' or 'retrieved'
# 'retrieved' shows GPT only the retriever's filtered candidate list;
# 'nonretrieved' shows the whole API list.


# Query to API selection

## K-shot

Here, GPT does not see a candidate list of APIs; it only sees the few-shot examples and has to name the correct function from memory.

In [None]:
# Prompt template for the K-shot "generate" setting. It has two placeholders:
# {similar_queries} (the few-shot examples) and {query} (the instruction to answer).
prompt = """
Task: name the function from the ScanPy library that should be used for the instruction. Only use function whose names start with scanpy. Do not give arguments.

{similar_queries}

Instruction: {query}
Function: 
"""
# Remove the newlines the triple-quoted literal adds at both ends; the trailing
# space after "Function:" is kept because only '\n' is stripped.
prompt = prompt.strip('\n')
print(prompt)

def run_gpt(test, gpt_model, prompt, dout, mode, max_tokens=20,title=""):
    """Query GPT for every example in `test` (no candidate list) and save the results.

    Each example dict is updated in place with 'pred' (the cleaned model
    answer) and 'correct' (whether it equals 'gold'). After the loop the
    whole list is written to `<dout>/<title>.json`.

    Reads the notebook globals `mode_index`, `retriever` and `shuffled` to
    pick the few-shot examples.

    Args:
        test: List of dicts with 'query' and 'gold' keys.
        gpt_model: Model name passed to gpt_interface.query_openai.
        prompt: Template with {query} and {similar_queries} placeholders.
        dout: Existing directory for the output file.
        mode: Passed through to gpt_interface.query_openai.
        max_tokens: Completion length limit.
        title: Output file name without the '.json' extension.

    Raises:
        NotImplementedError: If `mode_index` is neither 'similarseed' nor 'randomseed'.
    """
    outcomes = []
    progress = tqdm.tqdm(test)
    for example in progress:
        if mode_index == 'similarseed':
            shots = retriever.retrieve_similar_queries(example['query'], shot_k=5)
        elif mode_index == 'randomseed':
            shots = "".join(
                "\nInstruction: " + shot['query'] + "\nFunction: " + shot['gold']
                for shot in random.sample(shuffled, 5)
            )
        else:
            raise NotImplementedError
        filled_prompt = prompt.format(query=example['query'], similar_queries=shots)
        print(filled_prompt)
        answer = gpt_interface.query_openai(filled_prompt, mode=mode, model=gpt_model, max_tokens=max_tokens)
        # Keep only the first candidate when GPT answers "a, b", "a(...)" or "a or b".
        for separator in (',', '(', ' or '):
            answer = answer.split(separator)[0]
        example['pred'] = answer.strip()
        is_correct = example['pred'] == example['gold']
        example['correct'] = is_correct
        outcomes.append(is_correct)
        # Running accuracy over the examples seen so far.
        progress.set_description('correct: {}'.format(sum(outcomes)/len(outcomes)))
    out_path = os.path.join(dout, '{}.json'.format(title))
    with open(out_path, 'wt') as f:
        json.dump(test, f, indent=2)


In [5]:
import os
# Output directory for the K-shot "generate" runs; the name embeds k_shot.
folder_name = "{}-shot-generate".format(k_shot)
os.makedirs(folder_name, exist_ok=True)

In [None]:
# K-shot generation on the test split with GPT-3.5.
# `title` names the output JSON; it is set here so the cell does not depend on
# a value left behind by some other cell. The name follows the convention the
# results-aggregation cells parse: model name first, '-trainsample' for val runs.
title = 'gpt-3.5-turbo-16k'
run_gpt(test, 'gpt-3.5-turbo-16k', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)

In [99]:
# K-shot generation on the test split with GPT-4.
# `title` names the output JSON; it is set here so the cell does not depend on
# a value left behind by some other cell. The resulting file name matches the
# '5-shot-generate/gpt-4.json' path used by the commented-out load_errors call.
title = 'gpt-4'
run_gpt(test, 'gpt-4', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)

'correct: 0.4065934065934066: 100%|█████████████████████████████| 182/182 [04:34<00:00,  1.51s/it]'

In [None]:
# K-shot generation on the `val` split with GPT-3.5; the '-trainsample' suffix
# marks val runs in the output file name.
title = f'gpt-3.5-turbo-16k-trainsample'
run_gpt(val, 'gpt-3.5-turbo-16k', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)

In [None]:
# K-shot generation on the `val` split with GPT-4; the '-trainsample' suffix
# marks val runs in the output file name.
title = f'gpt-4-trainsample'
run_gpt(val, 'gpt-4', prompt, '{}-shot-generate'.format(k_shot), mode,title=title)

# Classification
Here, GPT sees a list of candidate APIs with their one-line descriptions (the retriever's top-k, or every API in `nonretrieved` mode) and tries to pick out the correct one.

In [37]:
# Prompt template for the classification setting. This rebinds `prompt`, so
# cells below this point no longer see the K-shot "generate" template.
# Placeholders: {retrieved_apis} (candidate list), {similar_queries}
# (few-shot examples) and {query} (the instruction to answer).
prompt = """Task: choose one of the following functions to use for the instruction."""
prompt += """\n{retrieved_apis}"""
prompt+="""\n{similar_queries}"""
prompt += """
Instruction: {query}
Function: 
"""
# Remove the trailing newline; the space after "Function:" is kept because only '\n' is stripped.
prompt = prompt.strip('\n')
print(prompt)

def run_gpt_new(test, gpt_model, prompt, dout, mode, max_tokens=20,top_k=3,title=""):
    """Ask GPT to pick an API from a candidate list for every example and save the results.

    Each example dict is updated in place with 'pred' (the cleaned model
    answer), 'correct' (whether it equals 'gold') and 'retrieved_apis' (the
    retriever's top-k list, after oracle correction if enabled). After the
    loop the whole list is written to `<dout>/<title>.json`.

    Reads the notebook globals `mode_index`, `retrieved_index`,
    `oracle_index`, `retriever`, `shuffled` and `all_apis_json`.

    Args:
        test: List of dicts with 'query' and 'gold' keys.
        gpt_model: Model name passed to gpt_interface.query_openai.
        prompt: Template with {query}, {retrieved_apis} and {similar_queries} placeholders.
        dout: Existing directory for the output file.
        mode: Passed through to gpt_interface.query_openai.
        max_tokens: Completion length limit.
        top_k: Number of candidate APIs requested from the retriever.
        title: Output file name without the '.json' extension.

    Raises:
        NotImplementedError: If `mode_index`, `retrieved_index` or
            `oracle_index` holds an unsupported value.
        KeyError: If a candidate API has no entry in `all_apis_json`.
    """
    correct = []
    for ex in (pbar := tqdm.tqdm(test)):
        # Retrieve once per example. The list is stored with the result in
        # every mode, including 'nonretrieved' where the prompt lists all APIs.
        retrieved_api_list = retriever.retrieving(ex['query'], top_k=top_k)
        if mode_index == 'similarseed':
            similar_queries = retriever.retrieve_similar_queries(ex['query'],shot_k=5)
        elif mode_index == 'randomseed':
            sampled_shuffled = random.sample(shuffled, 5)
            similar_queries = "".join("\nInstruction: " + shot['query'] + "\nFunction: " + shot['gold'] for shot in sampled_shuffled)
        else:
            raise NotImplementedError("unsupported mode_index: {!r}".format(mode_index))

        if retrieved_index=='retrieved':
            if oracle_index=='corrected':
                # Oracle mode: guarantee the gold API is among the candidates by
                # putting it first and dropping the lowest-ranked retrieved API.
                if ex['gold'] not in retrieved_api_list:
                    retrieved_api_list = [ex['gold']] + retrieved_api_list[:-1]
                assert ex['gold'] in retrieved_api_list
            elif oracle_index!='noncorrected':
                raise NotImplementedError("unsupported oracle_index: {!r}".format(oracle_index))
            candidate_names = retrieved_api_list
        elif retrieved_index=='nonretrieved':
            candidate_names = all_apis_json
        else:
            # NotImplemented is a constant, not an exception class; raising it
            # would surface as a TypeError instead of the intended error.
            raise NotImplementedError("unsupported retrieved_index: {!r}".format(retrieved_index))
        # One "name:description" line per candidate.
        retrieved_apis = "".join(api+":"+all_apis_json[api]+"\n" for api in candidate_names)
        print('--asking--')
        p = gpt_interface.query_openai(prompt.format(query=ex['query'],retrieved_apis=retrieved_apis,similar_queries=similar_queries), mode=mode, model=gpt_model, max_tokens=max_tokens)
        print('--done--')
        # Keep only the first candidate when GPT answers "a, b", "a(...)" or "a or b".
        p = p.split(',')[0]
        p = p.split('(')[0]
        p = p.split(' or ')[0]
        p = p.strip()
        ex['pred'] = p
        ex['correct'] = c = ex['pred'] == ex['gold']
        ex['retrieved_apis'] = retrieved_api_list
        correct.append(c)
        # Running accuracy over the examples seen so far.
        pbar.set_description('correct: {}'.format(sum(correct)/len(correct)))
    with open(os.path.join(dout, '{}.json'.format(title)), 'wt') as f:
        json.dump(test, f, indent=2)


In [None]:
import os
# Output directory for the classification runs; the name embeds k_shot.
folder_name = "{}-shot-classify".format(k_shot)
os.makedirs(folder_name, exist_ok=True)

In [104]:
# Classification on the test split: GPT-3.5, 3 retrieved candidates.
top_k = 3
title = f'gpt-3.5-turbo-16k-topk-{top_k}'
run_gpt_new(test, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)


'correct: 0.6813186813186813: 100%|█████████████████████████████| 182/182 [05:42<00:00,  1.88s/it]'

In [118]:
# Classification on the test split: GPT-3.5, 5 retrieved candidates.
top_k = 5
title = f'gpt-3.5-turbo-16k-topk-{top_k}'
run_gpt_new(test, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)


'correct: 0.6593406593406593: 100%|██| 182/182 [01:52<00:00,  1.62it/s]'

In [119]:
# Classification on the test split: GPT-3.5, 10 retrieved candidates.
top_k = 10
title = f'gpt-3.5-turbo-16k-topk-{top_k}'
run_gpt_new(test, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)


'correct: 0.6703296703296703: 100%|██| 182/182 [13:39<00:00,  4.50s/it]'

In [107]:
# Classification on the test split: GPT-4, 3 retrieved candidates.
top_k = 3
title = f'gpt-4-topk-{top_k}'
run_gpt_new(test, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)


'correct: 0.7802197802197802: 100%|█| 182/182 [10:15<00:00,  3.3'

In [108]:
# Classification on the test split: GPT-4, 5 retrieved candidates.
top_k = 5
title = f'gpt-4-topk-{top_k}'
run_gpt_new(test, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.7912087912087912: 100%|██| 182/182 [10:36<00:00,  3.50s/it]'

In [109]:
# Classification on the test split: GPT-4, 10 retrieved candidates.
top_k = 10
title = f'gpt-4-topk-{top_k}'
run_gpt_new(test, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.7857142857142857: 100%|█| 182/182 [07:02<00:00,  2.32s/it]'

In [110]:
# Classification on the `val` split ('-trainsample' suffix): GPT-3.5, 3 retrieved candidates.
top_k = 3
title = f'gpt-3.5-turbo-16k-topk-{top_k}-trainsample'
run_gpt_new(val, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.7582417582417582: 100%|█| 182/182 [04:17<00:00,  1.4'

In [111]:
# Classification on the `val` split ('-trainsample' suffix): GPT-3.5, 5 retrieved candidates.
top_k = 5
title = f'gpt-3.5-turbo-16k-topk-{top_k}-trainsample'
run_gpt_new(val, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.7527472527472527: 100%|██| 182/182 [04:18<00:00,  1.42s/it]'

In [115]:
# Classification on the `val` split ('-trainsample' suffix): GPT-3.5, 10 retrieved candidates.
top_k = 10
title = f'gpt-3.5-turbo-16k-topk-{top_k}-trainsample'
run_gpt_new(val, 'gpt-3.5-turbo-16k', prompt, '{}-shot-classify'.format(k_shot), mode,top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.7582417582417582: 100%|██| 182/182 [02:24<00:00,  1.26it/s]'

In [116]:
# Classification on the `val` split ('-trainsample' suffix): GPT-4, 3 retrieved candidates.
top_k = 3
title = f'gpt-4-topk-{top_k}-trainsample'
run_gpt_new(val, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.8516483516483516: 100%|█| 182/182 [06:41<00:00,  2.2'

In [None]:
# Classification on the `val` split ('-trainsample' suffix): GPT-4, 5 retrieved candidates.
top_k = 5
title = f'gpt-4-topk-{top_k}-trainsample'
run_gpt_new(val, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

In [117]:
# Classification on the `val` split ('-trainsample' suffix): GPT-4, 10 retrieved candidates.
top_k = 10
title = f'gpt-4-topk-{top_k}-trainsample'
run_gpt_new(val, 'gpt-4', prompt, '{}-shot-classify'.format(k_shot), mode, top_k=top_k,title=title)
# Uncomment to inspect the first few mistakes of this run:
#load_errors(f'5-shot-classify/{title}.json')[:5]

'correct: 0.8516483516483516: 100%|██| 182/182 [07:21<00:00,  2.42s/it]'

In [64]:
# For accuracy without ambiguous pair
from collections import defaultdict
# Load the API metadata table (a dict keyed by API name; each entry has an 'api_type').
with open("./API_composite.json", "r") as file:
    api_composite_data = json.load(file)

# Keep only entries that are not classes.
api_composite_data = {key:api_composite_data[key] for key in api_composite_data if api_composite_data[key]['api_type']!='class'}

# 1: description
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
def find_similar_api_pairs(api_descriptions, threshold=0.999):
    """Find pairs of APIs whose descriptions are (near-)identical under TF-IDF.

    Args:
        api_descriptions: Dict mapping API name to its description text.
        threshold: A pair is reported when its similarity is strictly greater
            than this value. The default, 0.999, keeps only descriptions that
            are effectively identical once English stop words are removed.

    Returns:
        List of `(name_a, name_b)` tuples, where `name_a` precedes `name_b`
        in the dict's iteration order. Empty when there are fewer than two
        descriptions.
    """
    descriptions = list(api_descriptions.values())
    api_names = list(api_descriptions.keys())
    # Fewer than two entries cannot form a pair. Returning early also avoids
    # fitting the vectorizer on an empty corpus, which raises ValueError.
    if len(api_names) < 2:
        return []
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    # TF-IDF rows are L2-normalised by default, so the plain dot product
    # computed by linear_kernel is the cosine similarity.
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
    # Upper triangle only: each unordered pair is reported once.
    return [
        (api_names[i], api_names[j])
        for i in range(len(api_names))
        for j in range(i + 1, len(api_names))
        if cosine_similarities[i, j] > threshold
    ]

# Criterion 1: pairs of APIs whose truncated docstring summaries are
# near-identical under TF-IDF.
similar_api_pairs = find_similar_api_pairs(all_apis_json)

# Criterion 2: APIs that share the same final dotted component, i.e. the same
# function name reachable under different module paths.
require_same_depth=False  # when True, names must also have the same number of dotted parts
api_list = list(api_composite_data.keys())
groups = defaultdict(list)
for api in api_list:
    parts = api.split('.')
    if require_same_depth:
        key = (parts[-1], len(parts))
    else:
        key = parts[-1]
    groups[key].append(api)
# Despite the name, each element is a group (list) of two or more APIs, not a pair.
similar_pairs = [group for group in groups.values() if len(group) > 1]# Filter out groups that only contain 1 API (no similar pairs).
#for pair in similar_pairs:
#    print(pair)

list_1 = similar_api_pairs
list_2 = similar_pairs
# Expand every group into all of its unordered 2-combinations.
pairs_from_list_2 = [(apis[i], apis[j]) for apis in list_2 for i in range(len(apis)) for j in range(i+1, len(apis))]
print(len(list_1), len(list_2), len(pairs_from_list_2))
# Union of both criteria. Tuples are ordered, so (a, b) and (b, a) would be
# kept as two entries if the two criteria emit them in opposite orders.
merged_pairs = list(set(list_1 + pairs_from_list_2))
#merged_pairs = list_2
len(merged_pairs)

52 23 43


71

In [89]:
import glob
import pandas as pd
import os
import json
import re

results = []

def is_pair_in_merged_pairs(gold, pred, merged_pairs):
    """Return True when `gold` and `pred` form a pair in `merged_pairs`, in either order."""
    return any(candidate in merged_pairs for candidate in ((gold, pred), (pred, gold)))

# Every API that appears in at least one ambiguous pair (only needed by the
# commented-out gold-based filter inside the loop).
all_apis_from_pairs = set(api for pair in merged_pairs for api in pair)

# One summary row per saved run of the similar-seed experiments.
for fname in glob.glob('bak_similar5seed_example/*/*.json'):
    with open(fname) as f:
        res = json.load(f)
    original_correct = [ex['correct'] for ex in res]
    original_c = [i for i in original_correct if i]
    original_accuracy = sum(original_correct) / len(original_correct) if res else 0
    # Drop the examples whose (gold, pred) is a known ambiguous pair, then
    # recompute the accuracy on what remains.
    #filtered_res = [item for item in res if item['gold'] not in all_apis_from_pairs]
    filtered_res = [item for item in res if not is_pair_in_merged_pairs(item['gold'], item['pred'], merged_pairs)]

    filtered_correct = [ex['correct'] for ex in filtered_res]
    filtered_c = [i for i in filtered_correct if i]
    filtered_accuracy = sum(filtered_correct) / len(filtered_res) if filtered_res else 0
    parent_dir = os.path.dirname(fname)
    base_name = os.path.basename(fname)
    run_name = base_name.replace('.json', '')
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'-topk-(\d+)', base_name)
    # Runs without a '-topk-N' part in the file name get '-' as a placeholder.
    top_k = int(match.group(1)) if match else '-'
    # The model and the split are recovered from the file-name convention.
    if run_name.startswith('gpt-4'):
        model_name = "gpt-4"
    else:
        model_name = "gpt-3.5-turbo-16k"
    if run_name.endswith('trainsample'):
        test_val = 'synthetic'
    else:
        test_val = 'human annotate'
    results.append(dict(
        task=parent_dir,
        model_name=model_name,
        accuracy=original_accuracy,
        total=len(res),
        #total_c = len(original_c),
        filtered_accuracy=filtered_accuracy,
        #filtered_c = len(filtered_c),
        filter_total=len(filtered_res),
        top_k=top_k,
        test_val=test_val,
    ))
results = pd.DataFrame(results)
results = results.sort_values(by=['task', 'model_name', 'top_k','test_val'])
results


Unnamed: 0,task,model_name,accuracy,total,filtered_accuracy,filter_total,top_k,test_val
2,bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.884615,182,0.925287,174,3,human annotate
5,bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.878116,361,0.921512,344,3,synthetic
6,bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.884615,182,0.92,175,5,human annotate
11,bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.875346,361,0.913295,346,5,synthetic
4,bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.868132,182,0.913295,173,10,human annotate
8,bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.894737,361,0.936232,345,10,synthetic
7,bak_similar5seed_example/5-shot-classify,gpt-4,0.89011,182,0.952941,170,3,human annotate
9,bak_similar5seed_example/5-shot-classify,gpt-4,0.916898,361,0.970674,341,3,synthetic
3,bak_similar5seed_example/5-shot-classify,gpt-4,0.879121,182,0.941176,170,5,human annotate
13,bak_similar5seed_example/5-shot-classify,gpt-4,0.908587,361,0.967552,339,5,synthetic


In [90]:
# Plain-text CSV dump of the summary table, without the index column
# (presumably for copy-pasting elsewhere).
csv_text = results.to_csv(index=False)
print(csv_text)

task,model_name,accuracy,total,filtered_accuracy,filter_total,top_k,test_val
bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.8846153846153846,182,0.9252873563218391,174,3,human annotate
bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.8781163434903048,361,0.9215116279069767,344,3,synthetic
bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.8846153846153846,182,0.92,175,5,human annotate
bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.8753462603878116,361,0.9132947976878613,346,5,synthetic
bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.8681318681318682,182,0.9132947976878613,173,10,human annotate
bak_similar5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.8947368421052632,361,0.936231884057971,345,10,synthetic
bak_similar5seed_example/5-shot-classify,gpt-4,0.8901098901098901,182,0.9529411764705882,170,3,human annotate
bak_similar5seed_example/5-shot-classify,gpt-4,0.9168975069252078,361,0.9706744868035191,341,3,synthetic
bak_

In [91]:
# Sample code for the user's requirement
import glob
import pandas as pd
import os
import json
import re

results = []

def is_pair_in_merged_pairs(gold, pred, merged_pairs):
    """Tell whether `merged_pairs` contains the pair (gold, pred) in either orientation."""
    if (gold, pred) in merged_pairs:
        return True
    return (pred, gold) in merged_pairs

# Collect every API that appears in at least one pair of merged_pairs (only
# needed by the commented-out gold-based filter inside the loop).
all_apis_from_pairs = set(api for pair in merged_pairs for api in pair)

# One summary row per saved run of the random-seed experiments.
for fname in glob.glob('bak_random5seed_example/*/*.json'):
    with open(fname) as f:
        res = json.load(f)
    original_correct = [ex['correct'] for ex in res]
    original_c = [i for i in original_correct if i]
    original_accuracy = sum(original_correct) / len(original_correct) if res else 0
    # Drop the examples whose (gold, pred) is a known ambiguous pair, then
    # recompute the accuracy on what remains.
    #filtered_res = [item for item in res if item['gold'] not in all_apis_from_pairs]
    filtered_res = [item for item in res if not is_pair_in_merged_pairs(item['gold'], item['pred'], merged_pairs)]

    filtered_correct = [ex['correct'] for ex in filtered_res]
    filtered_c = [i for i in filtered_correct if i]
    filtered_accuracy = sum(filtered_correct) / len(filtered_res) if filtered_res else 0
    parent_dir = os.path.dirname(fname)  # parent directory of the result file
    base_name = os.path.basename(fname)
    run_name = base_name.replace('.json', '')
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'-topk-(\d+)', base_name)
    # Runs without a '-topk-N' part in the file name get '-' as a placeholder.
    top_k = int(match.group(1)) if match else '-'
    # The model and the split are recovered from the file-name convention.
    if run_name.startswith('gpt-4'):
        model_name = "gpt-4"
    else:
        model_name = "gpt-3.5-turbo-16k"
    if run_name.endswith('trainsample'):
        test_val = 'synthetic'
    else:
        test_val = 'human annotate'
    results.append(dict(
        task=parent_dir,  # record the parent directory in the result row
        model_name=model_name,
        accuracy=original_accuracy,
        total=len(res),
        #total_c = len(original_c),
        filtered_accuracy=filtered_accuracy,
        #filtered_c = len(filtered_c),
        filter_total=len(filtered_res),
        top_k=top_k,
        test_val=test_val,
    ))
results = pd.DataFrame(results)
results = results.sort_values(by=['task', 'model_name', 'top_k','test_val'])
results


Unnamed: 0,task,model_name,accuracy,total,filtered_accuracy,filter_total,top_k,test_val
4,bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.785714,182,0.836257,171,3,human annotate
7,bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.775623,361,0.825959,339,3,synthetic
8,bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.769231,182,0.804598,174,5,human annotate
13,bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.786704,361,0.830409,342,5,synthetic
6,bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.78022,182,0.816092,174,10,human annotate
10,bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.770083,361,0.820059,339,10,synthetic
9,bak_random5seed_example/5-shot-classify,gpt-4,0.868132,182,0.913295,173,3,human annotate
11,bak_random5seed_example/5-shot-classify,gpt-4,0.867036,361,0.912536,343,3,synthetic
5,bak_random5seed_example/5-shot-classify,gpt-4,0.813187,182,0.87574,169,5,human annotate
15,bak_random5seed_example/5-shot-classify,gpt-4,0.855956,361,0.903509,342,5,synthetic


In [92]:
# Plain-text CSV dump of the summary table, without the index column
# (presumably for copy-pasting elsewhere).
csv_text = results.to_csv(index=False)
print(csv_text)

task,model_name,accuracy,total,filtered_accuracy,filter_total,top_k,test_val
bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.7857142857142857,182,0.8362573099415205,171,3,human annotate
bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.775623268698061,361,0.8259587020648967,339,3,synthetic
bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.7692307692307693,182,0.8045977011494253,174,5,human annotate
bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.7867036011080333,361,0.8304093567251462,342,5,synthetic
bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.7802197802197802,182,0.8160919540229885,174,10,human annotate
bak_random5seed_example/5-shot-classify,gpt-3.5-turbo-16k,0.7700831024930748,361,0.8200589970501475,339,10,synthetic
bak_random5seed_example/5-shot-classify,gpt-4,0.8681318681318682,182,0.9132947976878613,173,3,human annotate
bak_random5seed_example/5-shot-classify,gpt-4,0.8670360110803325,361,0.9125364431486881,343,3,syntheti

In [71]:
# Number of description-based pairs and number of name-based groups.
len(list_1),len(list_2)

(52, 23)

In [72]:
# Number of distinct APIs covered by each criterion (list_2 elements are groups, not pairs).
len(set(api for pair in list_1 for api in pair)), len(set(api for pair in list_2 for api in pair))

(45, 56)