In [None]:
# Install the gdown command-line tool, then use it to download the resource
# identified by the ID below into the working directory.
# NOTE(review): presumably this fetches one of the data files read later in the
# notebook (e.g. stack-dedup-python-lib-api.json) — confirm which file it produces.
!pip install gdown
!gdown 1aaOaAM0iidmkR0mHkPOIwddpkpzBmlWT

In [48]:
import re
import ast
import json
from glob import glob
from collections import Counter
from scipy.stats import pearsonr,spearmanr
from numpy import mean, median

In [88]:
def extract_apis(code):
    """Collect the fully qualified library APIs referenced in Python source.

    The source is parsed with ``ast`` and walked once. Names bound by
    ``import`` / ``from ... import`` statements are mapped back to their
    dotted module path, and every attribute access or call rooted at one of
    those names is recorded as a dotted API string. For example, after
    ``import numpy as np`` the call ``np.linalg.norm(x)`` yields
    ``numpy.linalg.norm`` (from the call) and ``numpy.linalg`` (from the
    intermediate attribute access).

    Imports are registered as the walk reaches them, so a usage that the
    traversal visits before the corresponding import statement is not
    recorded.

    Args:
        code: Python source text.

    Returns:
        A list of unique dotted API names in first-seen (traversal) order.
        The order is deterministic for a given input.

    Raises:
        SyntaxError: propagated from ``ast.parse`` when ``code`` is not
            valid Python.
    """
    tree = ast.parse(code)
    # A dict doubles as an insertion-ordered set: O(1) duplicate checks and a
    # result order that is identical on every run (iterating a set of strings
    # follows hash order, which changes with hash randomization).
    seen_apis = {}
    imported_modules = {}

    def record(api_call):
        # Keep only the first occurrence so the result preserves first-seen order.
        seen_apis.setdefault(api_call, None)

    class ApiExtractor(ast.NodeVisitor):
        def visit_Import(self, node):
            for alias in node.names:
                module_name = alias.name
                # Register the module and each of its dotted prefixes under
                # their own names (e.g. "os" and "os.path" for "os.path").
                parts = module_name.split('.')
                for i in range(1, len(parts) + 1):
                    prefix = '.'.join(parts[:i])
                    imported_modules[prefix] = prefix
                # Register the bound name last so an alias that happens to
                # equal one of the prefixes (``import os.path as os``) keeps
                # pointing at the module it is actually bound to.
                imported_modules[alias.asname or module_name] = module_name
            self.generic_visit(node)

        def visit_ImportFrom(self, node):
            # ``node.module`` is None for ``from . import x``; those are skipped.
            if node.module:
                for alias in node.names:
                    bound_name = alias.asname or alias.name
                    imported_modules[bound_name] = f'{node.module}.{alias.name}'
            self.generic_visit(node)

        def visit_Attribute(self, node):
            # Only ``<imported name>.<attr>`` is handled here; longer chains
            # are reached through generic_visit on the nested Attribute.
            root = node.value
            if isinstance(root, ast.Name) and root.id in imported_modules:
                record(f"{imported_modules[root.id]}.{node.attr}")
            self.generic_visit(node)

        def visit_Call(self, node):
            func = node.func
            if isinstance(func, ast.Attribute):
                # Unwind ``a.b.c`` from the outside in, collecting attribute
                # names until the root expression is reached.
                attr_parts = []
                current = func
                while isinstance(current, ast.Attribute):
                    attr_parts.append(current.attr)
                    current = current.value

                if isinstance(current, ast.Name) and current.id in imported_modules:
                    attr_parts.append(imported_modules[current.id])
                    record('.'.join(reversed(attr_parts)))
            elif isinstance(func, ast.Name) and func.id in imported_modules:
                # Direct call of a name bound by an import,
                # e.g. ``from math import sqrt; sqrt(4)``.
                record(imported_modules[func.id])

            self.generic_visit(node)

    ApiExtractor().visit(tree)
    return list(seen_apis)

def count_test_cases(class_string):
    # Parse string containing class definition into an AST tree
    tree = ast.parse(class_string)

    # Function to check if a function node in AST represents a test case
    def is_test_method(node):
        return isinstance(node, ast.FunctionDef) and node.name.startswith('test_')

    # Initialize a count of test methods
    test_method_count = 0

    # Traverse AST tree
    for node in ast.walk(tree):
        # Increment count if node is a test method
        if is_test_method(node):
            test_method_count += 1

    return test_method_count

def extract_sample_apis(sample):
    """Extract the APIs used by one sample, or ``[]`` if it lists no library.

    When ``sample["library"]`` is truthy, a source snippet is assembled from
    the part of ``sample["test_start"]`` that precedes the first
    ``"\\ndef check"``, followed by the prompt and the canonical solution.
    Tabs are expanded to four spaces before the snippet is handed to
    ``extract_apis``.
    """
    if not sample["library"]:
        return []

    # Everything before the first "def check" line of test_start.
    preamble = sample["test_start"].split("\ndef check")[0].strip()
    snippet = preamble + "\n\n" + sample["prompt"] + sample["canonical_solution"]
    return extract_apis(snippet.replace("\t", "    "))

In [94]:
def _load_jsonl(path):
    """Read a JSON Lines file and return one parsed object per line."""
    # The context manager closes the handle as soon as the file has been
    # read instead of leaving it open until garbage collection.
    with open(path) as handle:
        return [json.loads(line) for line in handle.read().splitlines()]


# All three sources are parsed line by line (one JSON document per line),
# including the Stack file despite its .json extension.
benchmark_data = _load_jsonl("../data/open-eval.jsonl")
stack_data = _load_jsonl("stack-dedup-python-lib-api.json")
# Concatenate the records of every ODEX .jsonl file matched by the glob.
odex_data = [record for file in glob("../data_collection/round_0/odex/*.jsonl") for record in _load_jsonl(file)]

## Basic Stats

![](benchmark_length.png)

In [68]:
# Each entry pairs the printed label with the per-task measurement whose
# mean (rounded to one decimal) is reported. Line counts are newline counts.
basic_stat_metrics = [
    ("Average Number of Test Cases:", lambda task: count_test_cases(task["test"])),
    ("Average Prompt Chars:", lambda task: len(task["prompt"])),
    ("Average Prompt Lines:", lambda task: task["prompt"].count("\n")),
    ("Average Solution Chars:", lambda task: len(task["canonical_solution"])),
    ("Average Solution Lines:", lambda task: task["canonical_solution"].count("\n")),
]
for label, metric in basic_stat_metrics:
    print(label, round(mean([metric(task) for task in benchmark_data]), 1))

Average Number of Test Cases: 5.9
Average Prompt Chars: 1117.7
Average Prompt Lines: 32.7
Average Solution Chars: 467.3
Average Solution Lines: 13.1


## Function Calling Stats

In [96]:
# Load the library -> domain lookup table. Later cells index it by library
# name (lib2domain[lib]) to obtain that library's domain label.
with open("lib2domain.json") as f:
    lib2domain = json.load(f)

In [86]:
# Number of distinct libraries across all Stack entries
print("Total Libraries in Stack:", len({lib for entry in stack_data for lib in entry["library"]}))
# Mean length of the per-entry "library" list
print("Average Number of Libraries Per File in Stack:", round(mean([len(entry["library"]) for entry in stack_data]), 1))
# Number of distinct APIs across all Stack entries
print("Total Number of APIs in Stack:", len({api for entry in stack_data for api in entry["api"]}))
# Mean length of the per-entry "api" list
print("Average Number of API Calls Per File in Stack:", round(mean([len(entry["api"]) for entry in stack_data]), 1))

Total Libraries in the Stack: 765602
Average Number of Libraries Per File in The Stack: 3.1
Total Number of APIs in The Stack: 14959933
Average Number of API Calls Per File in The Stack: 9.0


In [127]:
# Distinct libraries used anywhere in the benchmark (reused below)
benchmark_lib = {lib for task in benchmark_data for lib in task["libs"]}

# Library counts: distinct overall, and mean per task
print("Total Libraries in OpenEval:", len(benchmark_lib))
print("Average Number of Libraries Per Tasks:", round(mean([len(task["libs"]) for task in benchmark_data]), 1))

# API counts: distinct overall, and mean per task
print("Total Number of APIs in OpenEval:", len({api for task in benchmark_data for api in task["apis"]}))
print("Average Number of API Calls Per Tasks:", round(mean([len(task["apis"]) for task in benchmark_data]), 1))

# Domains counted once per distinct library
print("Domain Distribution of Libraries in OpenEval:", Counter(lib2domain[lib] for lib in benchmark_lib).most_common())
# Domains counted once per (task, library) occurrence
print("Domain Freq of Stack Counter for OpenEval:", Counter(lib2domain[lib] for task in benchmark_data for lib in task["libs"]).most_common())

# Distinct combinations per task; sorting makes each combination order-insensitive
print("Total Number of Different Lib Combo in OpenEval:", len({tuple(sorted(task["libs"])) for task in benchmark_data}))
print("Total Number of Different API Combo in OpenEval:", len({tuple(sorted(task["apis"])) for task in benchmark_data}))
print("Total Number of Different Domain Combo in OpenEval:", len({tuple(sorted(lib2domain[lib] for lib in task["libs"])) for task in benchmark_data}))

Total Libraries in OpenEval: 112
Average Number of Libraries Per Tasks: 2.7
Total Number of APIs in OpenEval: 507
Average Number of API Calls Per Tasks: 4.2
Domain Distribution of Libraries in OpenEval: [('System', 27), ('General', 25), ('Network', 25), ('Computation', 15), ('Cryptography', 10), ('Time', 5), ('Visualization', 5)]
Domain Freq of Stack Counter for OpenEval: [('Computation', 508), ('General', 274), ('System', 255), ('Visualization', 187), ('Network', 101), ('Cryptography', 60), ('Time', 52)]
Total Number of Different Lib Combo in OpenEval: 308
Total Number of Different API Combo in OpenEval: 502
Total Number of Different Domain Combo in OpenEval: 117
