In [None]:
%%bash 
# Data accessible via the Hugging Face dataset repository below.
# Skip the clone when a previous run already left a checkout behind:
# `git clone` aborts if the target directory exists and is not empty.
if [ ! -d API-Bank ]; then
    git clone https://huggingface.co/datasets/liminghao1630/API-Bank
fi

In [None]:
import os 
# Directory the dataset repository is cloned into; test-data/ and training-data/ are read from here
DATA_ROOT = 'API-Bank' 
# Directory the cleaned CSV is written to
OUTPUT_PATH = '../Datasets/'

In [None]:
# Sanity-check that the clone produced the expected file before loading anything.
# os.path.join only builds a string and never touches the filesystem, so the
# existence test has to be explicit.
level3_path = os.path.join(DATA_ROOT, 'test-data/level-3.json')
if not os.path.isfile(level3_path):
    print(f"Expected data file not found: {level3_path}")

In [None]:
import pandas as pd
import random
# Seed value kept in one place for reproducibility.
# NOTE(review): confirm it is actually passed to the RNG wherever random sampling happens.
RANDOM_STATE = 42
            

def load_jsons_from_dir(test_data_path):
    """Load every JSON file in a directory into its own DataFrame.

    Each frame is normalised to a shared schema: an ``expected_output`` column
    is renamed to ``output`` and every column other than ``input``, ``output``
    or ``instruction`` is dropped (each dropped column is reported on stdout).

    Args:
        test_data_path: Directory containing the ``.json`` files. Despite the
            name, it is used for both the test and the training directories.

    Returns:
        A list with one DataFrame per JSON file, ordered by file name.

    Raises:
        FileNotFoundError: If ``test_data_path`` does not exist.
    """
    keep_cols = ['input', 'output', 'instruction']

    # Sort for a deterministic order (os.listdir order is arbitrary) and skip
    # anything that is not a JSON file so stray repository files cannot break
    # pd.read_json.
    json_files = sorted(
        name for name in os.listdir(test_data_path)
        if name.lower().endswith('.json')
    )

    dfs = []
    for file in json_files:
        fp = os.path.join(test_data_path, file)
        df = pd.read_json(fp)
        if "expected_output" in df.columns:
            df = df.rename(columns={'expected_output': 'output'})

        # Drop any column which is not 'input', 'output', or 'instruction'
        extra_cols = [col for col in df.columns if col not in keep_cols]
        for col in extra_cols:
            print(f"dropped {col}")
        dfs.append(df.drop(columns=extra_cols))
    return dfs

In [None]:
import pandas as pd

# Test split: load every file, stack them, and tag the rows as 'test'
test_dfs = load_jsons_from_dir(os.path.join(DATA_ROOT, 'test-data'))
test_df = pd.concat(test_dfs).assign(split='test')

# Training split: same treatment, tagged as 'train'
train_dfs = load_jsons_from_dir(os.path.join(DATA_ROOT, 'training-data'))
train_df = pd.concat(train_dfs).assign(split='train')

In [None]:
# Stack train on top of test under a fresh 0..n-1 index, then discard exact duplicate rows
api_df = (
    pd.concat([train_df, test_df], axis=0)
    .reset_index(drop=True)
    .drop_duplicates()
)
api_df

In [None]:
def _classify_output(value):
    """Label an output by its leading marker: 'API-Request', 'AI', or anything else."""
    text = str(value)
    if text.startswith('API-Request'):
        return 'API_request'
    if text.startswith('AI'):
        return 'AI'
    return 'Other'


# Tag each row by whether its output is an API request, an AI reply, or something else
api_df['query_type'] = api_df['output'].apply(_classify_output)
api_df.groupby(['query_type', 'split']).count()

In [None]:
print(api_df.shape[0])
# Keep only the API requests (see report for reasoning).
# .copy() detaches the filtered rows from the original frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
api_df = api_df[api_df['query_type']=='API_request'].copy()
print(api_df.shape[0])

In [None]:
# Tidy: strip the "API-Request:" marker and surrounding whitespace.
# regex=False makes the match literal regardless of the pandas version's default.
api_df['cleaned_output'] = api_df['output'].str.replace("API-Request:", "", regex=False).str.strip()

In [None]:
# Get the instruction queries: the second line of each instruction text.
# The vectorised .str accessor yields NaN (instead of raising) when an
# instruction is missing or has no second line.
api_df['instruction_query_type'] = api_df['instruction'].str.split('\n').str[1]
api_df.groupby(['instruction_query_type','split']).count()

In [None]:
import re

def parse_func_calls(input_string):
    """Return every ``name(args)`` substring found in the text.

    Only calls whose argument list contains no parentheses are matched, so for
    nested calls just the innermost one is returned. The result is a list of
    the matched substrings in order of appearance (empty if there are none).
    """
    return re.findall(r'\b\w+\([^()]*\)', input_string)


In [None]:
import tqdm
# Register tqdm's pandas hooks (progress_apply and friends).
# NOTE(review): no progress_* call appears in the cells shown here -- confirm this is still needed.
tqdm.tqdm.pandas()

In [None]:
# Extract the list of function-call strings from every cleaned output
api_df['function_calls'] = api_df['cleaned_output'].map(parse_func_calls)


In [None]:
import matplotlib.pyplot as plt
# Despite what the API Bank paper says, only one function call is used at a time:
# count the calls parsed per row and look at the distribution
api_df['number_of_calls'] = api_df['function_calls'].map(len)
api_df['number_of_calls'].hist()

In [None]:
# With a single call per row, keep just the first parsed call;
# rows where nothing was parsed get the string "None"
api_df['function_call'] = api_df['function_calls'].apply(
    lambda calls: calls[0] if calls else "None"
)

In [None]:
# Pull out the function name -- used in later eval metrics
api_df['function_name'] = api_df['function_call'].apply(lambda x: x.split("(")[0] if ("(" in x) else "None")

# Histogram of how many rows use each function. Draw on an explicit Axes and
# set the x-limit afterwards, rather than calling plt.xlim on whatever the
# implicit "current axes" happens to be before the plot exists.
calls_per_function = api_df.groupby('function_name')['function_call'].count()
fig, ax = plt.subplots()
calls_per_function.plot(kind='hist', bins=2000, ax=ax)
ax.set_xlabel('Calls per function')
ax.set_ylabel('Number of functions')
ax.set_xlim(0, 50)

In [None]:
# Build the prompt: instruction text, a newline, then the input text
instruction_text = api_df['instruction']
input_text = api_df['input']
api_df['prompt'] = instruction_text + "\n" + input_text

In [None]:
# How many in each?
# Only a cell's last expression is rendered, so the counts must be displayed
# explicitly (display is provided by the IPython kernel).
display(api_df.groupby('split').count())

api_df

In [None]:
# Rename cleaned_output to completion
api_df = api_df.rename({'cleaned_output':'completion'},axis=1)

# Move roughly 10% of the training rows into an 'eval' split. A dedicated
# generator seeded with RANDOM_STATE makes the assignment reproducible across
# kernel restarts; the global, unseeded random.random() did not.
EVAL_FRACTION = 0.1
split_rng = random.Random(RANDOM_STATE)
api_df['split'] = api_df['split'].apply(
    lambda x: 'eval' if x == 'train' and split_rng.random() < EVAL_FRACTION else x
)
api_df.groupby('split').count()

In [None]:
# output -- create the target directory first so to_csv cannot fail when it is missing
os.makedirs(OUTPUT_PATH, exist_ok=True)
api_df.to_csv(os.path.join(OUTPUT_PATH,'cleaned_api_bank_data.csv'))

In [None]:
# delete the cloned dataset folder -- use DATA_ROOT so this stays in sync with
# the location the loaders read from, and tolerate it already being gone
import shutil

if os.path.isdir(DATA_ROOT):
    shutil.rmtree(DATA_ROOT)