In [1]:
import pandas as pd
from pathlib import Path
import structlog
from IPython.display import display, clear_output
import regex as re
from sklearn.metrics import f1_score
import api_calls as api
from running_model import _find_label
import os
import numpy as np

In [2]:
logger = structlog.get_logger()

In [3]:
TESTING_DIR = Path("./data/testing/all_testing")

In [4]:
def make_dir(dir: str) -> None:
    if not os.path.exists(dir):
        logger.info("creating dir", path=dir)
        os.makedirs(dir)

In [5]:
async def test_results(model: str, instructions: list[str], prompt_name: str) -> None:
    make_dir(TESTING_DIR/f"{model}_{prompt_name}")
    finished_files = [str(ele) for ele in Path(TESTING_DIR/f"{model}_{prompt_name}").glob("*.csv")]
    counter = 1
    for file_path in TESTING_DIR.glob("*.xlsx"):
        clear_output(wait=True)
        title = re.search(r'data/testing/all_testing/lab-manual-(.*).xlsx', str(file_path)).group(1)
        if any(title in ele for ele in finished_files):
            counter += 1
            continue
        display(f"{counter}: {title}")
        data = pd.read_excel(file_path, usecols= ["sentence", "label"])
        results = await api.get_multiple_api_calls(model, instructions, data["sentence"])
        if len(instructions) == 1:
            results = [_find_label(reason[0]) for reason in results]
        else:
            results = [_find_label(reason[1]) for reason in results]
        data[f"{model}_{prompt_name}"] = results
        data.to_csv(TESTING_DIR/f"{model}_{prompt_name}"/f"{model}_{prompt_name}_{title}.csv")
        counter += 1

In [6]:
with open("./data/prompts/fine_tune_prompt_0.txt", "r") as file:
    fine_tune_prompt_0 = file.read().splitlines()
    

In [7]:
os.getenv("FINE_TUNE_PROMPT_0")

'ft:gpt-3.5-turbo-0613:personal::8BMMOD86'

In [8]:
await test_results(os.getenv("FINE_TUNE_PROMPT_0"), fine_tune_prompt_0, "fine_tune_prompt_0")

'24: sp-split-test-944601'

[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 8 / 206[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 11 / 206[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 10 / 206[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 2 / 206[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 1 / 206[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 5 / 206[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1m200[0m
[2m2023-10-19 14:04:48[0m [[32m[1minfo     [0m] [1mcompleted 6 

In [10]:
def refine_filenames(keep: list[str], ignore: list[str], file_names: list[str]):
        result = []
        for file_name in file_names:
            if all(word_to_keep in file_name for word_to_keep in keep) \
            and \
            all(word_to_ignore not in file_name for word_to_ignore in ignore):
                result.append(file_name)
        return result

In [11]:
def get_big_df(model: str, prompt_name: str, keep: list[str], ignore: list[str], file_names: list[str]):
    return pd.concat(
        [
            pd.read_csv(file_name, usecols= ["sentence", "label", f"{model}_{prompt_name}"])
            for file_name in 
            refine_filenames(keep, ignore, file_names)]
        ).reset_index(drop=True)

In [26]:
def get_f1(df: pd.DataFrame):
    # return f1_score(df.iloc[:, -2], df.iloc[:, -1], average="weighted")
    true = np.array(df.iloc[:,-2])
    pred = np.array(df.iloc[:, -1])
    correct = (true == pred)
    return correct.sum() / correct.size

In [27]:
def get_all_f1(model: str, prompt_name: str):
    file_names = [str(file_name) for file_name in Path(TESTING_DIR/f"{model}_{prompt_name}").glob("*.csv")]
    all_dfs = {
        "mm" : get_big_df(model, prompt_name, ["mm"], ["split"], file_names),
        "mm_split" : get_big_df(model, prompt_name, ["mm", "split"], [], file_names),
        "pc" : get_big_df(model, prompt_name, ["pc"], ["split"], file_names),
        "pc_split" : get_big_df(model, prompt_name, ["pc", "split"], [], file_names),
        "sp" : get_big_df(model, prompt_name, ["sp"], ["split"], file_names),
        "sp_split" : get_big_df(model, prompt_name, ["sp", "split"], [], file_names),
        "combine" : get_big_df(model, prompt_name, ["combine"], ["split"], file_names),
        "combine_split" : get_big_df(model, prompt_name, ["combine", "split"], [], file_names)
        }
    for key, value in all_dfs.items():
        broken_df = value.copy()
        nulls = value[value.iloc[:,-1].isnull()].index.tolist()
        if len(nulls) > 2:
            raise ValueError("too many nulls")
        for index in nulls:
            broken_df.iloc[index, -1] = 2
        all_dfs[key] = broken_df
    return {key : get_f1(value) for key, value in all_dfs.items()}


In [28]:
get_all_f1("gpt-3.5-turbo", "prompt_0")

{'mm': 0.5903426791277259,
 'mm_split': 0.604992657856094,
 'pc': 0.5291005291005291,
 'pc_split': 0.5948717948717949,
 'sp': 0.6767169179229481,
 'sp_split': 0.6203890006706908,
 'combine': 0.615546218487395,
 'combine_split': 0.6108870967741935}

In [29]:
get_all_f1("gpt-3.5-turbo", "prompt_1")

{'mm': 0.6277258566978193,
 'mm_split': 0.6593245227606461,
 'pc': 0.6031746031746031,
 'pc_split': 0.6307692307692307,
 'sp': 0.6381909547738693,
 'sp_split': 0.6368209255533199,
 'combine': 0.6211484593837535,
 'combine_split': 0.633736559139785}

In [30]:
get_all_f1(os.getenv("FINE_TUNE_LARGE"), "fine_tune")

{'mm': 0.7118380062305296,
 'mm_split': 0.73568281938326,
 'pc': 0.6772486772486772,
 'pc_split': 0.7282051282051282,
 'sp': 0.7571189279731994,
 'sp_split': 0.7344064386317908,
 'combine': 0.7310924369747899,
 'combine_split': 0.7278225806451613}

In [31]:
get_all_f1("gpt-4", "prompt_2")


{'mm': 0.6806853582554517,
 'mm_split': 0.6989720998531571,
 'pc': 0.6878306878306878,
 'pc_split': 0.6974358974358974,
 'sp': 0.7269681742043551,
 'sp_split': 0.6971830985915493,
 'combine': 0.6995798319327731,
 'combine_split': 0.6935483870967742}

In [32]:
get_all_f1(os.getenv("FINE_TUNE_PROMPT_2"), "fine_tune_prompt_2")


{'mm': 0.838006230529595,
 'mm_split': 0.8399412628487518,
 'pc': 0.8465608465608465,
 'pc_split': 0.8512820512820513,
 'sp': 0.8492462311557789,
 'sp_split': 0.8370221327967807,
 'combine': 0.8270308123249299,
 'combine_split': 0.8266129032258065}

In [33]:
get_all_f1(os.getenv("FINE_TUNE_PROMPT_0"), "fine_tune_prompt_0")

{'mm': 0.7398753894080997,
 'mm_split': 0.723935389133627,
 'pc': 0.7195767195767195,
 'pc_split': 0.7487179487179487,
 'sp': 0.7671691792294807,
 'sp_split': 0.7260228034875922,
 'combine': 0.7296918767507002,
 'combine_split': 0.717741935483871}

In [44]:
file_names = [str(file_name) for file_name in Path(TESTING_DIR/f"gpt-4_prompt_2").glob("*.csv")]

get_f1(get_big_df("gpt-4", "prompt_2", ["mm"], [], file_names))


0.6978105509540922