In [28]:
%pip install pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [29]:
import pandas as pd


In [31]:
COLS2READ = ['puzzleId', 'gpt-results']


In [32]:
# gpt-3.5-turbo datasets
GPT3_TEMP_0 = pd.read_csv(
    'out/26_07-data-gpt-3.5-turbo-16k-0.csv', usecols=COLS2READ).head(50)
GPT3_TEMP_1 = pd.read_csv(
    'out/26_07-data-gpt-3.5-turbo-16k-1.csv', usecols=COLS2READ).head(50)

# gpt-4 datasets
GPT4_TEMP_1 = pd.read_csv(
    'out/26_07-data-gpt-4-1.csv', usecols=COLS2READ).head(50)
GPT4_TEMP_0 = pd.read_csv(
    'out/27_07-data-gpt-4-0.csv', usecols=COLS2READ).head(50)


In [33]:
def calculate_ratio(row):
    values = row.split()  # Split the space-separated string into a list of values
    true_count = values.count('True')
    total_count = len(values)
    return round(true_count / total_count, 2)


def build_ratio_df(dfA: pd.DataFrame, dfB: pd.DataFrame, columns: list):
    # merge gpt columns results of both dataframe to compare base on puzzleId
    df = dfA.merge(dfB, on='puzzleId', how='left')
    df = df.rename(
        columns={'gpt-results_x': columns[0], 'gpt-results_y': columns[1]})

    # create a dataframe where the rows of gpt-results are different
    _df = df.loc[
        df[columns[0]] != df[columns[1]],
        ['puzzleId', columns[0], columns[1]]
    ].reset_index(drop=True).dropna()

    # calculate ratios
    _df[f'{columns[0]}_RATIO'] = _df[columns[0]].apply(calculate_ratio)
    _df[f'{columns[1]}_RATIO'] = _df[columns[1]].apply(calculate_ratio)

    # delete non-necessary columns
    _df = _df.drop(columns[0], axis=1)
    _df = _df.drop(columns[1], axis=1)

    # return builded df
    return _df


In [34]:
# gpt3_temp-0 VS gpt3_temp-1
columns = ['GPT3_TEMP_0', 'GPT3_TEMP_1']

MERGED = build_ratio_df(dfA=GPT3_TEMP_0, dfB=GPT3_TEMP_1, columns=columns)

print(MERGED)
print(
    f"[gpt3_temp_0-performance] : {round(MERGED[f'{columns[0]}_RATIO'].mean(), 2)}")
print(
    f"[gpt3_temp_1-performance] : {round(MERGED[f'{columns[1]}_RATIO'].mean(), 2)}")


   puzzleId  GPT3_TEMP_0_RATIO  GPT3_TEMP_1_RATIO
0     004iZ                0.0               0.33
1     008Nz                1.0               0.67
2     008o6                1.0               0.33
4     00FHX                1.0               0.33
5     00GRa                0.0               0.33
6     00H1C                0.0               0.33
7     00HHN                0.0               0.33
8     00IPp                0.0               0.33
9     00JO7                1.0               0.67
10    00P7n                1.0               0.67
11    00SMl                1.0               0.67
12    00SOy                1.0               0.00
13    00Xfn                0.0               0.33
14    00Xiu                0.0               0.33
[gpt3_temp_0-performance] : 0.5
[gpt3_temp_1-performance] : 0.4


In [35]:
# gpt4_temp-0 VS gpt4_temp-1
columns = ['GPT4_TEMP_0', 'GPT4_TEMP_1']

MERGED = build_ratio_df(dfA=GPT4_TEMP_0, dfB=GPT4_TEMP_1, columns=columns)

print(MERGED)
print(
    f"[gpt4_temp_0-performance] : {round(MERGED[f'{columns[0]}_RATIO'].mean(), 2)}")
print(
    f"[gpt4_temp_1-performance] : {round(MERGED[f'{columns[1]}_RATIO'].mean(), 2)}")


   puzzleId  GPT4_TEMP_0_RATIO  GPT4_TEMP_1_RATIO
0     001gi               0.33               0.00
2     004iZ               1.00               0.67
3     00DPQ               1.00               0.67
4     00H9n               1.00               0.00
5     00HoG               1.00               0.33
6     00IaZ               0.00               0.67
7     00JO7               0.00               0.67
8     00OPk               0.00               0.33
9     00Or5               1.00               0.67
10    00Ozz               0.67               0.33
11    00QZV               1.00               0.00
12    00SMl               0.67               0.67
13    00STy               1.00               0.33
14    00SeK               0.00               0.33
15    00SfT               0.00               0.33
16    00X1l               0.33               0.00
17    00X2S               0.33               0.00
18    00Zh6               0.00               0.67
[gpt4_temp_0-performance] : 0.52
[gpt4_temp_1-perf

In [36]:
# gpt3_temp-1 VS gpt4_temp-1
columns = ['GPT3_TEMP_1', 'GPT4_TEMP_1']

MERGED = build_ratio_df(dfA=GPT3_TEMP_1, dfB=GPT4_TEMP_1, columns=columns)

print(MERGED)
print(
    f"[gpt3_temp_1-performance] : {round(MERGED[f'{columns[0]}_RATIO'].mean(), 2)}")
print(
    f"[gpt4_temp_1-performance] : {round(MERGED[f'{columns[1]}_RATIO'].mean(), 2)}")


   puzzleId  GPT3_TEMP_1_RATIO  GPT4_TEMP_1_RATIO
1     004iZ               0.33               0.67
2     008Nz               0.67               1.00
3     008o6               0.33               1.00
4     00Bm8               0.00               1.00
5     00DPQ               0.00               0.67
6     00FHX               0.33               1.00
7     00G0z               0.00               1.00
8     00GRa               0.33               1.00
9     00H1C               0.33               0.00
10    00HHN               0.33               0.00
11    00HoG               0.00               0.33
12    00IPp               0.33               0.00
13    00ITc               0.00               1.00
14    00IaZ               0.00               0.67
15    00JO7               0.67               0.67
16    00KYE               0.00               1.00
17    00OPk               0.00               0.33
18    00Or5               0.00               0.67
19    00Ozz               0.00               0.33


In [38]:
# gpt3_temp-0 VS gpt4_temp-0
columns = ['GPT3_TEMP_0', 'GPT4_TEMP_0']

MERGED = build_ratio_df(dfA=GPT3_TEMP_0, dfB=GPT4_TEMP_0, columns=columns)

print(MERGED)
print(
    f"[gpt3_temp_0-performance] : {round(MERGED[f'{columns[0]}_RATIO'].mean(), 2)}")
print(
    f"[gpt4_temp_0-performance] : {round(MERGED[f'{columns[1]}_RATIO'].mean(), 2)}")


   puzzleId  GPT3_TEMP_0_RATIO  GPT4_TEMP_0_RATIO
0     001gi                0.0               0.33
1     002CP                0.0               1.00
2     004iZ                0.0               1.00
3     00B2k                0.0               1.00
4     00Bm8                0.0               1.00
5     00DPQ                0.0               1.00
6     00G0z                0.0               1.00
7     00GRa                0.0               1.00
8     00H9n                0.0               1.00
9     00HoG                0.0               1.00
10    00ITc                0.0               1.00
11    00JO7                1.0               0.00
12    00KYE                0.0               1.00
13    00Or5                0.0               1.00
14    00Ozz                0.0               0.67
15    00P7n                1.0               0.00
16    00QZV                0.0               1.00
17    00SMl                1.0               0.67
18    00SOy                1.0               0.00
