In [4]:
import os
import json
import openai
from ds.supported import load_dataset
from metrics.rouge import Rouge
import openai
import tiktoken
from tqdm import tqdm

In [5]:
# Run configuration: every tunable for this evaluation lives in this one dict.
config = dict(
    dataset="arxiv",
    preview=False,
    samples="max",
    min_input_size=0,
    max_length=4096,
    max_new_tokens=256,
    batch_size=1,
    model_hf_key="gpt-3.5-turbo-0613",
)

# Unpack the settings used by later cells into module-level names.
dataset, preview, samples = config["dataset"], config["preview"], config["samples"]
min_input_size, max_len = config["min_input_size"], config["max_length"]
max_new_tokens, model = config["max_new_tokens"], config["model_hf_key"]

In [6]:
# Output layout: results/<model>-<dataset>/{log.out, metrics.json}
log_dir = "results"
# Take the dataset name from `config`, not from the `dataset` variable: a later
# cell rebinds `dataset` to the loaded dataset object, which would end up in the
# path if this cell were re-run afterwards.
exec_path = f"{model}-{config['dataset']}"

# makedirs(exist_ok=True) also creates a missing "results" parent directory and
# avoids the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(os.path.join(log_dir, exec_path), exist_ok=True)
outputs_path = os.path.join(log_dir, exec_path, "log.out")
metrics_path = os.path.join(log_dir, exec_path, "metrics.json")


In [7]:
# load/process ds
# The dataset *name* is read from `config` because this cell rebinds `dataset`
# from the name string to the loaded dataset object; passing `dataset` itself
# would hand the object back to load_dataset() if the cell were run twice.
dataset = load_dataset(
    dataset=config["dataset"],
    preview=preview,
    samples=samples,
    min_input_size=min_input_size,
)
# Only the test split is used for evaluation in this notebook.
dtest = dataset.get_split("test")


100%|██████████| 3/3 [00:00<00:00, 127.32it/s]


In [8]:
# Tokenizer associated with the configured model; used to count and truncate prompt tokens.
tok = tiktoken.encoding_for_model(model)

# Fixed token overheads that preprocess() subtracts from the context budget.
tokens_per_message, tokens_per_prompt = 4, 3

def preprocess(batch):
    """Turn a dataset entry into a chat prompt truncated to the token budget.

    The summarization instruction and ``batch["text"]`` are joined into one
    string, encoded with ``tok``, cut to the available number of tokens and
    decoded back to text.

    Args:
        batch: mapping that exposes the source document under the ``"text"`` key.

    Returns:
        ``{"messages": [{"role": "system", "content": <truncated prompt>}]}``.
    """
    prompt = f"You are an expert at summarization. Proceed to summarize the following text. {batch['text']}"

    # per openai docs each message <|start|>{role/name}\n{content}<|end|>\n
    # every reply is primed with <|start|>assistant<|message|>
    # Budget = context length minus the reply allowance and the fixed overheads.
    budget = max_len - max_new_tokens - tokens_per_message - tokens_per_prompt - 1
    truncated_prompt = tok.decode(tok.encode(prompt)[:budget])

    return {"messages": [{"role": "system", "content": truncated_prompt}]}


# Apply the prompt construction to the test split.
parsed_dtest = dtest.map(preprocess)


100%|██████████| 6440/6440 [00:41<00:00, 155.89ex/s]


In [9]:
# Read the API key from the environment so the secret is never stored in the
# notebook source or its version history. Export OPENAI_API_KEY before starting
# the kernel.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook")
openai.api_key = _api_key

In [12]:

def generate_summary(sample, model_name=None):
    """Request a summary for one prepared prompt and return the reply text.

    Args:
        sample: list of chat messages (``{"role": ..., "content": ...}`` dicts),
            passed unchanged as ``messages`` to the chat completion call.
        model_name: model to query. Defaults to the notebook-level ``model``
            taken from ``config``, so the request uses the same model the
            tokenizer was selected for.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; errors are not
        handled here so that callers can record the failure.
    """
    response = openai.ChatCompletion.create(
        model=model if model_name is None else model_name,
        messages=sample,
        # Cap the reply at the allowance reserved when the prompt was truncated.
        max_tokens=max_new_tokens,
    )

    return response["choices"][0]["message"]["content"]


# `outputs` collects the summaries of successful requests in dataset order;
# `failed` collects the dataset indices whose request raised. A failed sample adds
# nothing to `outputs`, so until the retry cell re-inserts those entries
# `outputs` is shorter than the test split.
outputs = []
failed = []
for idx, sample in enumerate(tqdm(parsed_dtest["messages"])):
    try:
        outputs.append(generate_summary(sample))
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the kernel (KeyboardInterrupt) actually stops this multi-hour loop instead
    # of being recorded as one more failed sample.
    except Exception as err:
        print(f"failed in iteration {idx}")
        # Keep the cause visible; otherwise the reason for a failure is lost.
        print(f"  cause: {err!r}")
        failed.append(idx)
        # Checkpoint the summaries gathered so far whenever a request fails.
        with open(outputs_path, "w") as fp:
            json.dump(outputs, fp)

  1%|▏         | 86/6440 [08:51<22:41:11, 12.85s/it]

failed in iteration 85


  8%|▊         | 486/6440 [48:38<23:57:24, 14.49s/it]

failed in iteration 485


  9%|▊         | 554/6440 [57:42<24:02:02, 14.70s/it]

failed in iteration 553


 10%|█         | 645/6440 [1:09:14<21:20:14, 13.26s/it]

failed in iteration 644


 13%|█▎        | 843/6440 [1:31:29<21:23:21, 13.76s/it]

failed in iteration 842


 23%|██▎       | 1505/6440 [2:44:17<19:09:40, 13.98s/it]

failed in iteration 1504


 25%|██▍       | 1601/6440 [2:54:59<18:32:33, 13.79s/it]

failed in iteration 1600


 25%|██▌       | 1627/6440 [2:58:08<18:49:22, 14.08s/it]

failed in iteration 1626


 27%|██▋       | 1746/6440 [3:11:27<17:30:11, 13.42s/it]

failed in iteration 1745


 27%|██▋       | 1761/6440 [3:13:31<18:36:08, 14.31s/it]

failed in iteration 1760


 30%|███       | 1938/6440 [3:32:45<17:52:18, 14.29s/it]

failed in iteration 1937


 30%|███       | 1957/6440 [3:35:21<16:39:48, 13.38s/it]

failed in iteration 1956


 34%|███▍      | 2179/6440 [3:59:10<16:33:42, 13.99s/it]

failed in iteration 2178


 34%|███▍      | 2183/6440 [4:00:01<18:29:14, 15.63s/it]

failed in iteration 2182


 39%|███▉      | 2518/6440 [4:35:34<14:56:26, 13.71s/it]

failed in iteration 2517


 44%|████▍     | 2862/6440 [5:09:30<13:04:59, 13.16s/it]

failed in iteration 2861


 50%|█████     | 3227/6440 [5:44:34<12:11:21, 13.66s/it]

failed in iteration 3226


 52%|█████▏    | 3318/6440 [5:54:16<11:20:09, 13.07s/it]

failed in iteration 3317


 52%|█████▏    | 3333/6440 [5:56:12<12:04:45, 14.00s/it]

failed in iteration 3332


 53%|█████▎    | 3383/6440 [6:00:58<10:54:19, 12.84s/it]

failed in iteration 3382


 54%|█████▍    | 3497/6440 [6:12:01<10:08:14, 12.40s/it]

failed in iteration 3496


 61%|██████    | 3941/6440 [6:54:13<9:05:37, 13.10s/it] 

failed in iteration 3940


 67%|██████▋   | 4315/6440 [7:28:32<7:48:28, 13.23s/it]

failed in iteration 4314


 70%|██████▉   | 4490/6440 [7:45:16<7:10:51, 13.26s/it]

failed in iteration 4489


 75%|███████▌  | 4845/6440 [8:19:16<5:56:17, 13.40s/it]

failed in iteration 4844


 75%|███████▌  | 4856/6440 [8:20:41<5:34:43, 12.68s/it]

failed in iteration 4855


 79%|███████▉  | 5096/6440 [8:43:40<4:47:35, 12.84s/it]

failed in iteration 5095


 81%|████████▏ | 5247/6440 [8:58:33<4:23:48, 13.27s/it]

failed in iteration 5246


 82%|████████▏ | 5289/6440 [9:02:58<4:07:58, 12.93s/it]

failed in iteration 5288


 88%|████████▊ | 5652/6440 [9:36:50<2:49:03, 12.87s/it]

failed in iteration 5651


 99%|█████████▉| 6366/6440 [10:43:03<15:11, 12.31s/it] 

failed in iteration 6365


100%|██████████| 6440/6440 [10:49:59<00:00,  6.06s/it]


In [13]:
# Write the summaries collected so far to the log file as a JSON list,
# overwriting any earlier checkpoint.
with open(outputs_path, "w") as fp:
    fp.write(json.dumps(outputs))

In [18]:
# Build the retry set from the indices recorded in `failed`.
# NOTE(review): presumably `.select` returns the rows at those indices in the
# given order; the retry loop relies on that. Confirm against the dataset class.
failds = parsed_dtest.select(failed)

In [20]:

# Retry the samples that failed in the main loop and splice each new summary
# back into `outputs` at its dataset position.
#
# `failed` is in ascending order, so once every earlier retry has been inserted
# the correct position for dataset index `failed[idx]` is that index itself. Each
# retry that failed again leaves one slot missing on the left, so the position
# is shifted by the number of entries in `failfail`. Without the shift, every
# insertion following a failed retry would land one slot too far right.
#
# NOTE: this cell is not idempotent. Running it twice inserts the retried
# summaries twice.
failfail = []
for idx, sample in enumerate(tqdm(failds["messages"])):
    try:
        summary = generate_summary(sample)
    # Exception, not a bare `except:`, so a kernel interrupt stops the loop.
    except Exception as err:
        print(idx)
        print(f"  cause: {err!r}")
        # Positions within `failed` (not dataset indices) that failed again.
        failfail.append(idx)
    else:
        outputs.insert(failed[idx] - len(failfail), summary)



100%|██████████| 31/31 [02:44<00:00,  5.30s/it]


In [21]:
# Sanity check: number of summaries collected. They are later paired with
# dtest["summary"] for scoring, so this is expected to equal the test split size.
len(outputs)

6440

In [22]:
# Score the generated summaries against the reference summaries.
references = dtest["summary"]

# Predictions and references are paired by position. A missing summary (for
# example a sample whose retry failed again) would shift every later pair, so
# stop here instead of reporting scores on misaligned data.
if len(outputs) != len(references):
    raise ValueError(
        f"got {len(outputs)} predictions for {len(references)} references; "
        "retry the failed samples before computing ROUGE"
    )

rouge = Rouge()
metrics = rouge.compute(predictions=outputs, references=references)

In [23]:

# Persist the final artefacts as JSON: the metrics first, then the summaries.
for path, payload in ((metrics_path, metrics), (outputs_path, outputs)):
    with open(path, "w") as fp:
        json.dump(payload, fp)


In [24]:
# Display the ROUGE scores returned by rouge.compute().
metrics

{'rouge1': 0.4279752865589931,
 'rouge2': 0.1434104083900558,
 'rougeL': 0.22396662439703166,
 'rougeLsum': 0.32750137915478617}

In [4]:
import numpy as np
import pandas as pd

# Load the results spreadsheet into a DataFrame.
df = pd.read_excel("results/result.xlsx")

def geo_mean(iterable):
    """Return the geometric mean of ``iterable`` multiplied by 100.

    The values are converted to a NumPy array, multiplied together, and the
    product is raised to the power ``1 / len``.
    """
    values = np.array(iterable)
    nth_root = 1.0 / len(values)
    return np.prod(values) ** nth_root * 100


rouge_cols = ['Rouge1', 'Rouge2', 'RougeL']

# Row-wise geometric mean of the three ROUGE columns. It is computed before the
# columns are rescaled; geo_mean() applies its own factor of 100.
df['geo_mean'] = df[rouge_cols].agg(geo_mean, axis=1)

# Convert the ROUGE columns from fractions to percentages.
df[rouge_cols] = df[rouge_cols] * 100

# Show the table ordered by run folder.
df.sort_values(by=['Folder'])

Unnamed: 0,Folder,RougeAvg,Rouge1,Rouge2,RougeL,geo_mean
7,baichuan7b-arxiv,0.143309,22.24576,6.945236,13.801589,12.871262
8,baichuan7b-govreport,0.198462,35.940279,9.205102,14.393246,16.823755
9,baichuan7b-pubmed,0.10647,18.855206,4.416562,8.669112,8.970777
11,bart-arxiv,0.204987,34.360302,8.924503,18.211335,17.741661
10,bart-arxiv-1024-19062023153234,0.074199,15.678481,0.42792,6.153443,3.456167
13,bart-govreport,0.292944,49.457589,17.999579,20.425909,26.296154
12,bart-govreport-1024-21062023001839,0.051566,10.654403,0.05064,4.764792,1.369899
15,bart-pubmed,0.20675,33.985606,9.785639,18.253797,18.242281
14,bart-pubmed-1024-19062023133026,0.065601,13.722191,0.369774,5.588256,3.049384
16,bigbirdpegasus-arxiv,0.153142,24.511903,5.606568,15.82401,12.95578
