In [19]:
import collections
from abc import ABC
import datasets
import json
import openai
import numpy as np
from scipy.special import softmax
import textwrap
import matplotlib.pyplot as plt
from IPython.display import clear_output

OPENAI_SECRET_KEY = None


In [20]:
if OPENAI_SECRET_KEY is None:
  print("Please paste your OpenAI API key here:")
  OPENAI_SECRET_KEY = input().strip()
openai.api_key = OPENAI_SECRET_KEY
clear_output()

class OpenAIEngine():
  def __init__(self, model_name):
    self.model_name = model_name

  def score(self, text):
    """Tokenizes and scores a piece of text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    The score is log-likelihood. A higher score means a token was more
    likely according to the model.

    Returns a list of tokens and a list of scores.
    """
    response = openai.Completion.create(
        engine=self.model_name,
        prompt=text,
        max_tokens=0,
        logprobs=1,
        echo=True)

    tokens = response["choices"][0]["logprobs"]["tokens"]
    logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
    if logprobs and logprobs[0] is None:
      # GPT-3 API does not return logprob of the first token
      logprobs[0] = 0.0
    return tokens, logprobs

  def perplexity(self, text):
    """Compute the perplexity of the provided text."""
    completion = openai.Completion.create(
        model=self.model_name,
        prompt=text,
        logprobs=0,
        max_tokens=0,
        temperature=1.0,
        echo=True)
    token_logprobs = completion['choices'][0]['logprobs']['token_logprobs']
    nll = np.mean([i for i in token_logprobs if i is not None])
    ppl = np.exp(-nll)
    return ppl

  def generate(self,
               prompt,
               top_p=1.0,
               num_tokens=32,
               num_samples=1,
               frequency_penalty=0.0,
              presence_penalty=0.0):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    response = openai.Completion.create(
      engine=self.model_name,
      prompt=prompt,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      n=num_samples,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      logprobs=1,
    )
    outputs = [r["text"] for r in response["choices"]]
    return outputs[0] if num_samples == 1 else outputs


  def chat_generate(self,
                    previous_messages,
                    top_p=1.0,
                    num_tokens=32,
                    num_samples=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0):
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=previous_messages,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      n=num_samples,
    )
    return response

# MeetingBank dataset:
-----

### GPT3-D3 zero-shot prompting

In [55]:
data_folder = "../../MeetingBank"


In [58]:
import os
import json
val_path = os.path.join(data_folder, "test.json")
with open(val_path, 'r') as f:
    val_data = json.load(f)
print(val_data[20]['source'])

JSONDecodeError: Extra data: line 2 column 1 (char 60565)

In [14]:

import collections
from abc import ABC
import datasets
import json
import openai
import numpy as np
from scipy.special import softmax
import textwrap
import matplotlib.pyplot as plt
from IPython.display import clear_output

OPENAI_SECRET_KEY = None

In [32]:
if OPENAI_SECRET_KEY is None:
  print("Please paste your OpenAI API key here:")
  OPENAI_SECRET_KEY = input().strip()
openai.api_key = OPENAI_SECRET_KEY
clear_output()

class OpenAIEngine():
  def __init__(self, model_name):
    self.model_name = model_name

  def score(self, text):
    """Tokenizes and scores a piece of text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    The score is log-likelihood. A higher score means a token was more
    likely according to the model.

    Returns a list of tokens and a list of scores.
    """
    response = openai.Completion.create(
        engine=self.model_name,
        prompt=text,
        max_tokens=0,
        logprobs=1,
        echo=True)

    tokens = response["choices"][0]["logprobs"]["tokens"]
    logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
    if logprobs and logprobs[0] is None:
      # GPT-3 API does not return logprob of the first token
      logprobs[0] = 0.0
    return tokens, logprobs

  def perplexity(self, text):
    """Compute the perplexity of the provided text."""
    completion = openai.Completion.create(
        model=self.model_name,
        prompt=text,
        logprobs=0,
        max_tokens=0,
        temperature=1.0,
        echo=True)
    token_logprobs = completion['choices'][0]['logprobs']['token_logprobs']
    nll = np.mean([i for i in token_logprobs if i is not None])
    ppl = np.exp(-nll)
    return ppl

  def generate(self,
               prompt,
               top_p = 1):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=[{"role": "user", "content": prompt}],
      top_p=top_p
    
    )
    return response['choices'][0]['message']['content']


  def chat_generate(self,
                    previous_messages,
                    top_p=1.0,
                    num_tokens=32,
                    num_samples=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0):
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=previous_messages,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      n=num_samples,
    )
    return response

## Calculate openai cost

In [53]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
results = []
total_cost = 0
len_stat = [0,0,0]
for sample in tqdm(val_data):
    prompt = f"{sample['source']}\nSummarize the above article in 2 sentences."
    _seq_len = len(prompt)/4
    if _seq_len<4000:
        total_cost+=0.0015*_seq_len/1000
        len_stat[0]+=1
    elif _seq_len<16000:
        total_cost+=0.003*_seq_len/1000
        len_stat[1]+=1
    elif _seq_len<32000:
        total_cost+=0.06*_seq_len/1000
        len_stat[2]+=1
    else:
        total_cost+=0.06*32000/1000
print(len(val_data))
print(f"total cost: {total_cost} dollars (excluding output cost)")
print(f"#tokens <4000: {len_stat[0]} samples,  {round(len_stat[0]*100/len(val_data))}%")
print(f"#tokens <16000: {len_stat[1]} samples,  {round (len_stat[1]*100/len(val_data))}%")
print(f"#tokens <32000: {len_stat[2]} samples,  {round (len_stat[2]*100/len(val_data))}%")

100%|██████████| 862/862 [00:00<00:00, 308068.34it/s]

862
total cost: 60.59104199999995 dollars (excluding output cost)
#tokens <4000: 637 samples,  74%
#tokens <16000: 188 samples,  22%
#tokens <32000: 25 samples,  3%





## Inference

In [33]:
MODEL_NAME = "gpt-3.5-turbo-16k"
engine = OpenAIEngine(MODEL_NAME)

In [35]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
results = []
for sample in tqdm(val_data):
    prompt = f"{sample['source']}\nSummarize the above article in 2 sentences."
    sample_result = dict()
    sample_result["id"] = sample["id"]
    sample_result["target"] = output_text = re.sub(r'\n', '', sample["summary"]) #remove all \n
    generation = engine.generate(prompt, top_p=0.5)
    sample_result["prediction"] = re.sub(r'\n', '', generation)
    results.append(sample_result)
with open("output.json", "w") as json_file:
    for item in results:
        json.dump(item, json_file)
        json_file.write('\n')


  2%|▏         | 20/861 [01:40<1:10:42,  5.04s/it]


InvalidRequestError: This model's maximum context length is 16385 tokens. However, your messages resulted in 16933 tokens. Please reduce the length of the messages.

# QMSum dataset:
-----

In [None]:
data_folder = "../../QMSum-main/MEETPEFT_data/ALL/jsonl/"

In [72]:
import os
import json
val_path = os.path.join(data_folder, "test.jsonl")
with open(val_path, 'r', encoding="utf-8") as f:
    val_data = [json.loads(line) for idx, line in enumerate(f)]
print(val_data[20]['conversations'])
print(val_data.__len__())

Lynne Neagle AM:Okay, good morning, everyone. Welcome to the Children, Young People and Education Committee this morning. I've received apologies for absence from Suzy Davies and Hefin David, and we've got no substitutions. Can I ask if Members have any declarations of interest? Can I just, then, declare for the record that I chair the cross-party group on suicide prevention and that Samaritans Cymru, who are appearing before us later, provide the secretariat for that group, just for that to be on the record? Item 2, then, is an evidence session for our inquiry on education otherwise than at school, and I'm very pleased to welcome our panel of witnesses this morning: Sharon Davies, head of learning, Torfaen County Borough Council and representing the Association of Directors of Education in Wales; Nick Williams, director of education, Swansea city and county council, and representing the Association of Directors of Education in Wales; and David Hopkins, interim head of education at the

In [75]:
def generate(model_name,
               prompt,
               top_p = 1):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    
    response = openai.ChatCompletion.create(
      model=model_name,
      messages=[{"role": "user", "content": prompt}],
      top_p=top_p
    
    )
    return response['choices'][0]['message']['content']


In [79]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
import tiktoken
ENCODE_MODE = {"gpt-3.5-turbo", "gpt-4"}
results = []
total_cost = 0
len_stat = [0,0,0]
gpt4_enc = tiktoken.encoding_for_model("gpt-4")
gpt35_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
max_len= 0
for sample in tqdm(val_data):
    prompt = f"{sample['conversations']}\nSummarize the above article in 2 sentences."
    _seq_len = len(gpt35_enc.encode(prompt))
    '''_seq_len4 = len(gpt4_enc.encode(prompt))
    
    assert _seq_len == _seq_len4'''
    if _seq_len>max_len:
        max_len = _seq_len
    #print((_seq_len,_seq_len4))
    if _seq_len<4000:
        total_cost+=0.0015*_seq_len/1000
        len_stat[0]+=1
    elif _seq_len<16000:
        total_cost+=0.003*_seq_len/1000
        len_stat[1]+=1
    elif _seq_len<32000:
        total_cost+=0.06*_seq_len/1000
        len_stat[2]+=1
    else:
        total_cost+=0.06*32000/1000
print(len(val_data))
print(f"max: {max_len}")
print(f"total cost: {total_cost} dollars (excluding output cost)")
print(f"#tokens <4000: {len_stat[0]} samples,  {round(len_stat[0]*100/len(val_data))}%")
print(f"#tokens <16000: {len_stat[1]} samples,  {round (len_stat[1]*100/len(val_data))}%")
print(f"#tokens <32000: {len_stat[2]} samples,  {round (len_stat[2]*100/len(val_data))}%")

100%|██████████| 35/35 [00:00<00:00, 136.36it/s]

35
max: 33613
total cost: 16.079838000000002 dollars (excluding output cost)
#tokens <4000: 1 samples,  3%
#tokens <16000: 22 samples,  63%
#tokens <32000: 11 samples,  31%





## Inference

In [78]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
import tiktoken
gpt35_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
MODELS = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-32k"]
with open("output.json", "w") as json_file:
    for sample in tqdm(val_data):
        prompt = f"{sample['conversations']}\nSummarize the above article in 2 sentences."
        _seq_len = len(gpt35_enc.encode(prompt))
        sample_result = dict()
        sample_result["id"] = sample["id"]
        sample_result["target"] = output_text = re.sub(r'\n', '', sample["summary"]) #remove all \n
        if _seq_len<4000:
            generation = generate("gpt-3.5-turbo", prompt, top_p=0.5)
        elif _seq_len<16000:
            generation = generate("gpt-3.5-turbo-16k", prompt, top_p=0.5)
        elif _seq_len<32000:
            generation = generate("gpt-4-32k", prompt, top_p=0.5)
        else:
            print(f"# skipped sample id:{sample['id']} for being too long.")
            continue
        sample_result["prediction"] = re.sub(r'\n', '', generation)
        
        json.dump(sample_result, json_file)
        json_file.write('\n')




  9%|▊         | 3/35 [00:23<04:13,  7.93s/it]


InvalidRequestError: The model `gpt-4-32k` does not exist or you do not have access to it. Learn more: https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4.

In [4]:
!python /Users/zhuzengliang/Documents/GitHub/MeetPEFT/Benchmarks/utils/ResultsEval.py /Users/zhuzengliang/Documents/GitHub/MeetPEFT/Benchmarks/GPT/output.json

Traceback (most recent call last):
  File "/Users/zhuzengliang/Documents/GitHub/MeetPEFT/Benchmarks/utils/ResultsEval.py", line 7, in <module>
    from summertime.evaluation import Rouge, RougeWe, BertScore, Bleu, Meteor
  File "/Users/zhuzengliang/.local/lib/python3.8/site-packages/summertime/evaluation/__init__.py", line 8, in <module>
    from .rouge_metric import Rouge
  File "/Users/zhuzengliang/.local/lib/python3.8/site-packages/summertime/evaluation/rouge_metric.py", line 1, in <module>
    from summ_eval.rouge_metric import RougeMetric
ModuleNotFoundError: No module named 'summ_eval'


In [3]:
!pip install fire

Defaulting to user installation because normal site-packages is not writeable
Collecting fire
  Downloading fire-0.5.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting termcolor (from fire)
  Downloading termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25ldone
[?25h  Created wheel for fire: filename=fire-0.5.0-py2.py3-none-any.whl size=116931 sha256=e8dcdba70566eada8495ce7ee3a56b33d1b08f80556cc5356030c93c1c8da9f8
  Stored in directory: /Users/zhuzengliang/Library/Caches/pip/wheels/5b/eb/43/7295e71293b218ddfd627f935229bf54af9018add7fbb5aac6
Successfully built fire
Installing collected packages: termcolor, fire
Successfully installed fire-0.5.0 termcolor-2.3.0


In [7]:
!pip install -r /Users/zhuzengliang/Documents/GitHub/MeetPEFT/Benchmarks/SummerTime/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting git+https://github.com/bheinzerling/pyrouge.git@08e9cc35d713f718a05b02bf3bb2e29947d436ce (from -r /Users/zhuzengliang/Documents/GitHub/MeetPEFT/Benchmarks/SummerTime/requirements.txt (line 14))
  Cloning https://github.com/bheinzerling/pyrouge.git (to revision 08e9cc35d713f718a05b02bf3bb2e29947d436ce) to /private/var/folders/c6/xr1zg45d77ldyhcxb4ff5_9m0000gn/T/pip-req-build-s9uc8rst
  Running command git clone --filter=blob:none --quiet https://github.com/bheinzerling/pyrouge.git /private/var/folders/c6/xr1zg45d77ldyhcxb4ff5_9m0000gn/T/pip-req-build-s9uc8rst
  Running command git rev-parse -q --verify 'sha^08e9cc35d713f718a05b02bf3bb2e29947d436ce'
  Running command git fetch -q https://github.com/bheinzerling/pyrouge.git 08e9cc35d713f718a05b02bf3bb2e29947d436ce
  Resolved https://github.com/bheinzerling/pyrouge.git to commit 08e9cc35d713f