In [1]:
import collections
from abc import ABC
import datasets
import json
import openai
import numpy as np
from scipy.special import softmax
import textwrap
import matplotlib.pyplot as plt
from IPython.display import clear_output

OPENAI_SECRET_KEY = None




In [2]:
if OPENAI_SECRET_KEY is None:
  print("Please paste your OpenAI API key here:")
  OPENAI_SECRET_KEY = input().strip()
openai.api_key = OPENAI_SECRET_KEY
clear_output()

class OpenAIEngine():
  def __init__(self, model_name):
    self.model_name = model_name

  def score(self, text):
    """Tokenizes and scores a piece of text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    The score is log-likelihood. A higher score means a token was more
    likely according to the model.

    Returns a list of tokens and a list of scores.
    """
    response = openai.Completion.create(
        engine=self.model_name,
        prompt=text,
        max_tokens=0,
        logprobs=1,
        echo=True)

    tokens = response["choices"][0]["logprobs"]["tokens"]
    logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
    if logprobs and logprobs[0] is None:
      # GPT-3 API does not return logprob of the first token
      logprobs[0] = 0.0
    return tokens, logprobs

  def perplexity(self, text):
    """Compute the perplexity of the provided text."""
    completion = openai.Completion.create(
        model=self.model_name,
        prompt=text,
        logprobs=0,
        max_tokens=0,
        temperature=1.0,
        echo=True)
    token_logprobs = completion['choices'][0]['logprobs']['token_logprobs']
    nll = np.mean([i for i in token_logprobs if i is not None])
    ppl = np.exp(-nll)
    return ppl

  def generate(self,
               prompt,
               top_p=1.0,
               num_tokens=32,
               num_samples=1,
               frequency_penalty=0.0,
              presence_penalty=0.0):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    response = openai.Completion.create(
      engine=self.model_name,
      prompt=prompt,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      n=num_samples,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      logprobs=1,
    )
    outputs = [r["text"] for r in response["choices"]]
    return outputs[0] if num_samples == 1 else outputs


  def chat_generate(self,
                    previous_messages,
                    top_p=1.0,
                    num_tokens=32,
                    num_samples=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0):
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=previous_messages,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      n=num_samples,
    )
    return response

# MeetingBank dataset:
-----

### GPT3-D3 zero-shot prompting

In [4]:
data_folder = "../../MeetingBank"


In [5]:
import os
import json
val_path = os.path.join(data_folder, "test.json")
with open(val_path, 'r') as f:
    val_data = json.load(f)
print(val_data[20]['source'])

speaker 0: Great. Thank you. Next item, please.
speaker 1: Item 27 Report from Economic Development. Recommendation to execute a Second Amendment to Exclusive Negotiation Agreement with Howard CDB for the continuation of negotiations in connection with the proposed development of the former Long Beach Armory at 854 East Seventh Street District one.
speaker 0: Can have a second, please. Thank you, Mr. Modica. Just just briefly, because I'm I'm very personally very interested in this project. I think it's a phenomenal opportunity and we've been discussing it for for many years, and it seems to be in a very good place.
speaker 0: Can you can Stafford provide a just a brief update on this?
speaker 3: Yes, John Keisler. Honorable mayor and members of the city council. It's so good to see you. And I'm going to ask our deputy director, Sergio Ramirez, to give a short staff report about this amazing project. Thank you, John.
speaker 3: Good evening, honorable mayor and council members. We have

In [14]:

import collections
from abc import ABC
import datasets
import json
import openai
import numpy as np
from scipy.special import softmax
import textwrap
import matplotlib.pyplot as plt
from IPython.display import clear_output

OPENAI_SECRET_KEY = None

In [32]:
if OPENAI_SECRET_KEY is None:
  print("Please paste your OpenAI API key here:")
  OPENAI_SECRET_KEY = input().strip()
openai.api_key = OPENAI_SECRET_KEY
clear_output()

class OpenAIEngine():
  def __init__(self, model_name):
    self.model_name = model_name

  def score(self, text):
    """Tokenizes and scores a piece of text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    The score is log-likelihood. A higher score means a token was more
    likely according to the model.

    Returns a list of tokens and a list of scores.
    """
    response = openai.Completion.create(
        engine=self.model_name,
        prompt=text,
        max_tokens=0,
        logprobs=1,
        echo=True)

    tokens = response["choices"][0]["logprobs"]["tokens"]
    logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
    if logprobs and logprobs[0] is None:
      # GPT-3 API does not return logprob of the first token
      logprobs[0] = 0.0
    return tokens, logprobs

  def perplexity(self, text):
    """Compute the perplexity of the provided text."""
    completion = openai.Completion.create(
        model=self.model_name,
        prompt=text,
        logprobs=0,
        max_tokens=0,
        temperature=1.0,
        echo=True)
    token_logprobs = completion['choices'][0]['logprobs']['token_logprobs']
    nll = np.mean([i for i in token_logprobs if i is not None])
    ppl = np.exp(-nll)
    return ppl

  def generate(self,
               prompt,
               top_p = 1):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=[{"role": "user", "content": prompt}],
      top_p=top_p
    
    )
    return response['choices'][0]['message']['content']


  def chat_generate(self,
                    previous_messages,
                    top_p=1.0,
                    num_tokens=32,
                    num_samples=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0):
    response = openai.ChatCompletion.create(
      model=self.model_name,
      messages=previous_messages,
      temperature=1.0,
      max_tokens=num_tokens,
      top_p=top_p,
      frequency_penalty=frequency_penalty,
      presence_penalty=presence_penalty,
      n=num_samples,
    )
    return response

## Calculate openai cost

In [53]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
results = []
total_cost = 0
len_stat = [0,0,0]
for sample in tqdm(val_data):
    prompt = f"{sample['source']}\nSummarize the above article in 2 sentences."
    _seq_len = len(prompt)/4
    if _seq_len<4000:
        total_cost+=0.0015*_seq_len/1000
        len_stat[0]+=1
    elif _seq_len<16000:
        total_cost+=0.003*_seq_len/1000
        len_stat[1]+=1
    elif _seq_len<32000:
        total_cost+=0.06*_seq_len/1000
        len_stat[2]+=1
    else:
        total_cost+=0.06*32000/1000
print(len(val_data))
print(f"total cost: {total_cost} dollars (excluding output cost)")
print(f"#tokens <4000: {len_stat[0]} samples,  {round(len_stat[0]*100/len(val_data))}%")
print(f"#tokens <16000: {len_stat[1]} samples,  {round (len_stat[1]*100/len(val_data))}%")
print(f"#tokens <32000: {len_stat[2]} samples,  {round (len_stat[2]*100/len(val_data))}%")

100%|██████████| 862/862 [00:00<00:00, 308068.34it/s]

862
total cost: 60.59104199999995 dollars (excluding output cost)
#tokens <4000: 637 samples,  74%
#tokens <16000: 188 samples,  22%
#tokens <32000: 25 samples,  3%





## Inference

In [33]:
MODEL_NAME = "gpt-3.5-turbo-16k"
engine = OpenAIEngine(MODEL_NAME)

In [35]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
results = []
for sample in tqdm(val_data):
    prompt = f"{sample['source']}\nSummarize the above article in 2 sentences."
    sample_result = dict()
    sample_result["id"] = sample["id"]
    sample_result["target"] = output_text = re.sub(r'\n', '', sample["summary"]) #remove all \n
    generation = engine.generate(prompt, top_p=0.5)
    sample_result["prediction"] = re.sub(r'\n', '', generation)
    results.append(sample_result)
with open("output.json", "w") as json_file:
    for item in results:
        json.dump(item, json_file)
        json_file.write('\n')


  2%|▏         | 20/861 [01:40<1:10:42,  5.04s/it]


InvalidRequestError: This model's maximum context length is 16385 tokens. However, your messages resulted in 16933 tokens. Please reduce the length of the messages.

# QMSum dataset:
-----

In [6]:
data_folder = "../../QMSum-main/MEETPEFT_data/ALL/jsonl/"

In [7]:
import os
import json
val_path = os.path.join(data_folder, "test.jsonl")
with open(val_path, 'r', encoding="utf-8") as f:
    val_data = [json.loads(line) for idx, line in enumerate(f)]
print(val_data[20]['conversations'])
print(val_data.__len__())

Lynne Neagle AM:Okay, good morning, everyone. Welcome to the Children, Young People and Education Committee this morning. I've received apologies for absence from Suzy Davies and Hefin David, and we've got no substitutions. Can I ask if Members have any declarations of interest? Can I just, then, declare for the record that I chair the cross-party group on suicide prevention and that Samaritans Cymru, who are appearing before us later, provide the secretariat for that group, just for that to be on the record? Item 2, then, is an evidence session for our inquiry on education otherwise than at school, and I'm very pleased to welcome our panel of witnesses this morning: Sharon Davies, head of learning, Torfaen County Borough Council and representing the Association of Directors of Education in Wales; Nick Williams, director of education, Swansea city and county council, and representing the Association of Directors of Education in Wales; and David Hopkins, interim head of education at the

In [8]:
def generate(model_name,
               prompt,
               top_p = 1):
    """Generates text given the provided prompt text.

    This only works for the OpenAI models which support the legacy `Completion`
    API.

    If num_samples is 1, a single generated string is returned.
    If num_samples > 1, a list of num_samples generated strings is returned.
    """
    
    response = openai.ChatCompletion.create(
      model=model_name,
      messages=[{"role": "user", "content": prompt}],
      top_p=top_p
    
    )
    return response['choices'][0]['message']['content']


In [79]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
import tiktoken
ENCODE_MODE = {"gpt-3.5-turbo", "gpt-4"}
results = []
total_cost = 0
len_stat = [0,0,0]
gpt4_enc = tiktoken.encoding_for_model("gpt-4")
gpt35_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
max_len= 0
for sample in tqdm(val_data):
    prompt = f"{sample['conversations']}\nSummarize the above article in 2 sentences."
    _seq_len = len(gpt35_enc.encode(prompt))
    '''_seq_len4 = len(gpt4_enc.encode(prompt))
    
    assert _seq_len == _seq_len4'''
    if _seq_len>max_len:
        max_len = _seq_len
    #print((_seq_len,_seq_len4))
    if _seq_len<4000:
        total_cost+=0.0015*_seq_len/1000
        len_stat[0]+=1
    elif _seq_len<16000:
        total_cost+=0.003*_seq_len/1000
        len_stat[1]+=1
    elif _seq_len<32000:
        total_cost+=0.06*_seq_len/1000
        len_stat[2]+=1
    else:
        total_cost+=0.06*32000/1000
print(len(val_data))
print(f"max: {max_len}")
print(f"total cost: {total_cost} dollars (excluding output cost)")
print(f"#tokens <4000: {len_stat[0]} samples,  {round(len_stat[0]*100/len(val_data))}%")
print(f"#tokens <16000: {len_stat[1]} samples,  {round (len_stat[1]*100/len(val_data))}%")
print(f"#tokens <32000: {len_stat[2]} samples,  {round (len_stat[2]*100/len(val_data))}%")

100%|██████████| 35/35 [00:00<00:00, 136.36it/s]

35
max: 33613
total cost: 16.079838000000002 dollars (excluding output cost)
#tokens <4000: 1 samples,  3%
#tokens <16000: 22 samples,  63%
#tokens <32000: 11 samples,  31%





## Inference

In [10]:

#val data loaded, now use openai api to get summary sample by sample
#the result will be writen to a json for later evaluation
import re
from tqdm import tqdm
import tiktoken
gpt35_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
MODELS = ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-32k"]
with open("output.json", "w") as json_file:
    for sample in tqdm(val_data):
        if sample["id"]<21:
            continue
        prompt = f"{sample['conversations']}\nSummarize the above article sentences."
        _seq_len = len(gpt35_enc.encode(prompt))
        sample_result = dict()
        sample_result["id"] = sample["id"]
        sample_result["target"] = output_text = re.sub(r'\n', '', sample["summary"]) #remove all \n
        if _seq_len<4000:
            generation = generate("gpt-3.5-turbo", prompt, top_p=0.5)
        elif _seq_len<16000:
            generation = generate("gpt-3.5-turbo-16k", prompt, top_p=0.5)
        elif _seq_len<32000:
            con = sample['conversations'].split('\n')
            _half_pos = len(con)//2
            first_half = "\n".join(con[:_half_pos])
            sec_half = "\n".join(con[_half_pos:])
            prompt = f"{first_half}\nSummarize the above article."
            generation1 = generate("gpt-3.5-turbo-16k", prompt, top_p=0.5)
            prompt = f"{sec_half}\nSummarize the above article."
            print("1")
            generation2 = generate("gpt-3.5-turbo-16k", prompt, top_p=0.5)
            prompt = f"{generation1+generation2}\nSummarize the above article."
            print("2")
            generation = generate("gpt-3.5-turbo-16k", prompt, top_p=0.5)
        else:
            print(f"# skipped sample id:{sample['id']} for being too long.")
            continue
        sample_result["prediction"] = re.sub(r'\n', '', generation)
        
        json.dump(sample_result, json_file)
        json_file.write('\n')




  0%|          | 0/35 [00:00<?, ?it/s]

 74%|███████▍  | 26/35 [01:06<00:41,  4.64s/it]

1
2


 86%|████████▌ | 30/35 [02:10<00:48,  9.61s/it]

1
2


 89%|████████▊ | 31/35 [02:48<01:06, 16.62s/it]

1
2


100%|██████████| 35/35 [03:56<00:00,  6.76s/it]


In [1]:
import sys
sys.path.insert(1, "../utils")
from ResultsEval import run_eval

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhuzengliang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zhuzengliang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
run_eval("output.json")

{"id": 21, "target": "This is the second meeting of the design group. Project Manager introduced new project requirements at first. The management required a remote control only for television and aimed at customers under 40. In terms of user interface design, User Interface focused on user-friendliness but still thought multi-function should be considered. Industrial Designer agreed and proposed to substitute voice recognition for the ten digits. He also suggested that they should use infrared so that the remote control could be connected with most TV sets. After that, in order to solve the problem of energy source, the team decided to include a cradle so that the remote control could be recharged.", "prediction": "The project manager welcomes everyone back and checks who is present. The user interface mentions a problem with the CD-ROM box. The industrial designer asks for a cup of coffee but is denied. The project manager discusses the agenda for the meeting, including new project r

100%|██████████| 14/14 [00:00<00:00, 2717.40it/s]

['This is the second meeting of the design group. Project Manager introduced new project requirements at first. The management required a remote control only for television and aimed at customers under 40. In terms of user interface design, User Interface focused on user-friendliness but still thought multi-function should be considered. Industrial Designer agreed and proposed to substitute voice recognition for the ten digits. He also suggested that they should use infrared so that the remote control could be connected with most TV sets. After that, in order to solve the problem of energy source, the team decided to include a cradle so that the remote control could be recharged.', 'Project Manager initiated the meeting topic on the detailed design. Specifically, the prototype presentation was given by Industrial Designer, and calculation on each evaluation criteria was led by Marketing. Next, group members discussed changing aspects of the remote. Finally, they decided on changing the




TypeError: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): This is the second meeting of the design group. Project Manager introduced new project requirements at first. The management required a remote control only for television and aimed at customers under 40. In terms of user interface design, User Interface focused on user-friendliness but still thought multi-function should be considered. Industrial Designer agreed and proposed to substitute voice recognition for the ten digits. He also suggested that they should use infrared so that the remote control could be connected with most TV sets. After that, in order to solve the problem of energy source, the team decided to include a cradle so that the remote control could be recharged.