In [1]:
import pickle
import random
from tqdm import tqdm

In [2]:
import os
import openai

# Take the API key from the environment rather than hardcoding it in the notebook.
# setdefault keeps a key that is already exported and only falls back to the
# empty placeholder when none is set (a plain assignment would wipe a real key).
os.environ.setdefault("OPENAI_API_KEY", "")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Instruction text that is placed in front of the paper text in each completion prompt.
prompt_start = (
    'Generate a dialogue between you and another person based on the following paper. '
    'You have access to the paper. '
    'In the first utterance you should write a short summary. '
    'The other person sees only your summary and asks four (4) questions, '
    'separated by your answers.'
)

In [4]:
def davinci_complete(text):
    '''
    Request a completion from text-davinci-003 for `prompt_start`, a blank line, then `text`.

    Returns the "text" field of the first entry in the response's "choices".
    '''
    request = dict(
        model="text-davinci-003",
        prompt='\n\n'.join([prompt_start, text]),
        temperature=0.7,
        max_tokens=1500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = openai.Completion.create(**request)

    first_choice = completion["choices"][0]
    return first_choice["text"]

In [6]:
def join_segments(raw_segments, max_len=3500):
    '''
    Join paper segments (by section_type) into text chunks of up to max length.

    Each segment is read positionally: [0] id, [1] text, [-2] title, [-1] section_type.
    The first (up to) two segments always form one group; the remaining segments
    are grouped into runs of consecutive segments sharing the same section_type.
    Inside a group, texts are joined with newlines for as long as the joined text
    stays shorter than `max_len`; a segment that would overflow starts a new chunk.
    A single segment longer than `max_len` is kept whole as its own chunk.

    Returns a list of (text, meta) tuples, where meta is a list of
    {'id', 'title', 'section_type'} dicts, one per segment merged into the chunk.
    Returns an empty list for empty input.
    '''
    total = len(raw_segments)
    if total == 0:
        return []

    # The leading (up to) two segments are always kept together as the first group.
    index_groups = [list(range(min(2, total)))]

    # Split the rest into runs of consecutive segments with equal section_type.
    run = []
    for i in range(2, total):
        if run and raw_segments[i][-1] != raw_segments[run[0]][-1]:
            index_groups.append(run)
            run = []
        run.append(i)
    if run:
        index_groups.append(run)

    joined_segments = []
    for group in index_groups:
        chunk_text = ''
        chunk_meta = []

        for idx in group:
            segment = raw_segments[idx]
            candidate = chunk_text + '\n' + segment[1]

            if chunk_text and len(candidate) >= max_len:
                # The segment does not fit: close the current chunk and start a new
                # one WITH this segment (previously the segment was silently lost).
                joined_segments.append((chunk_text, chunk_meta))
                chunk_text, chunk_meta = '', []
                candidate = segment[1]

            chunk_meta.append({'id': segment[0],
                               'title': segment[-2], 'section_type': segment[-1]})
            chunk_text = candidate.strip()

        if len(chunk_text) > 0:
            joined_segments.append((chunk_text, chunk_meta))

    return joined_segments

### Run generation in a loop

In [15]:
# Load the pickled input; later cells index it by position and read the
# 'paper_id', 'title' and 'segments' keys of each entry.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('segmented_papers.pkl', 'rb') as f:
    data = pickle.load(f)

In [16]:
# Number of entries loaded from the pickle
len(data)

24875

In [None]:
# Start from an empty result list. To resume an interrupted run, load the
# previously saved list instead (the generation loop checkpoints its results
# to 'davinci_dialogues.pkl').
davinci_dialogues = []

In [25]:
# Ids of papers that already have a generated dialogue, so the loop can skip them.
processed_papers = {
    dialogue['meta_paper']['paper_id']
    for dialogue in davinci_dialogues
}

In [None]:
# The loop appends to logs/processed.txt; make sure the directory exists first.
os.makedirs('logs', exist_ok=True)

for i in tqdm(range(510, 5000)): # for subsample
    paper = data[i]

    if paper['paper_id'] in processed_papers:
        continue

    processed_papers.add(paper['paper_id'])

    segmented_paper = join_segments(paper['segments'])

    # nothing to sample from: random.randint(0, -1) would raise ValueError
    if not segmented_paper:
        continue

    #  select random segment
    j = random.randint(0, len(segmented_paper) - 1)
    random_segment = segmented_paper[j]

    # skip acknowledgements
    if random_segment[1][-1]['section_type'].startswith('acknowledgement'):
        continue

    # skip segments too short to build a dialogue on
    if len(random_segment[0]) < 1000:
        continue

    with open('logs/processed.txt', 'a') as f:
        f.write(f'Processing {i}\n')

    try:
        result = davinci_complete(random_segment[0])
    except Exception as err:
        # Best effort: record the failure and move on to the next paper.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        with open('logs/processed.txt', 'a') as f:
            f.write(f'Failed for {i}: {err!r}\n')
        continue

    davinci_dialogues.append({
        'text': random_segment[0],
        'dialogue': result,
        'meta_segments': random_segment[1],
        'meta_paper': {'title': paper['title'], 'paper_id': paper['paper_id']},
    })

    # Checkpoint after every successful generation so an interrupted run loses nothing.
    with open('davinci_dialogues.pkl', 'wb') as f:
        pickle.dump(davinci_dialogues, f)

    with open('logs/processed.txt', 'a') as f:
        f.write(f'Saved for {i}, segments {j}\n')

  3%|▎         | 115/4490 [17:49<14:19:23, 11.79s/it]

In [70]:
# Show the first generated record as an example of the output format
davinci_dialogues[0]

{'text': 'In this section, we describe our proposed MNRE framework in detail. The key motivation of MNRE is that, for each relational fact, the relation patterns in sentences of different languages should be substantially consistent, and MNRE can utilize the pattern consistency and complementarity among languages to achieve better results for relation extraction.\nFormally, given two entities, their corresponding sentences in m different languages are defined as T = {S 1 , S 2 , . . . , S m }, where S j = {x 1 j , x 2 j , . . . , x n j j } corresponds to the sentence set in the jth language with n j sentences. Our model measures a score f (T, r) for each relation r, which is expected to be high when r is the valid one, otherwise low. The MNRE framework contains two main components:\n1. Sentence Encoder. Given a sentence x and two target entities, we employ CNN to encode relation patterns in x into a distributed representation x. The sentence encoder can also be implemented with GRU (Ch

### Postprocess constructed dialogues

Parse each dialogue into a summary plus person and bot utterances; strip the speaker and summary labels (e.g. `Me:`, `Person:`, `Summary:`) from the text.

In [7]:
import re

In [8]:
# Load the generated dialogues to post-process (presumably the complete
# generation output, judging by the file name - confirm).
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('papers_segmented_data/davinci_dialogues_full_v2.pkl', 'rb') as f:
    davinci_dialogues = pickle.load(f)

In [9]:
def check_summary(text):
    '''Tell whether `text` contains the marker 'summary:' in any letter case.'''
    marker = 'summary:'
    return text.lower().find(marker) >= 0


def clear_parts(text, part_to_remove):
    '''
    Delete every occurrence of each string in `part_to_remove` from `text`
    (in the given order, case-sensitively) and strip surrounding whitespace.
    '''
    cleaned = text
    for label in part_to_remove:
        if label in cleaned:
            cleaned = cleaned.replace(label, '')
    return cleaned.strip()


def _has_leading_label(line, labels, window=10):
    '''
    Tell whether any of `labels` occurs (case-insensitively) near the start of `line`.

    The search covers the first `window` characters, widened to the label's own
    length so that labels longer than the window (e.g. 'Other person:') can match.
    '''
    lowered = line.lower()
    return any(label.lower() in lowered[:max(window, len(label))] for label in labels)


def postproc_davinci_dialogue(full_text_raw):
    '''
    Parse a raw generated dialogue into a summary and a list of speaker turns.

    Returns a dict with:
      'summary'       - text of the summary line with labels removed ('' if none found)
      'turns'         - list of {'speaker': 'person' | 'bot', 'text': ...} for the
                        non-empty lines that follow the summary line
      'correct_order' - False if a speaker label contradicted the expected strict
                        person/bot alternation at least once, True otherwise
    '''
    bot_parts = ['Me:', 'You:', 'Answer:']
    person_parts = ['Person:', 'Other person:', 'Other:']
    summary_parts = ['Summary:', 'summary:']

    # With "Person 1"/"Person 2" labelling, whoever is mentioned first is treated
    # as the person asking questions and the other one as the bot.
    lowered_raw = full_text_raw.lower()
    if 'person 1' in lowered_raw and 'person 2' in lowered_raw:
        if lowered_raw.find('person 1') < lowered_raw.find('person 2'):
            person_parts.append('Person 1:')
            bot_parts.append('Person 2:')
        else:
            person_parts.append('Person 2:')
            bot_parts.append('Person 1:')
    elif 'person 2' in lowered_raw:
        person_parts.append('Person 2:')

    part_to_remove = bot_parts + person_parts + summary_parts

    # Pull text that follows a label after a line break (or extra spaces) onto
    # the label's own line, so each utterance ends up on a single line.
    for p in part_to_remove:
        full_text_raw = re.sub(f'({re.escape(p)})' + r'\s+(\S)', r'\1 \2', full_text_raw)

    items = [s for s in full_text_raw.strip().split('\n') if len(s.strip()) > 0]

    dial_parsed = {}

    # Preferred: the first line that carries an explicit 'summary:' marker.
    summ_idx = -1
    for i in range(len(items)):
        if check_summary(items[i]):
            summ_idx = i
            break

    # Fallback: the first bot-labelled line, unless a person-labelled line comes first.
    if summ_idx == -1:
        for i in range(len(items)):
            if _has_leading_label(items[i], person_parts):
                break
            if _has_leading_label(items[i], bot_parts):
                summ_idx = i
                break

    if summ_idx >= 0:
        dial_parsed['summary'] = clear_parts(items[summ_idx], part_to_remove)
    else:
        dial_parsed['summary'] = ''

    dial_parsed['turns'] = []

    # Speakers are expected to alternate person, bot, person, ...; an explicit
    # label that disagrees overrides the guess and shifts the alternation phase.
    person_start = 0
    correct_order = True
    # When no summary was found summ_idx is -1, so every line is treated as a turn.
    for j, utterance in enumerate(items[summ_idx+1:]):
        speaker = ['person', 'bot'][(j + person_start) % 2]

        if speaker == 'person' and _has_leading_label(utterance, bot_parts):
            speaker = 'bot'
            correct_order = False
            person_start += 1

        # Deliberately a second `if`, not `elif`: a line that carries both kinds
        # of label is flipped back, exactly as in the original two-loop version.
        if speaker == 'bot' and _has_leading_label(utterance, person_parts):
            speaker = 'person'
            correct_order = False
            person_start += 1

        dial_parsed['turns'].append({'speaker': speaker, 'text': clear_parts(utterance, part_to_remove)})

    dial_parsed['correct_order'] = correct_order

    return dial_parsed

In [10]:
# Attach the parsed form of every raw dialogue to its record, in place.
for idx in tqdm(range(len(davinci_dialogues))):
    record = davinci_dialogues[idx]
    record['parsed_dialogue'] = postproc_davinci_dialogue(record['dialogue'])

100%|██████████| 3588/3588 [00:00<00:00, 13280.61it/s]


In [11]:
# Save the records, which now also carry the 'parsed_dialogue' field, to a new pickle file.
with open('davinci_dialogues_postproc.pkl', 'wb') as f:
    pickle.dump(davinci_dialogues, f)