In [248]:
import openai
from dotenv import load_dotenv
import os
import re
import random
import backoff

load_dotenv()

class OpenAIGenerator():
    
    def __init__(self) -> None:
        ''' init '''
        self.API_KEY = os.environ.get("OPENAI_API_KEY")
        openai.api_key = self.API_KEY
        self.models = openai.Model.list()
        self.engines = openai.Engine.list()
    
    
    def format_text_regex(self, text):
        
        lines = text.strip().split("\n")
        # question_pattern = re.compile(r"\s*(?P<question>.+\?)")
        question_pattern = re.compile(r"\s*(?P<question>.+\?)")
        answer_pattern = re.compile(r"answer:\s*(?P<answer>.+)")
        choice_pattern = re.compile(r"incorrect[0-9]:\s*(?P<choice>.+)")
        distractors = []

        for line in lines:
            question_match = question_pattern.search(line)
            answer_match = answer_pattern.search(line)
            choice_match = choice_pattern.search(line)
            if question_match:
                question = question_match.group('question').replace('\n', '')
            elif answer_match:
                answer = answer_match.group('answer')
            elif choice_match:
                choice = choice_match.group('choice')
                distractors.append(choice)
        
        return question, answer, distractors
    
    def generate(self, context: str, desired_count: int = 1, lang='en') -> tuple:
        
        questions_list = []
        answers_list = []
        distractors_list = []
                
        @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
        def generate_question_with_backoff(**kwargs):
            return openai.Completion.create(
            model="babbage:ft-personal-2023-02-09-12-44-15",
            prompt=(f'''{context}'''),
            max_tokens=256,
            n = 1,
            best_of = 1,
            stop = " [END]",
            temperature = 0.2,
            top_p = 1,
            presence_penalty = 2,
            frequency_penalty = 1,
        )
        
        response = generate_question_with_backoff()
                
        output = response["choices"]

        for qad_pair in output:
            print(qad_pair)
            try:
                question, answer, distractors = self.format_text_regex(qad_pair['text'])
            except Exception:
                break
            questions_list.append(question)
            answers_list.append(answer)
            distractors_list.append(distractors)

        if len(answers_list) < 1:
            answers_list.append('')
            
        if len(questions_list) < 1:
            questions_list.append('')
            
        if len(distractors_list) < 1:
            distractors_list.append([])
                    
        while len(distractors_list[0]) < 3:
            distractors_list[0].append('')
        
        return answers_list[0], questions_list[0], distractors_list[0][0], distractors_list[0][1], distractors_list[0][2]
    


In [249]:
generator = OpenAIGenerator()

In [250]:
context = '''The gross state domestic product (GSDP) of Delhi at current prices for the year 2011-12 has been estimated at Rs 3.13 lakh crore, which is an increase of 18.7 per cent over the previous fiscal.'''

In [251]:
output = generator.generate(context)

{
  "finish_reason": "stop",
  "index": 0,
  "logprobs": null,
  "text": "\n\n###\n\n What is the gsdp of delhi?\n\nanswer: Rs 3.13 lakh crore, which is an increase of 18.7 per cent over the previous fiscal.\nincorrect1: Rs 3.13 lakh crore, which is an increase in GDP for 2011-12 and 2012-2013 at current prices.\"\nincorrect2: The gross state domestic product (GSDP) has been estimated at Rs 2 billion each year since 2010 to 2013]"
}


In [252]:
output

('Rs 3.13 lakh crore, which is an increase of 18.7 per cent over the previous fiscal.',
 'What is the gsdp of delhi?',
 'Rs 3.13 lakh crore, which is an increase in GDP for 2011-12 and 2012-2013 at current prices."',
 'The gross state domestic product (GSDP) has been estimated at Rs 2 billion each year since 2010 to 2013]',
 '')

In [151]:
answers

NameError: name 'answers' is not defined

In [2]:
import pandas as pd
from transformers import (
    T5TokenizerFast as T5Tokenizer
    )
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [262]:

model_name = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [263]:
dataset_path = 'datasets/davinci/wikipedia-50k-en.csv'
df = pd.read_csv(dataset_path).drop_duplicates(subset=['context'])

In [264]:

total_tokens = 0.


for i in tqdm(range(len(df))):
    total_tokens += len(tokenizer(df.iloc[i]['context'])['input_ids'])


100%|██████████| 46740/46740 [00:14<00:00, 3302.97it/s]


In [3]:
dataset_path = 'datasets/davinci/generated/text-davinci-003-gpt-3-20k-sum-en.csv'
df = pd.read_csv(dataset_path).drop_duplicates(subset=['context'])

In [4]:
df 

Unnamed: 0,topic,context,question,answer,choices,model,dataset
0,Jacob,"Twenty years later, throughout the Middle East...",Why did Benjamin stay behind when the other 10...,To keep him safe,"['To stay with his father', 'To remain in the ...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
1,Geodesic,"In an appropriate sense, zeros of the second v...",What are Jacobi fields regarded as?,Variations through geodesics,"['Solutions of the second variation', 'Zeros o...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
2,Charles Sanders Peirce,\nPeirce wrote drafts for an introductory text...,What is the title of the book edited by mathem...,The New Elements of Mathematics by Charles S. ...,"['The New Elements of Mathematics', 'Charles S...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
3,Advertising,Advertising may be categorized in a variety of...,What is the most common form of print advertis...,Display Advertising,"['Brand Advertising', 'Local Advertising', 'Di...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
4,Basque language,Part of the Romani community in the Basque Cou...,What language is spoken by the Romani communit...,Erromintxela,"['German', 'French', 'Spanish']",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
...,...,...,...,...,...,...,...
4707,Muhammad,"A year after the Battle of Tabuk, the Banu Tha...",What did Muhammad require in the military and ...,"Acknowledgement of the suzerainty of Medina, r...","['A pledge to obey the laws of Medina', 'A pro...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
4708,Boeing 747,"On November 14, 2005, Boeing announced it was ...",What was the first delivery of the 747-8?,Cargolux in 2011,"['November 14, 2005', 'February 8, 2010', '2009']",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
4709,Frog,Many frogs have webbed feet and the degree of ...,What is the degree of webbing of the toes of W...,Quarter or half webbed,"['Fully webbed', 'Not webbed', 'Three quarters...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
4710,Women's National Basketball Association,"On March 14, 2016, the WNBA was completing a d...",What does the deal between the WNBA and Verizo...,"Space on the front of 10 jerseys, in-arena adv...","['Space on the front of 12 jerseys, in-arena a...",text-davinci-003-gpt-3-20k-sum-en,wikipedia-20k-en
