In [1]:
import pypdf
import pytesseract
import requests
import io

import openai
import tiktoken
import os
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
# step 1: PDF -> Text
def get_pdf(url:str) -> io.BytesIO:
    pdf_response = requests.get(pdf_url)
    pdf_bytes = io.BytesIO(pdf_response.content) # to avoid storing it locally
    return pdf_bytes

def extract_text(pdf_bytes: io.BytesIO) -> str:
    # Step 2: Extract text from the PDF file
    pdf_reader = pypdf.PdfReader(pdf_bytes)
    ocr_text = ""
    for page in range(len(pdf_reader.pages)):
        page_obj = pdf_reader.pages[page]
        ocr_text += page_obj.extract_text()
    return ocr_text

def ocrize_pdf(url: str) -> str:
    pdf_bytes = get_pdf(url)
    return extract_text(pdf_bytes)

In [3]:
pdf_url = "https://arxiv.org/pdf/2304.03843.pdf"
%timeit pdf_text = ocrize_pdf(pdf_url)

1.85 s ± 698 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
pdf_text = ocrize_pdf(pdf_url)
print(pdf_text[:100])

Why think step-by-step? Reasoning emerges from the
locality of experience
Ben Prystawski
Department 


In [None]:
# step 2: Summarize test

In [5]:
# preliminary step - assess cost before calling API
def compute_num_tokens(text, encoder_type : str = "cl100k_base") -> int:
    token_encoder = tiktoken.get_encoding(encoder_type)
    num_tokens = len(token_encoder.encode(text))
    return num_tokens

# TODO - parse automatically open ai page? "https://openai.com/pricing"
token_price_per_model = {# more recent models
                         "gpt-4": 0.03, 
                         "gpt-4-32k": 0.06,
                         "gpt-3.5-turbo":0.002, 
                         "text-embedding-ada-002": 0.0004,
                         # older gpt-3 models
                         "text-ada-001": 0.0004,
                         "text-babbage-001": 0.0005,
                         "text-curie-001": 0.002,
                         "text-davinci-003": 0.02}

model_context_length = {"gpt-4": 8196, 
                        "gpt-4-32k": 32768, 
                        "gpt-3.5-turbo": 4096, 
                        "text-davinci-003":4097, 
                        "text-embedding-ada-002": 8191}

def compute_api_call_price(num_input_tokens, model_params:dict) -> float:
    max_total_tokens = num_input_tokens + model_params["max_tokens"]
    max_price = max_total_tokens * token_price_per_model[model_params["model"]]
    return max_price/100

In [6]:
def summarize_text(input_text:str, model_params:dict) -> str:
    messages = [
    #   {"role": "system", "content": "Summarize the following article"},
        {"role": "user", "content": f"Summarize the following text in less than 10 sentences: \n {input_text}"}
    ]
    content_input = " ".join([message["content"] for message in messages])
    num_input_tokens = compute_num_tokens(content_input)
    max_total_tokens = num_input_tokens + model_params["max_tokens"]
    max_context_length = model_context_length[model_params["model"]]
    if max_total_tokens > max_context_length:
        print(f"Input longer than model max context length ({max_total_tokens}>{max_context_length}). Operation cancelled")
        return ""
        # TODO - Add logic to shorten messages up to 4097 tokens
    print(f"The input text contains {num_input_tokens} tokens.")
    max_price = compute_api_call_price(num_input_tokens, model_params)
    is_continue = input(f"The api call can cost up to {max_price}$. Continue? [y/n]")
    if is_continue == "y":        
        summary = openai.ChatCompletion.create(
          messages = messages,
          **model_params
        )
        return summary
    else:
        print("Operation cancelled.")

In [7]:
model_params = {"model": "gpt-3.5-turbo", 
                "max_tokens":200, 
                "temperature":0.7,
                "top_p":1.0, # default value
                "frequency_penalty": 0.0, 
                "presence_penalty":0.5,
                "stop" : "",
                "stream": False}

In [8]:
shorten_pdf_text = pdf_text[:17000]
len(shorten_pdf_text)

17000

In [9]:
pdf_summary = summarize_text(shorten_pdf_text, model_params)

The input text contains 3699 tokens.


The api call can cost up to 0.07798$. Continue? [y/n] y


In [10]:
pdf_summary["choices"][0]["message"]["content"]

'The article explores the hypothesis that reasoning is effective when training data consists of local clusters of variables that have strong influences on each other. The authors use language models to investigate this question and find that intermediate reasoning steps only help when the training data has a locality structure that corresponds to the variables that strongly influence each other. They also find that generating variables that d-separate the observed variable from the target variable is useful for improving conditional inference while generating irrelevant variables is not. The article discusses the methods used to generate the training data and the estimators used to estimate the conditional probabilities. The results show that chain-of-thought reasoning improves estimation because it can chain together local statistical dependencies that are frequently observed in training. The article concludes that the statistical structure of training data drives the effectiveness of