In [1]:
import os
import ast
import openai
import tiktoken
from nltk.tokenize import sent_tokenize
from Nature.NatureScraper import NatureScraper

In [None]:
# Create the scraper and ask it for article links matching "microbiomics"
# (the argument's exact meaning is defined by NatureScraper — presumably a
# search term or subject; verify against that class).
scraper = NatureScraper()
links = scraper.get_article_links("microbiomics")
# NOTE(review): the driver is closed here, yet the following cell calls
# scraper.get_url(...) on this same object. Confirm that NatureScraper
# re-opens its driver on demand; otherwise that later call will fail.
scraper.close_driver()

In [None]:
# Open the first article so the user can log in by hand in the browser window.
# links[0] raises IndexError if no links were collected.
scraper.get_url(links[0])
# Block until the user confirms that the manual login is finished.
input("Enter credentials manually on Chromium, then press Enter to continue...")
# Fetch the content for every link. Presumably returns one text per link, in
# the same order as `links` (a later cell zips the two together) — verify
# against NatureScraper.
articles = scraper.get_full_articles(links)
scraper.close_driver()

In [None]:
# Pair each article link with the content scraped for it. zip() stops at the
# shorter of the two sequences, so any surplus links/articles are ignored.
method_dict = {link: article for link, article in zip(links, articles)}

In [2]:
# Take the OpenAI API key from the environment instead of hard-coding it in
# the notebook; raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:
# System message for the chat model: describes the product-extraction task and
# shows one worked example. The required answer format is a Python-style list
# of strings because the reply is later parsed with ast.literal_eval.
system_prompt = """You are an expert Product Recognition system.
Your task is to analyze a given text and accurately identify and extract all of the mentioned products.
If a product's brand is specified, ensure to include the brand name along with the product.
Here is an example of the output format for a list of strings.

Text: I started my day with a fresh cup of coffee (Folgers) brewed from my coffee maker (Keurig), and then I prepared breakfast using my new KitchenAid stand mixer. After breakfast, I grabbed my running shoes (Nike) and went for a jog in the park. On my way back, I stopped by the store to pick up a pack of Tide laundry detergent and a bottle of Coca-Cola. After getting home, I started working on my laptop, which runs on a Microsoft operating system.
Answer: ["Folgers coffee", "Keurig coffee maker", "KitchenAid stand mixer", "Nike running shoes", "Tide laundry detergent", "Coca-Cola", "Laptop", "Microsoft operating system"]

Only use this output format. Do not return anything besides this output format.
Return all of the mentioned products in the order they occur in the input text.
"""

# User message template. The single `{}` placeholder is filled with the text
# to analyse via str.format() inside product_recognition().
user_prompt = """Q: Given the text below, identify and extract all mentioned products.

Text: {}
Answer:
"""

In [4]:
def product_recognition(input_text):
    """Ask the chat model which products are mentioned in ``input_text``.

    The module-level ``system_prompt`` is sent as the system message and
    ``user_prompt`` (with ``input_text`` substituted into its placeholder)
    as the user message.

    Args:
        input_text: The text to analyse.

    Returns:
        The raw ``content`` of the first choice in the response. The prompt
        asks for a list literal, but nothing here validates or parses it.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt.format(input_text)},
    ]
    # Sampling settings: top_p and temperature are set to a vanishingly small
    # positive value — presumably to make replies as repeatable as possible.
    request_options = {
        "model": "gpt-4-turbo",
        "presence_penalty": 0,
        "top_p": 1e-16,
        "temperature": 1e-16,
    }
    completion = openai.chat.completions.create(
        messages=conversation, **request_options
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [5]:
# Tokenizer for the same model name that product_recognition() requests; used
# in later cells to count the tokens of the prompts and of each article.
enc = tiktoken.encoding_for_model("gpt-4-turbo")

In [6]:
# Combined token count of the two fixed prompt strings (the user template is
# counted with its unfilled `{}` placeholder). Subtracted from the per-request
# token budget when deciding whether and how to split an article.
prefix_lengths = sum(len(enc.encode(prompt)) for prompt in (user_prompt, system_prompt))

In [7]:
def split_text(text, max_tokens, prefix_length, model_name):
    """Split ``text`` into sentence-aligned chunks that fit a token budget.

    Sentences (as produced by ``sent_tokenize``) are packed greedily, in
    order, into chunks whose summed token count stays below
    ``max_tokens - prefix_length``. No sentence is ever dropped or cut: a
    single sentence that is larger than the budget on its own is emitted as
    its own (oversized) chunk.

    Args:
        text: The text to split.
        max_tokens: Total token budget per request.
        prefix_length: Tokens already taken by the fixed prompt text; this is
            subtracted from ``max_tokens`` to get the budget for each chunk.
        model_name: Model name passed to ``tiktoken.encoding_for_model`` to
            pick the tokenizer.

    Returns:
        A list of chunk strings, each being its sentences joined by a single
        space. Empty when ``text`` contains no sentences.
    """
    max_len = max_tokens - prefix_length
    encoder = tiktoken.encoding_for_model(model_name)

    chunk_list = []
    current_chunk = []
    current_length = 0
    for sentence in sent_tokenize(text):
        # Token counts are summed per sentence; the joined chunk may tokenize
        # slightly differently, so the budget is an approximation.
        sentence_length = len(encoder.encode(sentence))
        # Close the running chunk *before* adding a sentence that would reach
        # the budget. The `current_chunk` guard avoids emitting an empty chunk
        # when the very first sentence is already too large.
        if current_chunk and current_length + sentence_length >= max_len:
            chunk_list.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        # The sentence always lands in a chunk — it starts the new one when
        # the previous chunk was just closed.
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunk_list.append(" ".join(current_chunk))
    return chunk_list

In [None]:
# Run product extraction for every article and collect the parsed results per
# link. NOTE(review): ast.literal_eval raises if a reply is not a valid Python
# literal, which aborts the loop at that article.
product_dict = {}
for link, article_text in method_dict.items():
    # Article fits into the 2000-token budget together with the prompts:
    # send it in a single request and store the parsed reply as-is.
    if len(enc.encode(article_text)) <= 2000 - prefix_lengths:
        product_dict[link] = ast.literal_eval(product_recognition(article_text))
        continue

    # Otherwise split on sentence boundaries and merge the per-chunk results
    # in chunk order.
    collected = []
    for piece in split_text(article_text, 2000, prefix_lengths, 'gpt-4-turbo'):
        collected.extend(ast.literal_eval(product_recognition(piece)))
    product_dict[link] = collected

In [None]:
# Extraction results obtained with gpt-4-turbo, one line per article link.
for link, products in product_dict.items():
    print(f"{link}: {products}")