In [17]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import re  # for cutting <ref> links out of Wikipedia articles
from pathlib import Path  # for creating the output directory before saving

import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import requests
import tiktoken  # for counting tokens
from bs4 import BeautifulSoup
from scipy import spatial  # for calculating vector similarities for search

# OpenAI model identifiers used throughout the notebook
GPT_MODEL = "gpt-3.5-turbo"  # chat model name
EMBEDDING_MODEL = "text-embedding-ada-002"  # model passed to the embeddings endpoint

## 1. Collect Documents
In this experiment, we'll download smartphone reviews from [Tom's Guide](https://www.tomsguide.com).

In [5]:
# Accumulator for the scraped review paragraphs (filled by the scraping cell).
phone_strings = list()

# URL slugs of the reviews to download; each one is appended to the
# review base URL when the page is fetched.
phone_list = [
    "samsung-galaxy-s23-ultra",
    "samsung-galaxy-s23",
    "iphone-14-plus",
    "iphone-14-pro",
    "iphone-14-pro-max",
    "iphone-13-pro-max",
]

In [6]:
# Download every review page and collect its body paragraphs in phone_strings.
# Each stored string is "<phone slug>\n<paragraph text>", so every chunk
# carries the name of the phone it was taken from.
phone_strings.clear()  # start fresh so re-running this cell does not append duplicates
for phone in phone_list:
    url = 'https://www.tomsguide.com/reviews/' + phone
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Paragraphs are taken only from <div id="article-body"> containers.
    for section in soup.find_all('div', {"id": "article-body"}):
        for item in section.find_all('p'):
            paragraph = item.get_text()
            if paragraph.strip():  # skip empty / whitespace-only paragraphs
                phone_strings.append(phone + '\n' + paragraph)

## 2. Calculate embeddings

In [14]:
# calculate embeddings
# Sends phone_strings to the embeddings endpoint in batches and builds `df`
# with one row per text chunk: columns "text" and "embedding".
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

embeddings = []
for batch_start in range(0, len(phone_strings), BATCH_SIZE):
    # Clamp to the list length so the progress line reports the real last
    # index when the final batch is shorter than BATCH_SIZE.
    batch_end = min(batch_start + BATCH_SIZE, len(phone_strings))
    batch = phone_strings[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    # One embedding per input is required; otherwise rows would be misaligned.
    assert len(response["data"]) == len(batch)
    batch_embeddings = []
    for i, be in enumerate(response["data"]):
        assert i == be["index"]  # double check embeddings are in same order as input
        batch_embeddings.append(be["embedding"])
    embeddings.extend(batch_embeddings)

df = pd.DataFrame({"text": phone_strings, "embedding": embeddings})

Batch 0 to 999


In [16]:
# save document chunks and embeddings

SAVE_PATH = "data/phone_review_test.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing (no-op when it is already there).
Path(SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(SAVE_PATH, index=False)
