In [2]:
!pip install requests
!pip install beautifulsoup4
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting loguru>=0.5.0 (from pinecone-client)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dnspython>=2.0.0 (from pinecone-client)
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: loguru, dnspython, pinecone-client
Successfully installed dnspython-2.4.2 loguru-0.7.2 pinecone-client-2.2.4


In [5]:
import requests
from bs4 import BeautifulSoup
import pinecone
import json

pinecone.init(api_key="39e7f82b-83bf-4797-9281-0b76cb1e5b56", environment="us-west4-gcp-free")

index = pinecone.Index("nychackathon")
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 19}},
 'total_vector_count': 19}

In [6]:
def upsert_to_pinecone(id, embedding, metadata):
    try:
        index.upsert([(id, embedding, metadata)])
        print(f"Upserted data for ID: {id} with metadata.")
    except Exception as e:
        print(f"An error occurred while upserting to Pinecone: {e}")

def scrape_webpage(url):
    page_url = ""
    page_title = ""
    page_text = ""

    try:
        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            page_url = url
            title_tag = soup.title
            if title_tag:
                page_title = title_tag.string

            page_text = soup.get_text()

            return page_url, page_title, page_text

        else:
            print("Failed to fetch the webpage.")
            return None, None, None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None, None

url = "https://otda.ny.gov/programs/snap/"
page_url, page_title, page_text = scrape_webpage(url)

def get_openai_embedding(text):
    api_url = "https://api.openai.com/v1/embeddings"
    headers = {
        "Authorization": "Bearer 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",
        "Content-Type": "application/json"
    }
    data = {
        "input": text,
        "model": "text-embedding-ada-002",
        "encoding_format": "float"
    }
    try:
        response = requests.post(api_url, headers=headers, json=data)
        if response.status_code == 200:
            embedding = json.loads(response.text)["data"][0]["embedding"]
            return embedding
        else:
            print(f"Failed to get embedding: {response.text}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

url = "https://www.nyc.gov/content/tenantprotection/pages/covid19-home-quarantine"
page_url, page_title, page_text = scrape_webpage(url)

if page_text:
    truncated_text = page_text[:1500]
    embedding = get_openai_embedding(truncated_text)
    if embedding:

        metadata = {
            "page_title": page_title,
            "page_text": page_text[:1500],
            "page_url": page_url
        }

        upsert_to_pinecone(page_url, embedding, metadata)
        print(f"Generated Embedding: {embedding}")


Upserted data for ID: https://www.nyc.gov/content/tenantprotection/pages/covid19-home-quarantine with metadata.
Generated Embedding: [0.0008428943, -0.009899382, -0.0013156554, -0.03004136, 0.009986867, 0.026986077, -0.0013930469, 0.0029442417, -0.04463134, -0.029206878, 0.028830014, 0.002612804, -0.007416124, 0.015074518, -0.027147591, -0.0015192287, 0.026084298, -0.04040509, 0.0006582483, -0.015868621, -0.0076180147, 0.01237591, -0.02409231, 0.0044550584, 0.009603275, -0.0032420307, 0.028210882, -0.022113778, 0.0025757907, -0.029745253, 0.011406833, -0.005760619, -0.027268725, -0.03246405, 0.0069248565, 0.015572514, 0.013836253, -0.0007575113, 0.007631474, -0.036717217, 0.033190858, -0.015626352, 0.0025404599, 0.0064806966, -0.0056394846, 0.013641092, 0.0056159305, -0.033002425, -0.018816227, 0.005215514, 0.0072075035, 0.02165616, 0.00039305625, -0.015532137, 0.020189086, -0.023055935, -0.0335408, 0.0063696564, -0.031871837, -0.03289475, -0.005535174, -0.015976297, -0.010605999, 0.00