In [3]:
%pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (pyproject.toml): started
  Building wheel for wikipedia (pyproject.toml): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11786 sha256=2ab182e0c3a4ff3da174a47a5892615c021efce308979b2ee81e48aa0eb91bd8
  Stored in directory: c:\users\christian calso\appdata\local\pip\cache\wheels\63\47\7c\a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Note: you may need to restart the kernel to use updated packages.

In [4]:
import wikipedia

In [None]:
from decouple import config
from openai import OpenAI

# Credentials and endpoints are looked up by name through decouple's `config`
# instead of being written into the notebook.
# NOTE(review): presumably resolved from environment variables or a local
# settings file — confirm where `config` reads from in this setup.
OPEN_AI_API_KEY = config('OPEN_AI_KEY')  # key later passed to the OpenAI embeddings client
UPSTASH_VECTOR_URL = config('UPSTASH_VECTOR_ENDPOINT')  # passed as `index_url` to the Upstash store
UPSTASH_VECTOR_TOKEN = config('UPSTASH_VECTOR_TOKEN')  # passed as `index_token` to the Upstash store


In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used to vectorise text: the "text-embedding-3-small" model,
# authenticated with the OpenAI key loaded in the configuration cell.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPEN_AI_API_KEY,
    model="text-embedding-3-small",
)

In [18]:
from langchain_community.vectorstores import UpstashVectorStore

# Vector store handle for the Upstash index addressed by the endpoint/token
# pair; `embeddings` is supplied as the store's embedding object.
store = UpstashVectorStore(
    index_token=UPSTASH_VECTOR_TOKEN,
    index_url=UPSTASH_VECTOR_URL,
    embedding=embeddings,
)

In [26]:
from langchain_core.documents import Document

In [31]:
# Build one Document per city from the Wikipedia article each query resolves to.
# `documents` is reset here so re-running the cell does not accumulate entries.
documents = []
cities = [
    "Manila, Metro Manila",
    "General Trias City, Cavite",
    "Cainta City, Rizal",
    "Mandaluyong City, Metro Manila",
]

for city in cities:
    try:
        wiki_page = wikipedia.page(city)
    except wikipedia.exceptions.DisambiguationError as err:
        # The query matched several articles: report a few candidates and keep
        # going, so one ambiguous name does not abort the whole ingest.
        print(f"Skipping {city!r}: ambiguous title, candidates: {err.options[:5]}")
        continue
    except wikipedia.exceptions.PageError:
        # No article could be resolved for this query.
        print(f"Skipping {city!r}: no matching Wikipedia page")
        continue

    doc = Document(
        page_content=wiki_page.content,
        metadata={
            # `url` is already a string, so it is stored directly.
            "source": wiki_page.url,
            # The label is the search query as typed, not the resolved article title.
            "title": city,
        },
    )
    documents.append(doc)

In [33]:
# tiktoken backs the token counting of the `TokenTextSplitter.from_tiktoken_encoder` splitter configured below.
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [34]:
# Configure a token-based text splitter backed by tiktoken

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TokenTextSplitter

# Model name handed to the splitter to select its tiktoken encoding.
# NOTE(review): this is a chat model name, while the embeddings cell uses
# "text-embedding-3-small" — confirm counting tokens with this model is intended.
OpenAI_LLM_Model = "gpt-4o"

# 100-token chunks with no overlap between consecutive chunks.
text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=100,
    model_name=OpenAI_LLM_Model,
)

In [35]:
# Break every Wikipedia document into chunks using the splitter configured above.
docs = text_splitter.split_documents(documents)

In [36]:
# Derive a stable id for each chunk from its source URL and its position in
# `docs`. With explicit ids, re-running this cell writes to the same ids again
# instead of adding a second copy of the corpus under newly generated ids.
import hashlib

chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source', '')}#{position}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(docs)
]

# `inserted_vectors` still receives whatever `add_documents` returns for the upload.
inserted_vectors = store.add_documents(docs, ids=chunk_ids)

In [38]:
# Fetch the five chunks closest to the query, each paired with its similarity score.
result = store.similarity_search_with_score(
    query="City with huge traffic",
    k=5,
)

# One line per hit: the chunk's metadata followed by its score.
for doc, score in result:
    print(f"{doc.metadata} - {score}")

{'source': 'https://en.wikipedia.org/wiki/Manila', 'title': 'Manila, Metro Manila'} - 0.74641985
{'source': 'https://en.wikipedia.org/wiki/Mandaluyong', 'title': 'Mandaluyong City, Metro Manila'} - 0.7098517
{'source': 'https://en.wikipedia.org/wiki/Manila', 'title': 'Manila, Metro Manila'} - 0.70499575
{'source': 'https://en.wikipedia.org/wiki/Manila', 'title': 'Manila, Metro Manila'} - 0.6937239
{'source': 'https://en.wikipedia.org/wiki/Manila', 'title': 'Manila, Metro Manila'} - 0.68797946
