## Load data and split into chunks

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://en.wikipedia.org/wiki/Cancer")
docs = loader.load()
text_chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(docs)
text_chunks[:5]

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Cancer', 'title': 'Cancer - Wikipedia', 'language': 'en'}, page_content='Cancer - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1\nEtymology and definitions\n\n\n\n

## Initialize pinecone vector database

In [2]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

embeddings = OpenAIEmbeddings()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

pc = Pinecone(api_key=PINECONE_API_KEY)

result = embeddings.embed_query("How are you!")
result
print('''*********Example result:\n''', result)

  from tqdm.autonotebook import tqdm


*********Example result:
 [-0.004707565065473318, -0.009346039965748787, -0.003011711174622178, -0.026028219610452652, -0.01612945646047592, 0.012071968987584114, 0.0011344950180500746, -0.013529147021472454, -0.01943323202431202, -0.00337914633564651, 0.028264233842492104, 0.006764573510736227, -0.013893441297113895, -0.006437964737415314, 0.007549691013991833, -0.022762130945920944, 0.02746027335524559, -0.002361633814871311, 0.018729766830801964, -0.02713366597890854, 0.0021339496597647667, 0.021870236843824387, 0.008636293932795525, -0.006934158969670534, -0.00015839748084545135, 0.0002435827482258901, 0.011211480014026165, -0.0054958234541118145, 0.022485768422484398, -0.02778688259422779, 0.011713954620063305, -0.000575491227209568, 0.0007674524676986039, 0.002722787903621793, 0.010979085229337215, -0.02303849160671234, 0.0006740234675817192, 0.007788366638123989, 0.032736264169216156, -0.0007003249484114349, 0.02555086836218834, -0.01423261221498251, 0.0027981591410934925, -0.00

## Create an index

In [7]:
# To empty the index. DONT RUN THIS CELL IF YOU WANT TO KEEP THE INDEX
import time

index_name = "cancer-wiki" # put in the name of your pinecone index here. When creating the index in pinecone.io, the Dimensions have to be the same as the result length. 

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)

## If you already have an index, you can load it like this

In [None]:
# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)

# docsearch

## (DON'T RUN) Create Embeddings for each of the Text Chunk

In [10]:
# insert chunks into index. DONT RUN THIS CELL IF YOU DONT WANT TO INSERT THE CHUNKS INTO THE INDEX
from langchain_pinecone import PineconeVectorStore

# embeddings is from openai. This uploads the embeddings to your index in pinecone.io. I guess Pinecone automatically uses the API key from .env file. This keeps adding embeddings to the index, so if you keep running this, there will be duplicates.
docsearch = PineconeVectorStore.from_documents(
    text_chunks, embeddings, index_name=index_name
)
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x247292160c0>

## Test the pinecone vector database


In [11]:
query = "What causes cancer?"
docs= docsearch.similarity_search(query, k=4)
docs

[Document(id='d8fa95a9-5bf3-4404-a4ea-a079da7d8efa', metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Cancer', 'title': 'Cancer - Wikipedia'}, page_content='Some environments make errors more likely to arise and propagate. Such environments can include the presence of disruptive substances called carcinogens, repeated physical injury, heat, ionising radiation, or hypoxia.[98]\nThe errors that cause cancer are self-amplifying and compounding, for example:'),
 Document(id='0cffe2e5-d015-44f9-81a8-c809b15f72fc', metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Cancer', 'title': 'Cancer - Wikipedia'}, page_content='Causes\nMain article: Causes of cancer\nThe GHS Hazard pictogram for carcinogenic substances\nShare of cancer deaths attributed to tobacco in 2016.[42]\nThe majority of cancers, some 90–95% of cases, are due to genetic mutations from environmental and lifestyle factors.[3] The remaining 5–10% are due to inherited genetics.[3] Environmental