## Load data and split into chunks

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

# Load the page at this URL through the web loader.
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Cancer")
docs = loader.load()

# Split the loaded documents with chunk_size=1000 and chunk_overlap=200 so that
# neighbouring chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_chunks = text_splitter.split_documents(docs)



## Initialize the Pinecone vector database

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# No credentials are passed explicitly, so the OpenAI key presumably comes from
# the environment loaded above — TODO confirm for your setup.
embeddings = OpenAIEmbeddings()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    # Fail here with an actionable message rather than a less obvious
    # configuration error when the client is first used.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

pc = Pinecone(api_key=PINECONE_API_KEY)

# Smoke test: embed one query. The vector length is what matters, because the
# Pinecone index must be created with exactly this dimension. Only the length
# and a short preview are printed; dumping the whole vector floods the output.
result = embeddings.embed_query("How are you!")
print('''*********Example result:\n''', f"dimension={len(result)}", result[:5])

## Create an index

In [7]:
# WARNING: this cell deletes the index (if it exists) and re-creates it empty.
# DON'T RUN THIS CELL IF YOU WANT TO KEEP THE INDEX.
import time

# Put the name of your Pinecone index here.
index_name = "cancer-wiki"

# The index dimension must equal the length of the embedding vector
# (`len(result)` from the embedding smoke test earlier in the notebook).
EMBEDDING_DIMENSION = 1536

# Upper bound on how long to wait for the new index to become ready, so a
# stuck index cannot hang the kernel forever.
INDEX_READY_TIMEOUT_S = 300

# Start from a clean slate: drop any existing index with this name.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=EMBEDDING_DIMENSION,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Poll once per second until the index reports ready, or give up at the deadline.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds."
        )
    time.sleep(1)

index = pc.Index(index_name)

## If you already have an index, you can load it like this

In [None]:
# Alternative to the insert cell further down: attach to an index that already
# exists. Uncomment these lines to define `docsearch` (which the query cell
# needs) when you skip the insert cell.
# NOTE(review): presumably this does not upload anything — confirm.
# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)

# docsearch

## (DON'T RUN) Create embeddings for each text chunk

In [10]:
# Insert the chunks into the index.
# DON'T RUN THIS CELL IF YOU DON'T WANT TO INSERT THE CHUNKS INTO THE INDEX.
from langchain_pinecone import PineconeVectorStore

# Embeds every chunk with the OpenAI `embeddings` object and uploads the vectors
# to the Pinecone index named `index_name`.
# NOTE(review): no API key is passed here, so the Pinecone key is presumably
# read from the environment (.env) — confirm.
# Each run adds the chunks again, so re-running this cell creates duplicates.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x247292160c0>

## Test the Pinecone vector database


In [11]:
# Sanity-check retrieval: ask one question and fetch the closest stored chunks.
# NOTE(review): this rebinds `docs`, which the load cell used for the raw web
# page documents.
query = "What causes cancer?"
top_k = 4
docs = docsearch.similarity_search(query, k=top_k)
docs

[Document(id='d8fa95a9-5bf3-4404-a4ea-a079da7d8efa', metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Cancer', 'title': 'Cancer - Wikipedia'}, page_content='Some environments make errors more likely to arise and propagate. Such environments can include the presence of disruptive substances called carcinogens, repeated physical injury, heat, ionising radiation, or hypoxia.[98]\nThe errors that cause cancer are self-amplifying and compounding, for example:'),
 Document(id='0cffe2e5-d015-44f9-81a8-c809b15f72fc', metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Cancer', 'title': 'Cancer - Wikipedia'}, page_content='Causes\nMain article: Causes of cancer\nThe GHS Hazard pictogram for carcinogenic substances\nShare of cancer deaths attributed to tobacco in 2016.[42]\nThe majority of cancers, some 90–95% of cases, are due to genetic mutations from environmental and lifestyle factors.[3] The remaining 5–10% are due to inherited genetics.[3] Environmental