In [23]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone ,ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [36]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is missing: os.getenv() returns
# None in that case, and assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType".
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
# Presumably read later by the Google embeddings client -- TODO confirm.
os.environ["GOOGLE_API_KEY"] = google_api_key

In [17]:
# Pinecone client, authenticated with the key held in the PINECONE_API_KEY
# environment variable (never hard-code the key in the notebook).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [18]:
# Create the serverless index only if it does not exist yet, so this cell can be
# re-run (e.g. Restart Kernel -> Run All) without failing because the index
# "ismgpt" was already created by an earlier run.
if "ismgpt" not in pc.list_indexes().names():
    pc.create_index(
        name="ismgpt",
        dimension=768,  # must equal the vector size produced by the embedding model
        metric="cosine",  # similarity metric used when querying the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [20]:
# Handle to the "ismgpt" index for direct Pinecone operations.
# NOTE(review): no later visible cell uses `index`; the vector store below
# addresses the index by name instead -- confirm whether this is still needed.
index = pc.Index(name="ismgpt")

In [28]:
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

# Source pages to ingest; extend this list to index more URLs.
urls = [
    "https://en.wikipedia.org/wiki/IIT_(ISM)_Dhanbad",
]

# Fetch the raw HTML of every URL (network access required).
loader = AsyncHtmlLoader(web_path=urls)
docs = loader.load()

Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.20it/s]


In [29]:
# Transform
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs, tags_to_extract=["h1", "h2", "h3", "p","span"])

In [31]:
def create_chunks(doc_to_chunk, chunk_size=800, chunk_overlap=100):
    """Split documents into overlapping character-based chunks.

    Args:
        doc_to_chunk: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 800, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between consecutive chunks.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        Whatever ``split_documents`` returns for the given input (the chunked
        documents).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(doc_to_chunk)


# Chunk the cleaned page text with the default sizes (800 chars, 100 overlap).
chunks = create_chunks(docs_transformed)

In [37]:
# Google Generative AI embedding client used to vectorise the chunks.
# NOTE(review): the Pinecone index was created with dimension=768; this model's
# output size must match that -- confirm if the model name is ever changed.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [None]:
# Embed every chunk and write the resulting vectors into the "ismgpt" index.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# rather than replacing them -- verify before repeated runs.
vectorstore = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name="ismgpt",
)