In [5]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone ,ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [6]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is absent. Writing the result
# of os.getenv() back into os.environ adds nothing when the key is present and
# raises an opaque TypeError (None is not a str) when it is missing.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in your .env file or environment."
    )

In [7]:
# Pinecone client, authenticated with the key taken from the environment.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [8]:
# Create the serverless index only if it is not there yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an already-existing index.
if "ismgpt" not in pc.list_indexes().names():
    pc.create_index(
        name="ismgpt",
        dimension=768,  # must match the embedding model's vector size
        metric="cosine",  # replace with the metric suited to your model
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [9]:
# Handle to the "ismgpt" index obtained from the Pinecone client.
index = pc.Index(name="ismgpt")

In [36]:
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

# Pages to fetch and ingest.
urls = [
    "https://people.iitism.ac.in/~dsw/counselling.html",
    "https://people.iitism.ac.in/~dsw/sa.html",
]

# Download the raw HTML for every URL.
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|##########| 2/2 [00:00<00:00, 47.48it/s]


In [37]:
# Transform the fetched HTML, extracting only the content of <div> tags.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(
    docs,
    tags_to_extract=["div"],
)

In [38]:
def create_chunks(doc_to_chunk, chunk_size=800, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        doc_to_chunk: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
            Defaults to 800.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``. Defaults to 100.

    Returns:
        The result of ``split_documents`` for the given input.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(doc_to_chunk)


# Chunk the transformed pages using the default size and overlap.
chunks = create_chunks(docs_transformed)

In [39]:
# Embedding model used to vectorize the chunks.
# NOTE(review): its output size must agree with the index dimension — confirm.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [40]:
# Embed every chunk and store the vectors in the "ismgpt" index.
# NOTE(review): no explicit ids are supplied, so re-running this cell
# presumably inserts the same chunks again — confirm before re-executing.
vectorstore = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name="ismgpt",
)