In [1]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone ,ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings

  from tqdm.autonotebook import tqdm


In [2]:
# load_dotenv() copies entries from a local .env file into os.environ, so no
# manual re-assignment is needed. Fail early with a readable message when the
# key is absent instead of the opaque TypeError that
# `os.environ[...] = None` would raise.
load_dotenv()
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or environment."
    )

In [3]:
# Pinecone client; the API key is read from the environment rather than hardcoded.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [8]:
# Create the serverless index only when it does not exist yet, so this cell
# can be re-run (e.g. Restart Kernel -> Run All) without failing because an
# index with the same name is already present.
if "ismgpt" not in pc.list_indexes().names():
    pc.create_index(
        name="ismgpt",
        dimension=768,  # must equal the vector size produced by the embedding model
        metric="cosine",  # similarity metric used when querying the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [4]:
# Handle to the "ismgpt" index for direct Pinecone operations.
index = pc.Index(name="ismgpt")

In [36]:
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

# Web pages to fetch as raw HTML.
urls = [
    "https://people.iitism.ac.in/~dsw/counselling.html",
    "https://people.iitism.ac.in/~dsw/sa.html",
]

# Download every page listed in `urls`.
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|##########| 2/2 [00:00<00:00, 47.48it/s]


In [37]:
# Keep only the content of <div> tags from the fetched HTML documents.
# NOTE(review): `docs_transformed` is not referenced by any later cell shown
# here, and `docs` is rebound by the PDF loading cell below -- confirm whether
# the web pages are meant to be chunked and indexed as well.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(
    docs,
    tags_to_extract=["div"],
)

In [8]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Load the PDFs found under the local "docs/" directory.
# This rebinds both `loader` and `docs` from the HTML loading cell.
loader = PyPDFDirectoryLoader(path="docs/")
docs = loader.load()

In [9]:
def create_chunks(doc_to_chunk, chunk_size=800, chunk_overlap=100):
    """Split documents into overlapping chunks with a recursive character splitter.

    Args:
        doc_to_chunk: The documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 800.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 100.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure chunk length in characters
    )
    return text_splitter.split_documents(doc_to_chunk)
# Chunk whatever `docs` currently holds (the PDF documents when cells run top to bottom).
chunks = create_chunks(doc_to_chunk=docs)

In [10]:
# Embedding client used to vectorise the chunks before they are stored.
# NOTE(review): the Pinecone index is created with dimension=768, which must
# match this model's output vector size -- confirm if the model is changed.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [11]:
# Embed every chunk and store the vectors in the "ismgpt" index.
# NOTE(review): re-running this cell sends the same chunks to the index again.
vectorstore = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name="ismgpt",
)

In [13]:
# The number of results is configured through `search_kwargs`; a bare `k=...`
# keyword is not a documented `as_retriever` option, so the requested count
# has to be passed as search_kwargs={"k": ...} to take effect.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Smoke test: display the documents retrieved for a sample query.
retriever.invoke("nearby Holidays")

[Document(metadata={'page': 1.0, 'source': 'docs\\holiday.pdf'}, page_content="LIST OF RESTRICTED  HOLIDAYS -2024  (Any two can be availed)  \n \n \nSl \nNo. Restricted  Holidays  Date Day of  Week  \n1.  New Year 's Day  01.01.2024  Monday  \n2.  Lohri  13.01.2024  Saturday  \n3.  Makar Sankranti  14.01.2024  Sunday  \n4.  Magha Bihu / Pongal  15.01.2024  Monday  \n5.  Guru Gobind Singh’s Birthday  17.01.2024  Wednesday  \n6.  Hazrat  Ali’s Birthday  25.01.2024  Thursday  \n7.  Shivaji  Jayanti  19.02.2024  Monday  \n8.  Guru Ravi Das’s Birthday  24.02.2024  Saturday  \n9.  Birthday of Swami  Dayananda  Saraswati  06.03.2024  Wednesday  \n10.  Holika  Dahan  24.03.2024  Sunday  \n11.  Dolyatra  25.03.2024  Monday  \n12.  Easter Sunday  31.03.2024  Sunday  \n13.  Jamat -Ul-Vida 05.04.2024  Friday  \n14.  Chaitra  Sukladi/Gudi  Padava/Ugadi/  Cheti  Chand  09.04.2024  Tuesday"),
 Document(metadata={'page': 1.0, 'source': 'docs\\holiday.pdf'}, page_content='14.  Chaitra  Sukladi/Gudi  Pa