In [14]:
from langchain_community.document_loaders import DirectoryLoader,PyMuPDFLoader

# Load every PDF found (recursively) under the papers directory,
# using PyMuPDFLoader for each matched file.
loader = DirectoryLoader(
    "../Single-Source/papers",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
)
docs = loader.load()

print(f"Loaded {len(docs)} documents")

Loaded 115 documents


In [15]:
## Read all the documents from the directory

import os
from dotenv import load_dotenv
load_dotenv()
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


def read_all_docs(directory="../Single-Source/papers"):
    """Load every PDF found recursively under ``directory``.

    Each PDF is opened with ``PyMuPDFLoader``; every document the loader
    returns is tagged with two metadata entries and collected into one list.

    Args:
        directory: Folder to search (recursively) for ``*.pdf`` files.

    Returns:
        A flat list of the loaded documents, in sorted file-path order.
        ``metadata["source"]`` holds the file path as a ``str`` and
        ``metadata["file_name"]`` holds the bare file name.

    Notes:
        Loading is best-effort: an error raised while processing one file is
        printed and the remaining files are still processed.
    """
    all_documents = []
    pdf_dir = Path(directory)
    # glob() yields paths in filesystem order, which differs between runs and
    # platforms; sorting keeps the document (and later chunk) order stable.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    for pdf_file in pdf_files:
        source = str(pdf_file)
        try:
            loader = PyMuPDFLoader(source)
            docs = loader.load()
            for doc in docs:
                # Store the path as plain text, not a Path object: metadata
                # must stay serializable and vector stores generally only
                # accept primitive metadata values.
                doc.metadata["source"] = source
                doc.metadata["file_name"] = pdf_file.name
                all_documents.append(doc)
        except Exception as e:
            print(f"Error reading {pdf_file}: {e}")
    return all_documents

In [16]:
# Load the PDFs from the default papers directory and report how many
# documents came back.
all_documents = read_all_docs()
print(f"\nRead {len(all_documents)} documents")


Read 115 documents


In [17]:
## Split the documents into chunks

def make_chunks(document, chunk_size=1000, chunk_overlap=200, verbose=True):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        document: The documents to split. The value is forwarded unchanged to
            ``split_documents`` (the notebook passes the list returned by
            ``read_all_docs``).
        chunk_size: ``chunk_size`` handed to the splitter.
        chunk_overlap: ``chunk_overlap`` handed to the splitter.
        verbose: When true (the default), print the chunk count and the full
            text of the first and last chunk. Pass ``False`` to split
            silently, e.g. to keep the notebook output short.

    Returns:
        The chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(document)
    # Nothing is printed for an empty result, so chunks[0] / chunks[-1] are
    # only touched when they exist.
    if verbose and chunks:
        print(f"Document split into {len(chunks)} chunks")
        print(f"First chunk: {chunks[0].page_content}")
        print(f"Last chunk: {chunks[-1].page_content}")
    return chunks

In [18]:
# Chunk every loaded document using the default chunk size and overlap.
chunkss = make_chunks(all_documents)

Document split into 587 chunks
First chunk: Emerging Properties in Self-Supervised Vision Transformers
Mathilde Caron1,2
Hugo Touvron1,3
Ishan Misra1
Herv´e Jegou1
Julien Mairal2
Piotr Bojanowski1
Armand Joulin1
1 Facebook AI Research
2 Inria∗
3 Sorbonne University
Figure 1: Self-attention from a Vision Transformer with 8 × 8 patches trained with no supervision. We look at the self-attention of
the [CLS] token on the heads of the last layer. This token is not attached to any label nor supervision. These maps show that the model
automatically learns class-speciﬁc features leading to unsupervised object segmentations.
Abstract
In this paper, we question if self-supervised learning pro-
vides new properties to Vision Transformer (ViT) [19] that
stand out compared to convolutional networks (convnets).
Beyond the fact that adapting self-supervised methods to this
architecture works particularly well, we make the follow-
ing observations: ﬁrst, self-supervised ViT features contain
Last chunk

In [19]:
## Let's make the embeddings
import numpy as np
from sentence_transformers import SentenceTransformer
from  typing import List,Dict,Any,Tuple
import chromadb
import uuid
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
class EmbeddingManager:
    """Wrap a ``SentenceTransformer`` model and turn texts into embeddings.

    The model named by ``model_name`` is loaded eagerly in ``__init__``. A
    load failure is reported on stdout and then re-raised, so a constructed
    instance always holds a model.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Identifier passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the ``SentenceTransformer`` for ``self.model_name``.

        Prints a success or failure message; a failure is re-raised unchanged.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def get_embeddings(self, text: List[str], show_progress_bar: bool = True):
        """Generate embeddings for the given texts.

        Args:
            text: List of text strings to embed.
            show_progress_bar: Forwarded to ``encode``; defaults to ``True``,
                which is what this method always used before the switch
                existed.

        Returns:
            The value returned by the model's ``encode`` call, expected to be
            a numpy array of shape ``(num_texts, embedding_dim)``. An empty
            list or tuple yields an empty float32 array of shape
            ``(0, embedding_dim)`` without calling the model.

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded. Please load the model first")
        # Short-circuit empty batches so callers always get a 2-D array they
        # can stack or index by column. Only list/tuple inputs are checked;
        # any other value is handed to encode() exactly as before.
        if isinstance(text, (list, tuple)) and not text:
            embedding_dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, embedding_dim), dtype=np.float32)
        embeddings = self.model.encode(text, show_progress_bar=show_progress_bar)
        return embeddings


In [35]:

# Build the manager with its default model name; the bare name on the last
# line makes the notebook display the instance.
embedding_manager = EmbeddingManager()
embedding_manager

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 353.94it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model all-MiniLM-L6-v2 loaded successfully


<__main__.EmbeddingManager at 0x1f94f9dc7d0>