In [1]:
import pandas as pd

In [3]:
data = pd.read_csv('../Dataset/all_v1_transpose.csv')
data.head()

Unnamed: 0,doc,id,original_text,reference_summary,title,uid,case_code,case_text,note,title_code,title_text,urls,tldr_code,tldr_text
0,Pokemon GO Terms of Service,5786730a6cca83a54c0035b7,welcome to the pokémon go video game services ...,hi.,,legalsum01,,,,,,,,
1,Pokemon GO Terms of Service,57866df76cca83a54c0035a1,by using our services you are agreeing to thes...,by playing this game you agree to these terms....,Agreement To Terms,legalsum02,,,,,,,,
2,Pokemon GO Terms of Service,5786730a6cca83a54c0035b6,if you want to use certain features of the ser...,you have to use google pokemon trainer club or...,Eligibility and Account Registration,legalsum03,,,,,,,,
3,Pokemon GO Terms of Service,57866df76cca83a54c0035a0,during game play please be aware of your surro...,don t die or hurt others and if you do it s no...,Safe Play,legalsum04,,,,,,,,
4,Pokemon GO Terms of Service,57866df76cca83a54c00359f,subject to your compliance with these terms ni...,don t copy modify resell distribute or reverse...,Rights in App,legalsum05,,,,,,,,


In [5]:
data['doc'].nunique()

59

In [6]:
data.shape

(446, 14)

## Data Ingestion

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loading pdf
def load_pdf(pdf_path):
    pdf_path = Path(pdf_path)
    print(f'Loading {pdf_path.name}...')

    # loading pdf
    loader = PyPDFLoader(str(pdf_path))
    pdf = loader.load()
    print(f'Found {pdf_path.name} with {len(pdf)} pages.')

    for page in pdf:
        page.metadata['source_file'] = pdf_path.name
        page.metadata['file_type'] = 'pdf'

    return pdf

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
# Chunking pdf
def split_doc(doc, chunk_size = 1000, chunk_overlap = 200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap, 
        length_function = len,
        separators=['\n\n', '\n', '', ' ']
    )
    split_doc = text_splitter.split_documents(doc)
    print(f"Split {len(doc)} documents into {len(split_doc)} chunks")    
    return split_doc

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
class EmbeddingManager:
    '''Handles doc embedding generation using SentenceTransformer'''
    def __init__(self, model_name = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        '''load SentenceTransformer model'''
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts):
        '''Generate Embeddings for a list of texts'''
        if not self.model:
            raise ValueError('Model not loaded')
        # print(f"Generating embeddings for {(texts)} ...")
        embeddings = self.model.encode(texts, show_progress_bar = True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

In [12]:
class VectorStore:
    '''Manage doc embeddings in ChromaDB vector store'''
    def __init__(self, collection_name = 'pdf', persist_directory = '../data/vector_store'):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        'Initialize chromadb client and collection'''
        try:
            # Creating persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok = True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"hnsw:space": "cosine", "description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents, embeddings):
        '''Add documents and their embeddings to the vector store'''
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        doc_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique IDs
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Doc content
            doc_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids = ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=doc_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

In [13]:
class RAGRetrieval:
    def __init__(self, vector_store, embedding_manager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query, top_k = 5, score_threshold = 0.0):
        '''Retrieve relevant docs for the query'''
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings(query)

        # search in vectorstore
        try:
            results = self.vector_store.collection.query(
                query_embeddings = [query_embedding.tolist()],
                n_results = top_k
            )
            retrieved_doc = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids,documents, metadatas, distances)):
                    # convert distance to similarity score. Chromadb uses cosine
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_doc.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                print(f"Retrieved {len(retrieved_doc)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_doc
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

In [14]:
from transformers import pipeline, LEDForConditionalGeneration, LEDTokenizer
from typing import List, Dict

In [17]:
class Generator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.chunk_summarizer = None
        self.final_summarizer = None
        self._load_pipelines()

    def _load_pipelines(self):
        try:
            print(f"Loading your fine-tuned model '{self.model.name_or_path}' into pipeline...")

            self.chunk_summarizer = pipeline(
                'summarization',
                model = self.model,
                tokenizer= self.tokenizer,
                device = -1
            )
            print("Successfully loaded the model.")
        except Exception as e:
            print(f"Error loading local model into pipeline: {e}")

        print("Loading final combination summarizer (bart-large-cnn)...")
        self.final_summarizer = pipeline(
            "summarization", 
            model="facebook/bart-large-cnn", 
            device=-1
        )
        print("All models loaded.")

    def summarize(self, retrieved_docs, max_chunk_len = 150, min_chunk_len=30, final_summary_len=512):
        chunk_summaries = []
        print(f"\nSummarizing {len(retrieved_docs)} retrieved chunks...")
        for doc in retrieved_docs:
            content = doc['content']
            
            # Use YOUR model to summarize the chunk
            chunk_summary = self.chunk_summarizer(
                content, 
                max_length=max_chunk_len, 
                min_length=min_chunk_len
            )
            chunk_summaries.append(chunk_summary[0]['summary_text'])
            
        print("Chunk summarization complete.")

        # --- Step 2: Combine the chunk summaries into one text ---
        combined_text = "\n\n".join(chunk_summaries)
        
        print("\nCombining summaries into final summary...")
        # --- Step 3: Summarize the combined text ---
        final_summary = self.final_summarizer(
            combined_text, 
            max_length=final_summary_len, 
            min_length=100
        )
        
        return final_summary[0]['summary_text']

In [18]:
pdf = load_pdf('../Dataset/Terms of Service Youtube.pdf')
chunks=split_doc(pdf)

embedding_manager = EmbeddingManager()

vectorstore = VectorStore()

# convert text to embedding
text = [doc.page_content for doc in chunks]

# Generate embeddings
embeddings = embedding_manager.generate_embeddings(text)

# store in vectordb
vectorstore.add_documents(chunks, embeddings)

rag_retriever=RAGRetrieval(vectorstore,embedding_manager)

model_name = "aarushi-211/TOS-Longformer" 
model = LEDForConditionalGeneration.from_pretrained(model_name)
tokenizer = LEDTokenizer.from_pretrained(model_name)
print("Model and Tokenizer loaded.")

generator = Generator(model=model, tokenizer=tokenizer)

query = "What are the most important user responsibilities, liabilities, and service termination clauses?"


print(f"\n--- Retrieving chunks for query: '{query}' ---")
retrieved_documents = rag_retriever.retrieve(query, top_k=5)

if retrieved_documents:
    print("\n--- Generating final summary ---")
    final_summary = generator.summarize(retrieved_documents)
    
    print("\n\n========== FINAL SUMMARY ==========")
    print(final_summary)
    print("===================================")
else:
    print("No relevant documents were retrieved.")

Loading Terms of Service Youtube.pdf...
Found Terms of Service Youtube.pdf with 16 pages.
Split 16 documents into 34 chunks
Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384
Vector store initialized. Collection: pdf
Existing documents in collection: 102


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.50it/s]


Generated embeddings with shape: (34, 384)
Adding 34 documents to vector store...
Successfully added 34 documents to vector store
Total documents in collection: 136


Device set to use cpu


Model and Tokenizer loaded.
Loading your fine-tuned model 'aarushi-211/TOS-Longformer' into pipeline...
Successfully loaded the model.
Loading final combination summarizer (bart-large-cnn)...


Device set to use cpu


All models loaded.

--- Retrieving chunks for query: 'What are the most important user responsibilities, liabilities, and service termination clauses?' ---
Retrieving documents for query: 'What are the most important user responsibilities, liabilities, and service termination clauses?'
Top K: 5, Score threshold: 0.0


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 52.24it/s]
Input ids are automatically padded from 361 to 1024 to be a multiple of `config.attention_window`: 1024


Generated embeddings with shape: (384,)
Retrieved 5 documents (after filtering)

--- Generating final summary ---

Summarizing 5 retrieved chunks...


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Chunk summarization complete.

Combining summaries into final summary...


users are not liable for anything bad that happens when using the service. we don t make warranties about anything bad. that happens. we may be involved in operating the service, so please contact us if you want to discuss how you would like to use it. We are happy to help you with any questions you may have about the service or how it works. Please contact us with questions about how to use the service and we will try to answer them as soon as possible. Back to Mail Online home.
