In [33]:
import os
import re
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [None]:
def get_pdf_files(directory):
    """Return the paths of all PDF files located directly inside ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        A sorted list of ``os.path.join(directory, name)`` paths, one for each
        regular file whose name ends in ``.pdf`` (compared case-insensitively).
    """
    pdf_paths = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process the files in the same order.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Accept ".PDF" as well as ".pdf", and skip directories that merely
        # carry a PDF-looking name.
        if name.lower().endswith('.pdf') and os.path.isfile(path):
            pdf_paths.append(path)
    return pdf_paths

: 

In [35]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and split it into chunks.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(chunk, page_num, pdf_path)`` tuples, where ``page_num`` is
        the zero-based index of the page the chunk came from. Pages that yield
        no text, or only whitespace, contribute no chunks.
    """
    reader = PdfReader(pdf_path)
    text_chunks = []
    for page_num, page in enumerate(reader.pages):
        # Fall back to an empty string so re.sub never receives a non-string
        # when a page produces no text.
        text = page.extract_text() or ''
        # Replace multiple whitespace characters with a single space
        text = re.sub(r'\s+', ' ', text)
        if not text.strip():
            # A whitespace-only page carries no content worth chunking.
            continue
        for chunk in split_text_into_chunks(text):
            text_chunks.append((chunk, page_num, pdf_path))
    return text_chunks

In [36]:
def split_text_into_chunks(text, chunk_size=1000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. Every chunk
        except possibly the last has exactly ``chunk_size`` characters. An
        empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative step would
            otherwise make ``range`` empty and silently drop all of the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

In [37]:
# Load the pre-trained 'all-MiniLM-L6-v2' SentenceTransformer once, at module
# level. generate_embeddings and get_top_k_similar_chunks both read it through
# the global name `model`, so this cell must run before either is called.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(text_chunks):
    """Embed every text chunk with the module-level ``model``.

    Args:
        text_chunks: Sequence of ``(text, page_num, pdf_path)`` tuples.

    Returns:
        A list of ``(embedding, text, page_num, pdf_path)`` tuples in the same
        order as ``text_chunks``; an empty list when there is nothing to embed.
    """
    if not text_chunks:
        # Nothing to encode: skip the model call entirely.
        return []
    texts = [chunk[0] for chunk in text_chunks]
    vectors = model.encode(texts)
    return [
        (vector, chunk[0], chunk[1], chunk[2])
        for vector, chunk in zip(vectors, text_chunks)
    ]

In [38]:
def store_embeddings_in_faiss(embeddings):
    """Build a flat L2 FAISS index from the embeddings and collect their metadata.

    Args:
        embeddings: Non-empty sequence of ``(embedding, text, page_num, pdf_path)``
            tuples whose embeddings all have the same length.

    Returns:
        ``(index, metadata)`` where ``index`` is a ``faiss.IndexFlatL2`` to which
        all vectors were added in input order, and ``metadata`` is the list of
        ``(text, page_num, pdf_path)`` tuples in that same order.

    Raises:
        ValueError: If ``embeddings`` is empty, because the vector dimension
            cannot be determined.
    """
    if len(embeddings) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of embeddings")
    # FAISS works on float32 matrices; convert explicitly instead of relying
    # on whatever dtype the embeddings happen to have.
    vectors = np.array([item[0] for item in embeddings], dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    metadata = [(item[1], item[2], item[3]) for item in embeddings]
    return index, metadata


In [39]:
def get_top_k_similar_chunks(query, index, metadata, k=3):
    """Return the metadata of up to ``k`` indexed chunks closest to ``query``.

    Args:
        query: Query string; it is embedded with the module-level ``model``.
        index: Index object whose ``search(vectors, k)`` returns
            ``(distances, indices)``.
        metadata: List of ``(text, page_num, pdf_path)`` tuples, positioned so
            that ``metadata[i]`` describes the vector with id ``i``.
        k: Maximum number of results to return.

    Returns:
        A list of ``(text, page_num, pdf_path)`` tuples in the order the index
        returned them. It holds fewer than ``k`` items when the index returns
        fewer than ``k`` valid ids.
    """
    query_embedding = np.array(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, k)
    results = []
    for idx in indices[0]:
        # FAISS pads the id list with -1 when it finds fewer than k neighbours.
        # Indexing metadata with -1 would silently return the last chunk as a
        # bogus hit, so skip those slots.
        if idx < 0:
            continue
        text, page_num, pdf_path = metadata[idx]
        results.append((text, page_num, pdf_path))
    return results

In [40]:
# Example usage: index every PDF in a folder, then run one query against it
directory = r'C:\Users\Yashaswini\OneDrive\Desktop\Capstone'  # Replace with the path to your PDF directory
pdf_files = get_pdf_files(directory)

# Flatten the per-file chunk lists into one list, keeping file order
all_text_chunks = [
    chunk
    for pdf_file in pdf_files
    for chunk in extract_text_from_pdf(pdf_file)
]

embeddings = generate_embeddings(all_text_chunks)
index, metadata = store_embeddings_in_faiss(embeddings)

query = "What are roles involved in updating a country code are:"  # Replace with your user query
top_chunks = get_top_k_similar_chunks(query, index, metadata)

# Show each hit together with the page and file it came from
for text, page_num, pdf_path in top_chunks:
    print(f"Text: {text}\nPage Number: {page_num}\nFile: {pdf_path}\n")


Text:  3. Once the Country Code is entered, click the Execute button shown as an icon in the top -left corne r of the screen. 4. The updated Country Code is replicated in the P08 system. 
Page Number: 34
File: C:\Users\Yashaswini\OneDrive\Desktop\Capstone\Genpact_SOP.pdf

Text:  4. The updated Country Code is replicated in SAP S/4 HANA (P40) system. The updated Country Code will be available in the system for transaction after the quarterly maintenance Finance week i.e. the 4th Week of every quarter on a Thursday for the quarter months March, June, September and December. 
Page Number: 32
File: C:\Users\Yashaswini\OneDrive\Desktop\Capstone\Genpact_SOP.pdf

Text:  2. Executive Summary 2.1 Synopsis The ABC Company has transactions with companies from all over the world. These transactions need to be tracked and the Country Code is a field used to identify transactions of ABC with an associate company from another country. Any request to update a Country Code is rai sed in the Finance Req

# Chunking by token count instead of character count (working version)

In [41]:
import re
import os
from PyPDF2 import PdfReader
from transformers import GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize the pre-trained 'gpt2' tokenizer once, at module level.
# split_text_into_chunks_by_tokens uses it to encode text into token ids and to
# decode each slice of ids back into a string.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def get_pdf_files(directory):
    """Return the paths of all PDF files located directly inside ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        A sorted list of ``os.path.join(directory, name)`` paths, one for each
        regular file whose name ends in ``.pdf`` (compared case-insensitively).
    """
    pdf_paths = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process the files in the same order.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Accept ".PDF" as well as ".pdf", and skip directories that merely
        # carry a PDF-looking name.
        if name.lower().endswith('.pdf') and os.path.isfile(path):
            pdf_paths.append(path)
    return pdf_paths

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and split it into token-based chunks.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(chunk, page_num, pdf_path)`` tuples, where ``page_num`` is
        the zero-based index of the page the chunk came from. Pages that yield
        no text, or only whitespace, contribute no chunks.
    """
    reader = PdfReader(pdf_path)
    text_chunks = []
    for page_num, page in enumerate(reader.pages):
        # Fall back to an empty string so re.sub never receives a non-string
        # when a page produces no text.
        text = page.extract_text() or ''
        # Replace multiple whitespace characters with a single space
        text = re.sub(r'\s+', ' ', text)
        if not text.strip():
            # A whitespace-only page carries no content worth chunking.
            continue
        for chunk in split_text_into_chunks_by_tokens(text):
            text_chunks.append((chunk, page_num, pdf_path))
    return text_chunks

def split_text_into_chunks_by_tokens(text, max_tokens=1000):
    """Split ``text`` into pieces of at most ``max_tokens`` tokenizer tokens.

    The text is encoded with the module-level ``tokenizer``, the token ids are
    cut into consecutive windows of ``max_tokens`` ids, and each window is
    decoded back into a string.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of decoded chunk strings in document order. The list is empty
        when the tokenizer produces no tokens for ``text``.

    Raises:
        ValueError: If ``max_tokens`` is not positive. A negative step would
            otherwise make ``range`` empty and silently drop all of the text.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    # NOTE(review): embedding models usually truncate inputs beyond their own
    # maximum sequence length -- confirm that chunks of `max_tokens` GPT-2
    # tokens fit the embedding model used downstream.
    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

# Load the pre-trained 'all-MiniLM-L6-v2' SentenceTransformer once, at module
# level. generate_embeddings and get_top_k_similar_chunks both read it through
# the global name `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(text_chunks):
    """Embed every text chunk with the module-level ``model``.

    Args:
        text_chunks: Sequence of ``(text, page_num, pdf_path)`` tuples.

    Returns:
        A list of ``(embedding, text, page_num, pdf_path)`` tuples in the same
        order as ``text_chunks``; an empty list when there is nothing to embed.
    """
    if not text_chunks:
        # Nothing to encode: skip the model call entirely.
        return []
    texts = [chunk[0] for chunk in text_chunks]
    vectors = model.encode(texts)
    return [
        (vector, chunk[0], chunk[1], chunk[2])
        for vector, chunk in zip(vectors, text_chunks)
    ]

def store_embeddings_in_faiss(embeddings):
    """Build a flat L2 FAISS index from the embeddings and collect their metadata.

    Args:
        embeddings: Non-empty sequence of ``(embedding, text, page_num, pdf_path)``
            tuples whose embeddings all have the same length.

    Returns:
        ``(index, metadata)`` where ``index`` is a ``faiss.IndexFlatL2`` to which
        all vectors were added in input order, and ``metadata`` is the list of
        ``(text, page_num, pdf_path)`` tuples in that same order.

    Raises:
        ValueError: If ``embeddings`` is empty, because the vector dimension
            cannot be determined.
    """
    if len(embeddings) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of embeddings")
    # FAISS works on float32 matrices; convert explicitly instead of relying
    # on whatever dtype the embeddings happen to have.
    vectors = np.array([item[0] for item in embeddings], dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    metadata = [(item[1], item[2], item[3]) for item in embeddings]
    return index, metadata

def get_top_k_similar_chunks(query, index, metadata, k=3):
    """Return the metadata of up to ``k`` indexed chunks closest to ``query``.

    Args:
        query: Query string; it is embedded with the module-level ``model``.
        index: Index object whose ``search(vectors, k)`` returns
            ``(distances, indices)``.
        metadata: List of ``(text, page_num, pdf_path)`` tuples, positioned so
            that ``metadata[i]`` describes the vector with id ``i``.
        k: Maximum number of results to return.

    Returns:
        A list of ``(text, page_num, pdf_path)`` tuples in the order the index
        returned them. It holds fewer than ``k`` items when the index returns
        fewer than ``k`` valid ids.
    """
    query_embedding = np.array(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, k)
    results = []
    for idx in indices[0]:
        # FAISS pads the id list with -1 when it finds fewer than k neighbours.
        # Indexing metadata with -1 would silently return the last chunk as a
        # bogus hit, so skip those slots.
        if idx < 0:
            continue
        text, page_num, pdf_path = metadata[idx]
        results.append((text, page_num, pdf_path))
    return results

# Example usage: index every PDF in a folder, then run one query against it
directory = r'C:\Users\Yashaswini\OneDrive\Desktop\Capstone'  # Replace with the path to your PDF directory
pdf_files = get_pdf_files(directory)

# Flatten the per-file chunk lists into one list, keeping file order
all_text_chunks = [
    chunk
    for pdf_file in pdf_files
    for chunk in extract_text_from_pdf(pdf_file)
]

embeddings = generate_embeddings(all_text_chunks)
index, metadata = store_embeddings_in_faiss(embeddings)

query = "Can I save outlook mail as pdf"  # Replace with your user query
top_chunks = get_top_k_similar_chunks(query, index, metadata)

# Show each hit together with the page and file it came from
for text, page_num, pdf_path in top_chunks:
    print(f"Text: {text}\nPage Number: {page_num}\nFile: {pdf_path}\n")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Text: How to access Hotmail in Outlook? To access your Hotmail account via Outlook, two basic versions are available: the web -based version of Outlook, which can be accessed at outlook.com, and the Outlook application available on your PC. First off, we are going to list out the steps that will help you to use your Windows Live Hotmail account on Outlook.com. 1. Go to Outlook sign -in page, and enter your email address. 2. Click “Next”, and key in your Hotmail password. 3. Click “Connect”. For further details on how to sign in to your Hotmail account, click here to read our article. You may also simply synchronize your emails while using another account on Outlook.com. To do so, do the following. 1. On Outlook.com, click the gear icon to access your settings 2. Select “View all Outlook settings ”. 
Page Number: 0
File: C:\Users\Yashaswini\OneDrive\Desktop\Capstone\test.pdf

Text: 3. Go to “Sync email”. 4. Click “Other email accounts” under the “Connected accounts” section. 5. Now, typ

In [42]:
# Rebuild the index from the PDF folder and run a different query against it
directory = r'C:\Users\Yashaswini\OneDrive\Desktop\Capstone'  # Replace with the path to your PDF directory
pdf_files = get_pdf_files(directory)

# Flatten the per-file chunk lists into one list, keeping file order
all_text_chunks = [
    chunk
    for pdf_file in pdf_files
    for chunk in extract_text_from_pdf(pdf_file)
]

embeddings = generate_embeddings(all_text_chunks)
index, metadata = store_embeddings_in_faiss(embeddings)

query = "What are the roles involved in updating the country code"  # Replace with your user query
top_chunks = get_top_k_similar_chunks(query, index, metadata)

# Show each hit together with the page and file it came from
for text, page_num, pdf_path in top_chunks:
    print(f"Text: {text}\nPage Number: {page_num}\nFile: {pdf_path}\n")

Text:  3. Once the Country Code is entered, click the Execute button shown as an icon in the top -left corne r of the screen. 4. The updated Country Code is replicated in the P08 system. 
Page Number: 34
File: C:\Users\Yashaswini\OneDrive\Desktop\Capstone\Genpact_SOP.pdf

Text:  4. The updated Country Code is replicated in SAP S/4 HANA (P40) system. The updated Country Code will be available in the system for transaction after the quarterly maintenance Finance week i.e. the 4th Week of every quarter on a Thursday for the quarter months March, June, September and December. 
Page Number: 32
File: C:\Users\Yashaswini\OneDrive\Desktop\Capstone\Genpact_SOP.pdf

Text:  2. Executive Summary 2.1 Synopsis The ABC Company has transactions with companies from all over the world. These transactions need to be tracked and the Country Code is a field used to identify transactions of ABC with an associate company from another country. Any request to update a Country Code is rai sed in the Finance Req