<a href="https://colab.research.google.com/github/akshaya-ravikumar19/vectordb-search-engine/blob/main/Using_FAISS_as_a_Vector_DB_for_Q_A_from_PDFs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install PyPDF2 faiss-cpu sentence-transformers scikit-learn nltk transformers

import os
import re
import nltk
import numpy as np
import faiss
from pathlib import Path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Ensure the NLTK "punkt" and "punkt_tab" tokenizer data is available locally.
# NOTE(review): nothing else in this cell calls an NLTK tokenizer (chunk_text
# splits on whitespace), so these downloads look unnecessary - confirm before removing.
nltk.download("punkt")
nltk.download("punkt_tab")

# ==============================
# PDF Text Extraction
# ==============================
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Extraction is best-effort: any exception is printed and whatever
    text was gathered before the failure is returned ("" if none).
    """
    page_texts = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

# ==============================
# Text Chunking
# ==============================
def chunk_text(text, chunk_size=250, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks (must be
            smaller than ``chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces; ``[]``
        when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A zero step makes range() fail with an opaque message and a negative
    # step silently yields no chunks at all, so reject both explicitly.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "need chunk_size > 0 and overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word: a later window would only
        # repeat words that this chunk already contains.
        if start + chunk_size >= len(words):
            break
    return chunks

# ==============================
# Ingest PDFs with Metadata
# ==============================
def ingest(pdf_folder):
    """Collect ``(filename, chunk)`` records for every ``*.pdf`` in a folder.

    Only the folder itself is scanned (no recursion). Each PDF is announced
    on stdout, converted to text, chunked with the default chunking
    settings, and every chunk that is non-empty after stripping is recorded
    together with the PDF's file name.
    """
    collected = []
    for pdf_path in Path(pdf_folder).glob("*.pdf"):
        print(f"Processing {pdf_path.name} ...")
        raw_text = extract_text_from_pdf(pdf_path)
        for piece in chunk_text(raw_text):
            cleaned = piece.strip()
            if not cleaned:
                continue
            collected.append((pdf_path.name, cleaned))
    return collected

# ==============================
# Build FAISS Index
# ==============================
def build_faiss(records, index_file, model_name="all-MiniLM-L6-v2"):
    """Embed the record texts, index them with FAISS and save the index.

    Args:
        records: Non-empty sequence of ``(filename, chunk)`` tuples.
        index_file: Destination of the serialized index (str or Path).
        model_name: SentenceTransformer model used to embed the chunks.

    Returns:
        ``(index, texts, records, model)`` where ``texts`` holds the chunk
        strings in the same order as ``records`` and the index rows.

    Raises:
        ValueError: If ``records`` is empty.
    """
    if not records:
        # Fail before loading the model: with nothing to embed there is no
        # embedding dimension to size the index with.
        raise ValueError("records is empty; there are no text chunks to index")

    model = SentenceTransformer(model_name)
    texts = [r[1] for r in records]
    embeddings = model.encode(texts, convert_to_numpy=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # str() lets callers pass a pathlib.Path as well as a plain string.
    faiss.write_index(index, str(index_file))
    return index, texts, records, model

# ==============================
# Smart Search with Year Filtering
# ==============================
def search(query, index, texts, model, records, top_k=3):
    """Return up to ``top_k`` ``(filename, chunk)`` records nearest to ``query``.

    If the query mentions a year of the form 20xx, only records whose file
    name contains that year are kept. The index is over-fetched
    (``top_k * 5`` neighbours) so filtering can still fill ``top_k`` slots;
    fewer results come back when not enough candidates survive.

    Args:
        query: Natural-language question.
        index: FAISS index whose row ids are positions in ``records``.
        texts: Unused; kept so existing callers keep working.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        records: ``(filename, chunk)`` tuples in index order.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(filename, chunk)`` tuples in the order the index
        returned them.
    """
    year_match = re.search(r"\b(20\d{2})\b", query)
    query_year = year_match.group(1) if year_match else None

    query_vec = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(query_vec, top_k * 5)  # fetch more, filter later

    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads the id row with -1 when the index holds fewer vectors
        # than requested; records[-1] would silently return the last record.
        if idx < 0:
            continue
        filename, chunk = records[idx]
        if query_year and query_year not in filename:
            continue
        results.append((filename, chunk))
        if len(results) >= top_k:
            break
    return results

# ==============================
# Reader (QA or Summarizer)
# ==============================
# Option 1: Question Answering (extracts an exact answer span from the context)
# Both readers are created when this cell runs, so both models are loaded
# even though generate_answer uses only one of them per call.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Option 2: Summarization (produces a more natural, full-sentence answer)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_answer(question, top_chunks, method="qa"):
    """Answer ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        top_chunks: Iterable of ``(filename, chunk)`` tuples; the chunk texts
            are joined with spaces and cut to the first 1500 characters.
        method: ``"qa"`` extracts an answer span with ``qa_pipeline``; any
            other value summarizes the context with ``summarizer``.

    Returns:
        A string of the form ``"Q: <question>\\nA: <answer>"``. When there is
        no usable context the answer part states that nothing was found.
    """
    context = " ".join(chunk for _, chunk in top_chunks)
    context = context[:1500]  # character cap to avoid token overflow

    if not context.strip():
        # Retrieval can come back empty (e.g. the year filter rejected every
        # candidate); do not hand the readers an empty context.
        return f"Q: {question}\nA: No relevant context found."

    if method == "qa":
        result = qa_pipeline(question=question, context=context)
        return f"Q: {question}\nA: {result['answer']}"
    else:
        summary = summarizer(context, max_length=200, min_length=50, do_sample=False)
        return f"Q: {question}\nA: {summary[0]['summary_text']}"

# ==============================
# Run Pipeline
# ==============================
# Folder scanned (non-recursively) for *.pdf files by ingest().
pdf_folder = Path("/content/sample_data/ESG_Files")  # Colab path - update for your environment
# File the FAISS index is written to by build_faiss().
index_file = "esg_index.faiss"
# SentenceTransformer model used for both chunk and query embeddings.
model_name = "all-MiniLM-L6-v2"

# The index is rebuilt from the PDFs on every run; nothing in this cell
# loads a previously saved index back from index_file.
print(f"Building indices from: {pdf_folder.resolve()}")
records = ingest(pdf_folder)
index, texts, records, model = build_faiss(records, index_file, model_name)
print("Index built successfully!")

# ==============================
# Example Queries
# ==============================
# search() filters only on a 20xx year found in the question (matched against
# the PDF file name); company names affect retrieval only via the embedding.
questions = [
    "What sustainability initiatives did Walmart report in 2020?",
    "What were the key ESG targets for Amazon in 2023?",
    "How is Walmart managing ESG risks and compliance?",
    "What carbon reduction initiatives did Amazon implement in 2021?"
  ]

# Retrieve up to five chunks per question, then print the generated answer.
for q in questions:
    chunks = search(q, index, texts, model, records, top_k=5)
    # Any method other than "qa" selects the summarizer branch of generate_answer.
    answer = generate_answer(q, chunks, method="summarizer")  # switch to "qa" if you prefer exact spans
    print("\n" + "="*80)
    print(answer)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu
Device set to use cpu


Building indices from: /content/sample_data/ESG_Files
Processing amazon_sustainability_report_2020.pdf ...
Processing walmart_esgreport_2022.pdf ...
Processing amazon_sustainability_report_2023.pdf ...
Processing walmart_esgreport_2023.pdf ...
Processing amazon_sustainability_report_2024.pdf ...
Processing walmart_esgreport_2020.pdf ...
Processing walmart_esgreport_2021.pdf ...
Processing amazon_sustainability_report_2021.pdf ...
Processing amazon_sustainability_report_2022 (Executive Summary).pdf ...
Index built successfully!

Q: What sustainability initiatives did Walmart report in 2020?
A: More than 2,300 suppliers have signed on to Project GigatonTM. Since 2017, suppliers have reported a cumulative 230 million metric tons of avoided emissions. Walmart provided cash and in-kind donations of more than $1.4 billion to projects that create opportunity.

Q: What were the key ESG targets for Amazon in 2023?
A: Amazon’s first priority under The Climate Pledge is to eliminate emissions wit

In [9]:
!pip install PyPDF2 faiss-cpu sentence-transformers scikit-learn nltk transformers

import os
import re
import nltk
import numpy as np
import faiss
from pathlib import Path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Ensure the NLTK "punkt" and "punkt_tab" tokenizer data is available locally.
# NOTE(review): nothing else in this cell calls an NLTK tokenizer (chunk_text
# splits on whitespace), so these downloads look unnecessary - confirm before removing.
nltk.download("punkt")
nltk.download("punkt_tab")

#PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Extraction is best-effort: any exception is printed and whatever
    text was gathered before the failure is returned ("" if none).
    """
    page_texts = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

#Text Chunking
def chunk_text(text, chunk_size=250, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks (must be
            smaller than ``chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces; ``[]``
        when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A zero step makes range() fail with an opaque message and a negative
    # step silently yields no chunks at all, so reject both explicitly.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "need chunk_size > 0 and overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word: a later window would only
        # repeat words that this chunk already contains.
        if start + chunk_size >= len(words):
            break
    return chunks

#Ingest PDFs with Metadata
def ingest(pdf_folder):
    """Collect ``(filename, chunk)`` records for every ``*.pdf`` in a folder.

    Only the folder itself is scanned (no recursion). Each PDF is announced
    on stdout, converted to text, chunked with the default chunking
    settings, and every chunk that is non-empty after stripping is recorded
    together with the PDF's file name.
    """
    collected = []
    for pdf_path in Path(pdf_folder).glob("*.pdf"):
        print(f"Processing {pdf_path.name} ...")
        raw_text = extract_text_from_pdf(pdf_path)
        for piece in chunk_text(raw_text):
            cleaned = piece.strip()
            if not cleaned:
                continue
            collected.append((pdf_path.name, cleaned))
    return collected

#Build FAISS Index
def build_faiss(records, index_file, model_name="all-MiniLM-L6-v2"):
    """Embed the record texts, index them with FAISS and save the index.

    Args:
        records: Non-empty sequence of ``(filename, chunk)`` tuples.
        index_file: Destination of the serialized index (str or Path).
        model_name: SentenceTransformer model used to embed the chunks.

    Returns:
        ``(index, texts, records, model)`` where ``texts`` holds the chunk
        strings in the same order as ``records`` and the index rows.

    Raises:
        ValueError: If ``records`` is empty.
    """
    if not records:
        # Fail before loading the model: with nothing to embed there is no
        # embedding dimension to size the index with.
        raise ValueError("records is empty; there are no text chunks to index")

    model = SentenceTransformer(model_name)
    texts = [r[1] for r in records]
    embeddings = model.encode(texts, convert_to_numpy=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # str() lets callers pass a pathlib.Path as well as a plain string.
    faiss.write_index(index, str(index_file))
    return index, texts, records, model

#Smart Search with Year Filtering
def search(query, index, texts, model, records, top_k=3):
    """Return up to ``top_k`` ``(filename, chunk)`` records nearest to ``query``.

    If the query mentions a year of the form 20xx, only records whose file
    name contains that year are kept. The index is over-fetched
    (``top_k * 5`` neighbours) so filtering can still fill ``top_k`` slots;
    fewer results come back when not enough candidates survive.

    Args:
        query: Natural-language question.
        index: FAISS index whose row ids are positions in ``records``.
        texts: Unused; kept so existing callers keep working.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        records: ``(filename, chunk)`` tuples in index order.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(filename, chunk)`` tuples in the order the index
        returned them.
    """
    year_match = re.search(r"\b(20\d{2})\b", query)
    query_year = year_match.group(1) if year_match else None

    query_vec = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(query_vec, top_k * 5)  # fetch more, filter later

    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads the id row with -1 when the index holds fewer vectors
        # than requested; records[-1] would silently return the last record.
        if idx < 0:
            continue
        filename, chunk = records[idx]
        if query_year and query_year not in filename:
            continue
        results.append((filename, chunk))
        if len(results) >= top_k:
            break
    return results

#Reader (QA or Summarizer)
# Option 1: Question Answering (extracts an exact answer span from the context)
# Both readers are created when this cell runs, so both models are loaded
# even though generate_answer uses only one of them per call.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Option 2: Summarization (produces a more natural, full-sentence answer)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_answer(question, top_chunks, method="qa"):
    """Answer ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        top_chunks: Iterable of ``(filename, chunk)`` tuples; the chunk texts
            are joined with spaces and cut to the first 1500 characters.
        method: ``"qa"`` extracts an answer span with ``qa_pipeline``; any
            other value summarizes the context with ``summarizer``.

    Returns:
        A string of the form ``"Q: <question>\\nA: <answer>"``. When there is
        no usable context the answer part states that nothing was found.
    """
    context = " ".join(chunk for _, chunk in top_chunks)
    context = context[:1500]  # character cap to avoid token overflow

    if not context.strip():
        # Retrieval can come back empty (e.g. the year filter rejected every
        # candidate); do not hand the readers an empty context.
        return f"Q: {question}\nA: No relevant context found."

    if method == "qa":
        result = qa_pipeline(question=question, context=context)
        return f"Q: {question}\nA: {result['answer']}"
    else:
        summary = summarizer(context, max_length=200, min_length=50, do_sample=False)
        return f"Q: {question}\nA: {summary[0]['summary_text']}"

#Run Pipeline
# Folder scanned (non-recursively) for *.pdf files by ingest().
pdf_folder = Path("/content/sample_data/ESG_Files")  # Colab path - update for your environment
# File the FAISS index is written to by build_faiss().
index_file = "esg_index.faiss"
# SentenceTransformer model used for both chunk and query embeddings.
model_name = "all-MiniLM-L6-v2"

# The index is rebuilt from the PDFs on every run; nothing in this cell
# loads a previously saved index back from index_file.
print(f"Building indices from: {pdf_folder.resolve()}")
records = ingest(pdf_folder)
index, texts, records, model = build_faiss(records, index_file, model_name)
print("Index built successfully!")

# Queries
# search() filters only on a 20xx year found in the question (matched against
# the PDF file name); company names affect retrieval only via the embedding.
questions = [
    "What sustainability initiatives did Walmart report in 2020?",
    "What were the key ESG targets for Amazon in 2023?",
    "How is Walmart managing ESG risks and compliance?",
    "What carbon reduction initiatives did Amazon implement in 2021?"
  ]

# Retrieve up to five chunks per question and print the extractive-QA answer.
for q in questions:
    chunks = search(q, index, texts, model, records, top_k=5)
    answer = generate_answer(q, chunks, method="qa")
    print("\n" + "="*80)
    print(answer)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu
Device set to use cpu


Building indices from: /content/sample_data/ESG_Files
Processing amazon_sustainability_report_2020.pdf ...
Processing walmart_esgreport_2022.pdf ...
Processing amazon_sustainability_report_2023.pdf ...
Processing walmart_esgreport_2023.pdf ...
Processing amazon_sustainability_report_2024.pdf ...
Processing walmart_esgreport_2020.pdf ...
Processing walmart_esgreport_2021.pdf ...
Processing amazon_sustainability_report_2021.pdf ...
Processing amazon_sustainability_report_2022 (Executive Summary).pdf ...
Index built successfully!

Q: What sustainability initiatives did Walmart report in 2020?
A: climate, waste

Q: What were the key ESG targets for Amazon in 2023?
A: eliminate emissions within the value chain of our businesses

Q: How is Walmart managing ESG risks and compliance?
A: Team members serve as subject matter experts and advisors on critical topics

Q: What carbon reduction initiatives did Amazon implement in 2021?
A: 274


In [10]:
!pip install PyPDF2 faiss-cpu sentence-transformers scikit-learn nltk transformers

import os
import re
import nltk
import numpy as np
import faiss
from pathlib import Path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Ensure the NLTK "punkt" and "punkt_tab" tokenizer data is available locally.
# NOTE(review): nothing else in this cell calls an NLTK tokenizer (chunk_text
# splits on whitespace), so these downloads look unnecessary - confirm before removing.
nltk.download("punkt")
nltk.download("punkt_tab")

#PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Extraction is best-effort: any exception is printed and whatever
    text was gathered before the failure is returned ("" if none).
    """
    page_texts = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

#Text Chunking
def chunk_text(text, chunk_size=250, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks (must be
            smaller than ``chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces; ``[]``
        when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A zero step makes range() fail with an opaque message and a negative
    # step silently yields no chunks at all, so reject both explicitly.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "need chunk_size > 0 and overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word: a later window would only
        # repeat words that this chunk already contains.
        if start + chunk_size >= len(words):
            break
    return chunks

#Ingest PDFs with Metadata
def ingest(pdf_folder):
    """Collect ``(filename, chunk)`` records for every ``*.pdf`` in a folder.

    Only the folder itself is scanned (no recursion). Each PDF is announced
    on stdout, converted to text, chunked with the default chunking
    settings, and every chunk that is non-empty after stripping is recorded
    together with the PDF's file name.
    """
    collected = []
    for pdf_path in Path(pdf_folder).glob("*.pdf"):
        print(f"Processing {pdf_path.name} ...")
        raw_text = extract_text_from_pdf(pdf_path)
        for piece in chunk_text(raw_text):
            cleaned = piece.strip()
            if not cleaned:
                continue
            collected.append((pdf_path.name, cleaned))
    return collected

#Build FAISS Index
def build_faiss(records, index_file, model_name="all-MiniLM-L6-v2"):
    """Embed the record texts, index them with FAISS and save the index.

    Args:
        records: Non-empty sequence of ``(filename, chunk)`` tuples.
        index_file: Destination of the serialized index (str or Path).
        model_name: SentenceTransformer model used to embed the chunks.

    Returns:
        ``(index, texts, records, model)`` where ``texts`` holds the chunk
        strings in the same order as ``records`` and the index rows.

    Raises:
        ValueError: If ``records`` is empty.
    """
    if not records:
        # Fail before loading the model: with nothing to embed there is no
        # embedding dimension to size the index with.
        raise ValueError("records is empty; there are no text chunks to index")

    model = SentenceTransformer(model_name)
    texts = [r[1] for r in records]
    embeddings = model.encode(texts, convert_to_numpy=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # str() lets callers pass a pathlib.Path as well as a plain string.
    faiss.write_index(index, str(index_file))
    return index, texts, records, model

#Smart Search with Year Filtering
def search(query, index, texts, model, records, top_k=3):
    """Return up to ``top_k`` ``(filename, chunk)`` records nearest to ``query``.

    If the query mentions a year of the form 20xx, only records whose file
    name contains that year are kept. The index is over-fetched
    (``top_k * 5`` neighbours) so filtering can still fill ``top_k`` slots;
    fewer results come back when not enough candidates survive.

    Args:
        query: Natural-language question.
        index: FAISS index whose row ids are positions in ``records``.
        texts: Unused; kept so existing callers keep working.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        records: ``(filename, chunk)`` tuples in index order.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(filename, chunk)`` tuples in the order the index
        returned them.
    """
    year_match = re.search(r"\b(20\d{2})\b", query)
    query_year = year_match.group(1) if year_match else None

    query_vec = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(query_vec, top_k * 5)  # fetch more, filter later

    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads the id row with -1 when the index holds fewer vectors
        # than requested; records[-1] would silently return the last record.
        if idx < 0:
            continue
        filename, chunk = records[idx]
        if query_year and query_year not in filename:
            continue
        results.append((filename, chunk))
        if len(results) >= top_k:
            break
    return results

#Reader (QA or Summarizer)
# Option 1: Question Answering (extracts an exact answer span from the context)
# Both readers are created when this cell runs, so both models are loaded
# even though generate_answer uses only one of them per call.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Option 2: Summarization (produces a more natural, full-sentence answer)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_answer(question, top_chunks, method="qa"):
    """Answer ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        top_chunks: Iterable of ``(filename, chunk)`` tuples; the chunk texts
            are joined with spaces and cut to the first 1500 characters.
        method: ``"qa"`` extracts an answer span with ``qa_pipeline``; any
            other value summarizes the context with ``summarizer``.

    Returns:
        A string of the form ``"Q: <question>\\nA: <answer>"``. When there is
        no usable context the answer part states that nothing was found.
    """
    context = " ".join(chunk for _, chunk in top_chunks)
    context = context[:1500]  # character cap to avoid token overflow

    if not context.strip():
        # Retrieval can come back empty (e.g. the year filter rejected every
        # candidate); do not hand the readers an empty context.
        return f"Q: {question}\nA: No relevant context found."

    if method == "qa":
        result = qa_pipeline(question=question, context=context)
        return f"Q: {question}\nA: {result['answer']}"
    else:
        summary = summarizer(context, max_length=200, min_length=50, do_sample=False)
        return f"Q: {question}\nA: {summary[0]['summary_text']}"

#Run Pipeline
# Folder scanned (non-recursively) for *.pdf files by ingest().
pdf_folder = Path("/content/sample_data/ESG_Files")  # Colab path - update for your environment
# File the FAISS index is written to by build_faiss().
index_file = "esg_index.faiss"
# SentenceTransformer model used for both chunk and query embeddings.
model_name = "all-MiniLM-L6-v2"

# The index is rebuilt from the PDFs on every run; nothing in this cell
# loads a previously saved index back from index_file.
print(f"Building indices from: {pdf_folder.resolve()}")
records = ingest(pdf_folder)
index, texts, records, model = build_faiss(records, index_file, model_name)
print("Index built successfully!")

# Queries
# None of these questions contains a 20xx year, so search() applies no
# file-name filter and retrieval spans every ingested report.
questions = [
'What is the total Scope 1 emissions?',
'What is the total Scope 2 emissions?',
'What is the total Scope 3 emissions?',
'Are science-based targets disclosed?',
'Has the company committed to net-zero?',
'What percentage of energy is renewable?',
'Is energy efficiency improving year over year?',
'What is the total water withdrawal?',
'What is water recycled or reused?',
'Is the company exposed to water stress?',
'How much total waste is generated?',
'How much waste is recycled or diverted from landfill?',
'Are hazardous waste levels disclosed?',
'How sustainable are raw material sourcing practices?',
'Are biodiversity risks addressed?',
  ]

# Retrieve up to five chunks per question and print the extractive-QA answer.
for q in questions:
    chunks = search(q, index, texts, model, records, top_k=5)
    answer = generate_answer(q, chunks, method="qa")
    print("\n" + "="*80)
    print(answer)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu
Device set to use cpu


Building indices from: /content/sample_data/ESG_Files
Processing amazon_sustainability_report_2020.pdf ...
Processing walmart_esgreport_2022.pdf ...
Processing amazon_sustainability_report_2023.pdf ...
Processing walmart_esgreport_2023.pdf ...
Processing amazon_sustainability_report_2024.pdf ...
Processing walmart_esgreport_2020.pdf ...
Processing walmart_esgreport_2021.pdf ...
Processing amazon_sustainability_report_2021.pdf ...
Processing amazon_sustainability_report_2022 (Executive Summary).pdf ...
Index built successfully!

Q: What is the total Scope 1 emissions?
A: Direct Emissions

Q: What is the total Scope 2 emissions?
A: 15.6%

Q: What is the total Scope 3 emissions?
A: 15.6%

Q: Are science-based targets disclosed?
A: working toward setting science-based targets

Q: Has the company committed to net-zero?
A: the path to achieving net- zero carbon will be challenging

Q: What percentage of energy is renewable?
A: 7.7%

Q: Is energy efficiency improving year over year?
A: 2015



In [11]:
!pip install PyPDF2 faiss-cpu sentence-transformers scikit-learn nltk transformers

import os
import re
import nltk
import numpy as np
import faiss
from pathlib import Path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Ensure the NLTK "punkt" and "punkt_tab" tokenizer data is available locally.
# NOTE(review): nothing else in this cell calls an NLTK tokenizer (chunk_text
# splits on whitespace), so these downloads look unnecessary - confirm before removing.
nltk.download("punkt")
nltk.download("punkt_tab")

#PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Extraction is best-effort: any exception is printed and whatever
    text was gathered before the failure is returned ("" if none).
    """
    page_texts = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

#Text Chunking
def chunk_text(text, chunk_size=250, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks (must be
            smaller than ``chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces; ``[]``
        when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A zero step makes range() fail with an opaque message and a negative
    # step silently yields no chunks at all, so reject both explicitly.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "need chunk_size > 0 and overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word: a later window would only
        # repeat words that this chunk already contains.
        if start + chunk_size >= len(words):
            break
    return chunks

#Ingest PDFs with Metadata
def ingest(pdf_folder):
    """Collect ``(filename, chunk)`` records for every ``*.pdf`` in a folder.

    Only the folder itself is scanned (no recursion). Each PDF is announced
    on stdout, converted to text, chunked with the default chunking
    settings, and every chunk that is non-empty after stripping is recorded
    together with the PDF's file name.
    """
    collected = []
    for pdf_path in Path(pdf_folder).glob("*.pdf"):
        print(f"Processing {pdf_path.name} ...")
        raw_text = extract_text_from_pdf(pdf_path)
        for piece in chunk_text(raw_text):
            cleaned = piece.strip()
            if not cleaned:
                continue
            collected.append((pdf_path.name, cleaned))
    return collected

#Build FAISS Index
def build_faiss(records, index_file, model_name="all-MiniLM-L6-v2"):
    """Embed the record texts, index them with FAISS and save the index.

    Args:
        records: Non-empty sequence of ``(filename, chunk)`` tuples.
        index_file: Destination of the serialized index (str or Path).
        model_name: SentenceTransformer model used to embed the chunks.

    Returns:
        ``(index, texts, records, model)`` where ``texts`` holds the chunk
        strings in the same order as ``records`` and the index rows.

    Raises:
        ValueError: If ``records`` is empty.
    """
    if not records:
        # Fail before loading the model: with nothing to embed there is no
        # embedding dimension to size the index with.
        raise ValueError("records is empty; there are no text chunks to index")

    model = SentenceTransformer(model_name)
    texts = [r[1] for r in records]
    embeddings = model.encode(texts, convert_to_numpy=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # str() lets callers pass a pathlib.Path as well as a plain string.
    faiss.write_index(index, str(index_file))
    return index, texts, records, model

#Smart Search with Year Filtering
def search(query, index, texts, model, records, top_k=3):
    """Return up to ``top_k`` ``(filename, chunk)`` records nearest to ``query``.

    If the query mentions a year of the form 20xx, only records whose file
    name contains that year are kept. The index is over-fetched
    (``top_k * 5`` neighbours) so filtering can still fill ``top_k`` slots;
    fewer results come back when not enough candidates survive.

    Args:
        query: Natural-language question.
        index: FAISS index whose row ids are positions in ``records``.
        texts: Unused; kept so existing callers keep working.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        records: ``(filename, chunk)`` tuples in index order.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(filename, chunk)`` tuples in the order the index
        returned them.
    """
    year_match = re.search(r"\b(20\d{2})\b", query)
    query_year = year_match.group(1) if year_match else None

    query_vec = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(query_vec, top_k * 5)  # fetch more, filter later

    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads the id row with -1 when the index holds fewer vectors
        # than requested; records[-1] would silently return the last record.
        if idx < 0:
            continue
        filename, chunk = records[idx]
        if query_year and query_year not in filename:
            continue
        results.append((filename, chunk))
        if len(results) >= top_k:
            break
    return results

#Reader (QA or Summarizer)
# Option 1: Question Answering (extracts an exact answer span from the context)
# Both readers are created when this cell runs, so both models are loaded
# even though generate_answer uses only one of them per call.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Option 2: Summarization (produces a more natural, full-sentence answer)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_answer(question, top_chunks, method="qa"):
    """Answer ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        top_chunks: Iterable of ``(filename, chunk)`` tuples; the chunk texts
            are joined with spaces and cut to the first 1500 characters.
        method: ``"qa"`` extracts an answer span with ``qa_pipeline``; any
            other value summarizes the context with ``summarizer``.

    Returns:
        A string of the form ``"Q: <question>\\nA: <answer>"``. When there is
        no usable context the answer part states that nothing was found.
    """
    context = " ".join(chunk for _, chunk in top_chunks)
    context = context[:1500]  # character cap to avoid token overflow

    if not context.strip():
        # Retrieval can come back empty (e.g. the year filter rejected every
        # candidate); do not hand the readers an empty context.
        return f"Q: {question}\nA: No relevant context found."

    if method == "qa":
        result = qa_pipeline(question=question, context=context)
        return f"Q: {question}\nA: {result['answer']}"
    else:
        summary = summarizer(context, max_length=200, min_length=50, do_sample=False)
        return f"Q: {question}\nA: {summary[0]['summary_text']}"

#Run Pipeline
# Folder scanned (non-recursively) for *.pdf files by ingest().
pdf_folder = Path("/content/sample_data/ESG_Files")
# File the FAISS index is written to by build_faiss().
index_file = "esg_index.faiss"
# SentenceTransformer model used for both chunk and query embeddings.
model_name = "all-MiniLM-L6-v2"

# The index is rebuilt from the PDFs on every run; nothing in this cell
# loads a previously saved index back from index_file.
print(f"Building indices from: {pdf_folder.resolve()}")
records = ingest(pdf_folder)
index, texts, records, model = build_faiss(records, index_file, model_name)
print("Index built successfully!")

# Queries
# None of these questions contains a 20xx year, so search() applies no
# file-name filter and retrieval spans every ingested report.
questions = [
'What is the total Scope 1 emissions?',
'What is the total Scope 2 emissions?',
'What is the total Scope 3 emissions?',
'Are science-based targets disclosed?',
'Has the company committed to net-zero?',
'What percentage of energy is renewable?',
'Is energy efficiency improving year over year?',
'What is the total water withdrawal?',
'What is water recycled or reused?',
'Is the company exposed to water stress?',
'How much total waste is generated?',
'How much waste is recycled or diverted from landfill?',
'Are hazardous waste levels disclosed?',
'How sustainable are raw material sourcing practices?',
'Are biodiversity risks addressed?',
  ]

# Retrieve up to five chunks per question and print a summary of them.
for q in questions:
    chunks = search(q, index, texts, model, records, top_k=5)
    # Any method other than "qa" selects the summarizer branch of generate_answer.
    answer = generate_answer(q, chunks, method="summarizer")
    print("\n" + "="*80)
    print(answer)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu
Device set to use cpu


Building indices from: /content/sample_data/ESG_Files
Processing amazon_sustainability_report_2020.pdf ...
Processing walmart_esgreport_2022.pdf ...
Processing amazon_sustainability_report_2023.pdf ...
Processing walmart_esgreport_2023.pdf ...
Processing amazon_sustainability_report_2024.pdf ...
Processing walmart_esgreport_2020.pdf ...
Processing walmart_esgreport_2021.pdf ...
Processing amazon_sustainability_report_2021.pdf ...
Processing amazon_sustainability_report_2022 (Executive Summary).pdf ...
Index built successfully!

Q: What is the total Scope 1 emissions?
A: The GHG Protocol outlines three emissions sources (referred to as “scopes”) that provide the framework for operational boundaries. The three scopes are: Scope 1, “Direct Emissions,” represent emissions from the combustible fuels and other sources that occur directly on sites. Scope 2, ‘Indirect Emissions’ represent emissions that occur off-site to produce electricity or steam.

Q: What is the total Scope 2 emissions?
A:

In [13]:
!pip install PyPDF2 faiss-cpu sentence-transformers scikit-learn nltk transformers

import os
import re
import nltk
import numpy as np
import faiss
from pathlib import Path
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Ensure the NLTK "punkt" and "punkt_tab" tokenizer data is available locally.
# NOTE(review): nothing else in this cell calls an NLTK tokenizer (chunk_text
# splits on whitespace), so these downloads look unnecessary - confirm before removing.
nltk.download("punkt")
nltk.download("punkt_tab")

#PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Extraction is best-effort: any exception is printed and whatever
    text was gathered before the failure is returned ("" if none).
    """
    page_texts = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

#Text Chunking
def chunk_text(text, chunk_size=250, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks (must be
            smaller than ``chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces; ``[]``
        when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A zero step makes range() fail with an opaque message and a negative
    # step silently yields no chunks at all, so reject both explicitly.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "need chunk_size > 0 and overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word: a later window would only
        # repeat words that this chunk already contains.
        if start + chunk_size >= len(words):
            break
    return chunks

#Ingest PDFs with Metadata
def ingest(pdf_folder):
    """Collect ``(filename, chunk)`` records for every ``*.pdf`` in a folder.

    Only the folder itself is scanned (no recursion). Each PDF is announced
    on stdout, converted to text, chunked with the default chunking
    settings, and every chunk that is non-empty after stripping is recorded
    together with the PDF's file name.
    """
    collected = []
    for pdf_path in Path(pdf_folder).glob("*.pdf"):
        print(f"Processing {pdf_path.name} ...")
        raw_text = extract_text_from_pdf(pdf_path)
        for piece in chunk_text(raw_text):
            cleaned = piece.strip()
            if not cleaned:
                continue
            collected.append((pdf_path.name, cleaned))
    return collected

#Build FAISS Index
def build_faiss(records, index_file, model_name="all-MiniLM-L6-v2"):
    """Embed the record texts, index them with FAISS and save the index.

    Args:
        records: Non-empty sequence of ``(filename, chunk)`` tuples.
        index_file: Destination of the serialized index (str or Path).
        model_name: SentenceTransformer model used to embed the chunks.

    Returns:
        ``(index, texts, records, model)`` where ``texts`` holds the chunk
        strings in the same order as ``records`` and the index rows.

    Raises:
        ValueError: If ``records`` is empty.
    """
    if not records:
        # Fail before loading the model: with nothing to embed there is no
        # embedding dimension to size the index with.
        raise ValueError("records is empty; there are no text chunks to index")

    model = SentenceTransformer(model_name)
    texts = [r[1] for r in records]
    embeddings = model.encode(texts, convert_to_numpy=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # str() lets callers pass a pathlib.Path as well as a plain string.
    faiss.write_index(index, str(index_file))
    return index, texts, records, model

#Smart Search with Year Filtering
def search(query, index, texts, model, records, top_k=3):
    """Return up to ``top_k`` ``(filename, chunk)`` records nearest to ``query``.

    If the query mentions a year of the form 20xx, only records whose file
    name contains that year are kept. The index is over-fetched
    (``top_k * 5`` neighbours) so filtering can still fill ``top_k`` slots;
    fewer results come back when not enough candidates survive.

    Args:
        query: Natural-language question.
        index: FAISS index whose row ids are positions in ``records``.
        texts: Unused; kept so existing callers keep working.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        records: ``(filename, chunk)`` tuples in index order.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(filename, chunk)`` tuples in the order the index
        returned them.
    """
    year_match = re.search(r"\b(20\d{2})\b", query)
    query_year = year_match.group(1) if year_match else None

    query_vec = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(query_vec, top_k * 5)  # fetch more, filter later

    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads the id row with -1 when the index holds fewer vectors
        # than requested; records[-1] would silently return the last record.
        if idx < 0:
            continue
        filename, chunk = records[idx]
        if query_year and query_year not in filename:
            continue
        results.append((filename, chunk))
        if len(results) >= top_k:
            break
    return results

#Reader (QA or Summarizer)
# Option 1: Question Answering (extracts an exact answer span from the context)
# Both readers are created when this cell runs, so both models are loaded
# even though generate_answer uses only one of them per call.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Option 2: Summarization (produces a more natural, full-sentence answer)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_answer(question, top_chunks, method="qa"):
    """Answer ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        top_chunks: Iterable of ``(filename, chunk)`` tuples; the chunk texts
            are joined with spaces and cut to the first 1500 characters.
        method: ``"qa"`` extracts an answer span with ``qa_pipeline``; any
            other value summarizes the context with ``summarizer``.

    Returns:
        A string of the form ``"Q: <question>\\nA: <answer>"``. When there is
        no usable context the answer part states that nothing was found.
    """
    context = " ".join(chunk for _, chunk in top_chunks)
    context = context[:1500]  # character cap to avoid token overflow

    if not context.strip():
        # Retrieval can come back empty (e.g. the year filter rejected every
        # candidate); do not hand the readers an empty context.
        return f"Q: {question}\nA: No relevant context found."

    if method == "qa":
        result = qa_pipeline(question=question, context=context)
        return f"Q: {question}\nA: {result['answer']}"
    else:
        summary = summarizer(context, max_length=200, min_length=50, do_sample=False)
        return f"Q: {question}\nA: {summary[0]['summary_text']}"

#Run Pipeline
# Folder scanned (non-recursively) for *.pdf files by ingest().
pdf_folder = Path("/content/sample_data/ESG_Files")
# File the FAISS index is written to by build_faiss().
index_file = "esg_index.faiss"
# SentenceTransformer model used for both chunk and query embeddings.
model_name = "all-MiniLM-L6-v2"

# The index is rebuilt from the PDFs on every run; nothing in this cell
# loads a previously saved index back from index_file.
print(f"Building indices from: {pdf_folder.resolve()}")
records = ingest(pdf_folder)
index, texts, records, model = build_faiss(records, index_file, model_name)
print("Index built successfully!")

# Queries
# Only the last question contains a 20xx year, so it is the only one for
# which search() restricts results to files with that year in their name;
# phrases such as "past 3 years" or "latest" trigger no filtering.
questions = [
      "How have Amazon’s sustainability initiatives improved over the past 3 years?",
      "What are the key ESG highlights from the Walmart's latest esg report?",
      "Which ESG goals were achieved and which ones are pending overall by Walmart over the past 4 years?",
      "How does Amazon compare to industry standards in ESG performance?",
      "What challenges did Walmart report in implementing ESG strategies in 2023?"
  ]

# Retrieve up to five chunks per question and print a summary of them.
for q in questions:
    chunks = search(q, index, texts, model, records, top_k=5)
    # Any method other than "qa" selects the summarizer branch of generate_answer.
    answer = generate_answer(q, chunks, method="summarizer")
    print("\n" + "="*80)
    print(answer)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu
Device set to use cpu


Building indices from: /content/sample_data/ESG_Files
Processing amazon_sustainability_report_2020.pdf ...
Processing walmart_esgreport_2022.pdf ...
Processing amazon_sustainability_report_2023.pdf ...
Processing walmart_esgreport_2023.pdf ...
Processing amazon_sustainability_report_2024.pdf ...
Processing walmart_esgreport_2020.pdf ...
Processing walmart_esgreport_2021.pdf ...
Processing amazon_sustainability_report_2021.pdf ...
Processing amazon_sustainability_report_2022 (Executive Summary).pdf ...
Index built successfully!

Q: How have Amazon’s sustainability initiatives improved over the past 3 years?
A: Amazon has a presence in communities throughout the world. offer competitive pay and benefits, upskilling and educational development programs, and a workplace that promotes inclusion and diversity. We seek to be a good neighbor wherever we operate and to support local people and charitable organizations that meet on-the-ground needs.

Q: What are the key ESG highlights from the W