In [None]:
# Upload one or more ZIP archives of PDF papers through the Colab file picker
# and unpack each of them into data/previous_papers.
from google.colab import files
import zipfile
import os

# Create directories
os.makedirs("data/previous_papers", exist_ok=True)
os.makedirs("data/extracted_texts", exist_ok=True)

print(" Upload your ZIP containing PDF papers")
uploaded = files.upload()

# Extract ZIP
for filename in uploaded.keys():
    # Compare the extension case-insensitively so "PAPERS.ZIP" is not ignored.
    if filename.lower().endswith(".zip"):
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall("data/previous_papers")
        print(f"Extracted {filename} to data/previous_papers/")
    else:
        # Report skipped uploads instead of dropping them silently.
        print(f"Skipped {filename}: not a .zip file")


In [None]:
! pip install -q pytesseract pdf2image pillow tqdm

In [None]:
# Prompt for a file upload again; the result replaces any earlier value of `uploaded`.
# The following cell treats the first key of `uploaded` as the ZIP archive to extract.
from google.colab import files
uploaded = files.upload()

In [None]:
# Unpack the archive uploaded in the previous cell into its own folder
# (data/previous_papers/DRM_Questions), which the OCR cell reads from.
import os
import zipfile

# An empty upload (dialog cancelled / nothing selected) would otherwise surface
# as a bare IndexError when taking the first key.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select a ZIP archive.")

zip_path = next(iter(uploaded))  # should be 'DRM_Questions.zip'

# Fail early with a readable message if the first upload is not a ZIP archive.
if not zipfile.is_zipfile(zip_path):
    raise ValueError(f"{zip_path} is not a valid ZIP archive.")

extract_path = "data/previous_papers/DRM_Questions"
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Extracted {zip_path} to {extract_path}")

In [None]:
# Install the poppler-utils system package with apt (-y auto-confirms prompts).
# NOTE(review): presumably needed by pdf2image's convert_from_path used in the OCR cell — confirm.
!apt-get install -y poppler-utils

In [None]:
# STEP 1 — Install dependencies
!apt-get install -y poppler-utils
!pip install pytesseract pdf2image Pillow tqdm

# STEP 2 — Imports
import os
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from tqdm import tqdm

# STEP 3 — Paths
pdf_folder = "data/previous_papers/DRM_Questions"
output_folder = "data/extracted_texts"
os.makedirs(output_folder, exist_ok=True)

combined_text_path = os.path.join(output_folder, "all_papers.txt")

# STEP 4 — OCR Extraction
all_text = ""
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(pdf_folder, pdf_file)
    images = convert_from_path(pdf_path, dpi=300)
    file_text = ""

    for img in images:
        text = pytesseract.image_to_string(img)
        file_text += text + "\n"

    all_text += f"\n--- {pdf_file} ---\n{file_text}"
    print(f"Extracted text from {pdf_file}")

with open(combined_text_path, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"\nOCR extraction complete! Combined text saved at: {combined_text_path}")


In [None]:
# Reload the combined OCR output from disk and show its size plus a short preview.
combined_text_path = "data/extracted_texts/all_papers.txt"

with open(combined_text_path, "r", encoding="utf-8") as ocr_file:
    text = ocr_file.read()

# One print call, newline-separated: emits the same three lines of output
# (character count, preview heading, first 1000 characters).
print(
    f"Total characters in OCR text: {len(text)}",
    "\nPreview (first 1000 characters):\n",
    text[:1000],
    sep="\n",
)

In [None]:
import re

def extract_questions(text):
    """Split raw OCR text into candidate question strings.

    All whitespace runs (including line breaks) are first collapsed to a single
    space. The text is then cut at every question marker: ``Q`` followed by
    digits, digits followed by a dot and a whitespace character, or a literal
    ``?``. The markers themselves are discarded.

    Args:
        text: The OCR text to split.

    Returns:
        A list of stripped fragments, in document order, keeping only those
        longer than 15 characters.
    """
    whitespace_run = re.compile(r'\s+')
    question_marker = re.compile(r'(?:(?:Q\d+)|(?:\d+\.\s)|(?:\?))')

    flattened = whitespace_run.sub(' ', text)

    kept = []
    for fragment in question_marker.split(flattened):
        candidate = fragment.strip()
        # Very short fragments are treated as noise and dropped.
        if len(candidate) > 15:
            kept.append(candidate)
    return kept

# Run the splitter over the OCR text and show how many questions came out,
# followed by the first five (each truncated to 150 characters).
questions = extract_questions(text)
print(
    f"Total questions extracted: {len(questions)}",
    "\nSample questions:\n",
    sep="\n",
)
for q in questions[:5]:
    print(f"- {q[:150]} ...\n")

In [None]:
!pip install sentence-transformers faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Create embeddings for all questions
question_embeddings = embedder.encode(questions, convert_to_numpy=True)

# Build FAISS index
index = faiss.IndexFlatL2(question_embeddings.shape[1])
index.add(np.array(question_embeddings))
print(f"FAISS index built with {len(questions)} questions.")

In [None]:
def search_questions(query, top_k=5):
    """Print the indexed questions closest to ``query``.

    The query is embedded with the notebook-level ``embedder`` and looked up in
    the notebook-level FAISS ``index``; matching entries of ``questions`` are
    printed, nearest first.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to print. Must be at least 1.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    query_vec = embedder.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_vec, top_k)
    print(f"Top {top_k} results for: '{query}'\n")
    for idx in indices[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; indexing questions[-1] would wrongly print the last question.
        if idx < 0:
            continue
        print(f"- {questions[idx]}...\n")

# Demo: search the index for questions about futures contracts (default top_k).
search_questions(query="future's contract")