In [None]:
import os
import re

import pdfplumber
import torch
from pdfminer.layout import LAParams
from peft import PeftModel
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel

In [None]:
# Summarization tokenizer/model pair, both loaded from the
# "facebook/bart-large-cnn" checkpoint.
summarizer_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# TODO: point this at the fine-tuned QA checkpoint directory. It is used for
# the tokenizer, the base seq2seq model and the PEFT adapter below; with the
# empty placeholder the from_pretrained calls cannot resolve a checkpoint.
finetuned_model_dir = ""

qa_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
# Load the base seq2seq weights first, then attach the PEFT adapter to that
# model object.
# NOTE(review): assumes finetuned_model_dir holds both the base model files
# and the adapter files - confirm against how the checkpoint was saved.
qa_base_model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_model_dir)
qa_model = PeftModel.from_pretrained(qa_base_model, finetuned_model_dir)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Location of the PDF file, handed to ``pdfplumber.open``.

    Returns:
        str: The text of all pages joined with a newline, so the last word of
        one page never fuses with the first word of the next. A page for
        which no text is extracted contributes an empty string.
    """
    # Layout-analysis parameters (LAParams) are an option of pdfplumber.open(),
    # not of Page.extract_text(), so only the tolerances are passed here.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [
            # extract_text() may yield None for a page without text.
            page.extract_text(x_tolerance=2, y_tolerance=3) or ""
            for page in pdf.pages
        ]
    return "\n".join(page_texts)

def clean_text(text):
    """Normalise whitespace and drop page-marker / header / footer boilerplate.

    Args:
        text: Raw text to clean.

    Returns:
        str: ``text`` with every whitespace run reduced to one space, the
        patterns ``Page <digits>``, ``Header text`` and ``Footer text``
        removed, and leading/trailing whitespace stripped.
    """
    # Collapse first so a marker broken across lines ("Page\n3") still matches.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+|Header text|Footer text', '', text)
    # Deleting a marker leaves the spaces on both sides of it adjacent;
    # collapse once more so no double spaces remain.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

def split_text_with_sentence_overlap(text, chunk_size=512):
    """Split text into word-bounded chunks that overlap by one sentence.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?``.
    Sentences are appended to the current chunk until the next one would push
    its word count above ``chunk_size``; the chunk is then emitted and the
    next chunk starts with the last sentence of the emitted one.

    Args:
        text: Text to split.
        chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        list[str]: The chunks, each a space-joined run of sentences. Empty or
        whitespace-only input yields an empty list. A chunk exceeds
        ``chunk_size`` only when a single sentence is longer than the limit.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            # Nothing to add; also prevents an empty chunk for empty input.
            continue
        sentence_length = len(sentence.split())

        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            overlap = current_chunk[-1]
            overlap_length = len(overlap.split())
            if overlap_length + sentence_length <= chunk_size:
                current_chunk = [overlap]
                current_length = overlap_length
            else:
                # Carrying the overlap would already break the limit (this is
                # always the case when the emitted chunk was one sentence), so
                # the next chunk starts without it.
                current_chunk = []
                current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def summarize_text(text):
    """Summarize ``text`` with the module-level summarizer tokenizer and model.

    The text is tokenized into PyTorch tensors with truncation at 1024 tokens,
    a summary is generated from the input ids, and the first generated
    sequence is decoded with special tokens skipped.

    Args:
        text: The passage to summarize.

    Returns:
        The decoded summary.
    """
    encoded = summarizer_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    )
    generation_options = {
        "max_length": 250,
        "min_length": 40,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = summarizer_model.generate(encoded['input_ids'], **generation_options)
    best_sequence = generated[0]
    return summarizer_tokenizer.decode(best_sequence, skip_special_tokens=True)

def generate_qa(context):
    """Generate one question/answer pair from ``context`` with the QA model.

    The module-level ``qa_tokenizer`` / ``qa_model`` produce a single output
    sequence, which is split into question and answer at the first occurrence
    of the tokenizer's separator token.

    Args:
        context: Text the question/answer pair is generated from.

    Returns:
        tuple[str, str]: ``(question, answer)``, both stripped of surrounding
        whitespace. If the generated text contains no separator, the whole
        text is returned as the question and the answer is ``""``.

    Raises:
        ValueError: If ``qa_tokenizer`` defines no ``sep_token``, because the
            output could then never be split.
    """
    separator = qa_tokenizer.sep_token
    if not separator:
        raise ValueError(
            "qa_tokenizer has no sep_token; cannot split the generated text "
            "into question and answer"
        )

    inputs = qa_tokenizer(context, return_tensors="pt")
    outputs = qa_model.generate(**inputs, max_length=100)
    # Special tokens are kept while decoding - presumably because the
    # separator is itself a special token that skip_special_tokens=True would
    # drop (TODO confirm) - and padding / end-of-sequence markers are removed
    # by hand instead.
    qa = qa_tokenizer.decode(outputs[0], skip_special_tokens=False)
    for special_token in (qa_tokenizer.pad_token, qa_tokenizer.eos_token):
        if special_token:  # a tokenizer may leave either token undefined
            qa = qa.replace(special_token, "")

    # partition() never raises: it tolerates a missing separator as well as
    # additional separators inside the answer.
    question, _, answer = qa.partition(separator)

    return question.strip(), answer.strip()

In [None]:
# PDF to process. The default is a machine-specific absolute path; set the
# QA_PDF_PATH environment variable before running this cell to use another
# file without editing the notebook.
pdf_path = os.environ.get("QA_PDF_PATH", 'D:/dell data/rutgers/data viz/assignment5/9.pdf')
# Pipeline: raw PDF text -> cleaned text -> overlapping chunks.
pdf_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(pdf_text)
sections = split_text_with_sentence_overlap(cleaned_text)

In [None]:
# Each chunk is summarized first; the summary is then fed to the QA model.
# Results are accumulated one by one so that pairs produced before a failure
# remain available in `qa_results`.
qa_results = list()
for section in sections:
    summarized_section = summarize_text(section)
    qa_pair = generate_qa(summarized_section)
    qa_results.append(dict(qa_pair=qa_pair))

In [None]:
# Print every generated pair followed by a 50-character divider line.
divider = "-" * 50
for result in qa_results:
    print(f"QA Pair: {result['qa_pair']}", divider, sep="\n")