# Read and Preprocess PDF File

In [9]:
from PyPDF2 import PdfReader
import re
import pickle
from transformers import RobertaTokenizer, pipeline

In [10]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one whitespace-normalised string.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The text of all pages joined with single spaces, with every run of
        whitespace (spaces, tabs, newlines) collapsed to one space and
        leading/trailing whitespace removed. An empty string is returned when
        no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # The reader is constructed from this handle, so pages are extracted
        # while the file is still open.
        # `or ''` keeps the join below from raising TypeError should a page's
        # extract_text() yield None instead of a string.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    # Collapse all whitespace runs (including page-internal newlines) to one space.
    return re.sub(r'\s+', ' ', ' '.join(page_texts)).strip()

# Example usage: read the sample PDF (the relative path is resolved against the
# notebook's working directory) and keep its extracted text in `pdf_text`,
# which the chunking cell below consumes.
pdf_path = './../../data/sample.pdf'
pdf_text = extract_text_from_pdf(pdf_path)

# Chunk, Tokenize, and Save to Pickle

In [11]:
# Initialize the tokenizer
# The checkpoint name is kept in `model_name`; the question-answering cell
# further down loads the same checkpoint for its pipeline.
model_name = "deepset/roberta-base-squad2"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def chunk_text(text, chunk_size):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Chunks are cut purely by character count, so a boundary may fall in the
    middle of a word. Every chunk has exactly ``chunk_size`` characters except
    possibly the last one, which holds the remainder.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that, concatenated in order, reproduce ``text``.
        An empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() fail with an unrelated-looking message, and a
    # negative step would silently produce no chunks at all (dropping the text).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Example usage: cut the document into 1000-character pieces and run the
# tokenizer on each piece, collecting the tokenizer outputs in order.
chunk_size = 1000
tokenized_chunks = [tokenizer(piece) for piece in chunk_text(pdf_text, chunk_size)]

# Persist the tokenizer outputs so the question-answering cell can reload them
with open("tokenized_chunks.pkl", "wb") as pickle_file:
    pickle.dump(tokenized_chunks, pickle_file)


# Load Tokenized Chunks and Answer a Question

In [13]:
import pickle
from transformers import RobertaTokenizer, pipeline

# Load tokenized chunks from pickle file
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# file that this notebook wrote itself, never one from an untrusted source.
with open("tokenized_chunks.pkl", "rb") as f:
    tokenized_chunks = pickle.load(f)

# Combine tokenized chunks into a single list of token ids
flat_tokenized_text = [token for chunk in tokenized_chunks for token in chunk["input_ids"]]

# Initialize the tokenizer and question-answering pipeline
model_name = "deepset/roberta-base-squad2"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
nlp = pipeline("question-answering", model=model_name, tokenizer=tokenizer)

# Decode the combined token ids into a single string.
# Each chunk was tokenized on its own, and the tokenizer by default wraps every
# encoded sequence in its begin/end special tokens. Without
# skip_special_tokens=True those ids would be decoded into literal marker text
# at every chunk boundary, polluting the context passed to the model.
flat_text = tokenizer.decode(flat_tokenized_text, skip_special_tokens=True)

# Example question
question = "What is the main topic of the PDF?"

# Use the question-answering pipeline to answer the question
QA_input = {"question": question, "context": flat_text}
answer = nlp(QA_input)["answer"]

print(f"Question: {question}")
print(f"Answer: {answer}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForQuestionAnswering: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForQuestionAnswering for predictions without further training.


Question: What is the main topic of the PDF?
Answer: Nick Carter,
