In [None]:
!pip install PyPDF2 transformers torch torchvision flask pdf2image accelerate -U

# Import necessary libraries
import PyPDF2
import re
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, Trainer, TrainingArguments, pipeline
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
from flask import Flask, request, jsonify
import os
from pdf2image import convert_from_path
import time
from pdf2image import convert_from_path


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode and
            parsed with ``PyPDF2.PdfReader``.

    Returns:
        The per-page results of ``page.extract_text()`` joined with a newline,
        in page order.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next. `or ''` keeps the join safe if a page
        # yields no text at all.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return '\n'.join(page_texts)

# Read the text of 'progit.pdf' (resolved relative to the working directory).
pdf_text = extract_text_from_pdf(pdf_path='progit.pdf')


In [None]:
def preprocess_text(text):
    """Normalise whitespace in extracted text while keeping line breaks.

    Runs of spaces, tabs and other non-newline whitespace become one space;
    runs of newlines (together with the spaces around them) become one newline.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Collapse horizontal whitespace only. Matching `\s+` here would also
    # swallow every newline and turn the newline rule below into a no-op.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Collapse consecutive line breaks, dropping the single spaces left around them.
    text = re.sub(r' ?(?:\n ?)+', '\n', text)
    return text

# Whitespace-normalised version of the extracted PDF text.
cleaned_text = preprocess_text(text=pdf_text)


In [None]:
def extract_images_from_pdf(pdf_path, output_folder):
    """Save the images produced by ``convert_from_path`` for a PDF as PNG files.

    Each image is written to ``<output_folder>/image_<index>.png``, where the
    index is the image's position in the sequence returned by
    ``convert_from_path(pdf_path)``. The elapsed time is printed.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.
        output_folder: Directory that receives the PNG files; it is created
            (including parents) when missing.

    Returns:
        List of the written file paths, in the same order as the images.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    # Create the directory up front and without a separate existence check:
    # this avoids a check-then-create race and fails early, before the
    # conversion work, if the location is unusable.
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path)
    image_paths = []
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f'image_{i}.png')
        image.save(image_path, 'PNG')
        image_paths.append(image_path)
    end_time = time.perf_counter()
    print(f"Time taken: {end_time - start_time} seconds")
    return image_paths

# Write the images for 'progit.pdf' into 'output_images' and keep the resulting file paths.
image_paths = extract_images_from_pdf(pdf_path='progit.pdf', output_folder='output_images')


In [None]:
def load_and_preprocess_image(image_path):
    """Open an image file, resize it to 224x224 and convert it to a tensor.

    Args:
        image_path: Path of the image file opened with ``PIL.Image.open``.

    Returns:
        The result of applying ``transforms.Resize((224, 224))`` followed by
        ``transforms.ToTensor()`` to the image. No colour-mode conversion is
        performed here.
    """
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    # Use the image as a context manager so the underlying file handle is
    # released deterministically, even when one of the transforms raises.
    with Image.open(image_path) as image:
        return preprocess(image)

# One preprocessed tensor per saved image, in the same order as image_paths.
image_tensors = list(map(load_and_preprocess_image, image_paths))


In [None]:
class TextDataset(Dataset):
    """Map-style dataset over a mapping of tokenizer outputs.

    ``encodings`` maps field names to indexable containers; it must contain an
    ``'input_ids'`` entry, whose length defines the dataset size. Item ``idx``
    is a dict holding, for every field, that field's ``idx``-th entry as a
    tensor.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct
                # UserWarning on every access; detach().clone() makes the same
                # independent copy without the warning.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Tokenize the cleaned text for training
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Only register a padding token when the loaded tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# With truncation alone, everything after the first 512 tokens is discarded and
# the dataset contains a single example. Returning the overflowing tokens
# splits the text into 512-token windows instead; `stride` is the number of
# tokens shared between consecutive windows so content at a window boundary
# is seen with context on both sides.
inputs = tokenizer(
    cleaned_text,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding="max_length",
    stride=128,
    return_overflowing_tokens=True,
)
dataset = TextDataset(inputs)


In [None]:
def collate_fn(batch):
    """Collate dataset items into a batch for a question-answering model.

    Args:
        batch: Sequence of dicts, each holding ``'input_ids'`` and
            ``'attention_mask'`` tensors of identical shape. Items may also
            carry scalar ``'start_positions'`` / ``'end_positions'`` labels.

    Returns:
        Dict with stacked ``input_ids`` and ``attention_mask`` plus
        ``start_positions`` and ``end_positions`` as ``torch.long`` tensors of
        length ``len(batch)``.

    NOTE: when the items carry no span labels, both label tensors are all
    zeros. These are placeholders, not real answer annotations.
    """
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])

    def _span_labels(key):
        # Use labels supplied by the dataset when every item provides them;
        # otherwise fall back to the zero placeholder.
        if all(key in item for item in batch):
            return torch.as_tensor([int(item[key]) for item in batch], dtype=torch.long)
        return torch.zeros(len(batch), dtype=torch.long)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'start_positions': _span_labels('start_positions'),
        'end_positions': _span_labels('end_positions'),
    }


In [None]:
# Fine-tune DistilBERT (question-answering head) on `dataset` with the Hugging Face Trainer.
model_name = "distilbert-base-uncased"
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

NUM_TRAIN_EPOCHS = 3
TRAIN_BATCH_SIZE = 1

# A fixed 500-step warm-up is longer than the whole run when the dataset is
# small, which keeps the learning rate in its ramp-up phase for the entire
# training. Derive it from the real step count instead: 10% of the run,
# capped at the previous value of 500.
# NOTE(review): the estimate assumes one device and no gradient accumulation.
steps_per_epoch = (len(dataset) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE
total_train_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
warmup_steps = min(500, total_train_steps // 10)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=1,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
)

# The same dataset is passed for training and evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=collate_fn,
)

trainer.train()
