<a href="https://colab.research.google.com/github/allagonne/Agent-Based-Modeling/blob/main/doc_classif1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PyMuPDF
!pip install torch

Collecting PyMuPDF
  Using cached PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Using cached PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Using cached PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
Using cached PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvid

In [23]:
import fitz  # PyMuPDF
import re
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import os
import torch.nn.functional as F

# Step 1: Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# Step 2: Preprocess Text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = text.lower().strip()
    return text

# Step 3: Tokenize Text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

def parse_data_folder(base_dir):
    pdf_paths = []
    labels = []

    no_kid_dir = os.path.join(base_dir, 'no_kid')
    kid_dir = os.path.join(base_dir, 'kid')

    # Traverse no_kid directory
    for filename in os.listdir(no_kid_dir):
        if filename.endswith('.pdf'):
            pdf_paths.append(os.path.join(no_kid_dir, filename))
            labels.append(0)

    # Traverse kid directory
    for filename in os.listdir(kid_dir):
        if filename.endswith('.pdf'):
            pdf_paths.append(os.path.join(kid_dir, filename))
            labels.append(1)

    return pdf_paths, labels

# Step 4: Prepare Dataset
class PDFDataset(Dataset):
    def __init__(self, pdf_paths, labels):
        self.pdf_paths = pdf_paths
        self.labels = labels

    def __len__(self):
        return len(self.pdf_paths)

    def __getitem__(self, idx):
        pdf_path = self.pdf_paths[idx]
        label = self.labels[idx]
        text = extract_text_from_pdf(pdf_path)
        text = preprocess_text(text)
        tokens = tokenize_text(text)
        return {
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'labels': torch.tensor(label)
        }

# Function to predict a new document
def predict_new_document(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    text = preprocess_text(text)
    tokens = tokenize_text(text)

    # Make sure the model is in evaluation mode
    model.eval()

    with torch.no_grad():
        outputs = model(**tokens)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1)
        predicted_class = torch.argmax(logits, dim=-1)

    return predicted_class.item(), probabilities

In [30]:
# Debugging statements
print(f"Found {len(pdf_paths)} PDF files.")
if len(pdf_paths) == 0:
    raise ValueError("No PDF files found in the specified directories.")

# Example usage
base_dir = 'data'
pdf_paths, labels = parse_data_folder(base_dir)
dataset = PDFDataset(pdf_paths, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Split dataset into training and validation sets
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Step 5: Train BERT Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Debugging statements
print(f"TrainingArguments: {training_args}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset  # Use a separate validation dataset in practice
)

# Debugging statements
print(f"Trainer: {trainer}")

trainer.train()

Found 23 PDF files.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainingArguments: TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16

Step,Training Loss
10,0.7409
20,0.7529
30,0.654


TrainOutput(global_step=30, training_loss=0.7159248193105062, metrics={'train_runtime': 752.3812, 'train_samples_per_second': 0.153, 'train_steps_per_second': 0.04, 'total_flos': 30257771366400.0, 'train_loss': 0.7159248193105062, 'epoch': 5.0})

In [31]:
# Example usage
unknown_folder = 'data/unknown'
for filename in os.listdir(unknown_folder):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(unknown_folder, filename)
        predicted_class, probabilities = predict_new_document(pdf_path)
        print(f"Prediction for {filename}: {'kid' if predicted_class == 1 else 'no_kid'}")
        print(f"Certainty levels: {probabilities.numpy()}")

Prediction for attach_18745191-2407190001.pdf: kid
Certainty levels: [[0.46846095 0.53153896]]
Prediction for PRIIP_LUCAPFOURV_LU_en_202040327_retail.pdf: kid
Certainty levels: [[0.3916675  0.60833246]]
Prediction for PRIIP_LU2594916905_DE_de_20231127_both.pdf: kid
Certainty levels: [[0.4333618  0.56663823]]
Prediction for PRIIP_LU2594916905_FR_fr_20231127_both.pdf: kid
Certainty levels: [[0.47877645 0.5212236 ]]
Prediction for PRIIP_LU0968833821_LU_en_20240215_retail.pdf: kid
Certainty levels: [[0.38504905 0.6149509 ]]
Prediction for PRIIP_LU2594916905_LU_en_20231127_both.pdf: kid
Certainty levels: [[0.3972673  0.60273266]]


In [33]:
from flask import Flask, request, jsonify
app = Flask(__name__)
model = BertForSequenceClassification.from_pretrained('./results/checkpoint-50/')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
# Endpoint for inference
@app.route('/predict', methods=['POST'])
def predict():
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # Save the uploaded file
    file_path = f"./uploads/{file.filename}"
    file.save(file_path)

    # Extract and preprocess text
    text = extract_text_from_pdf(file_path)
    text = preprocess_text(text)
    tokens = tokenize_text(text)

    # Make predictions
    with torch.no_grad():
        outputs = model(**tokens)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=-1)
        predicted_class = torch.argmax(logits, dim=-1)

    # Return the prediction and certainty levels
    return jsonify({
        'prediction': 'kid' if predicted_class.item() == 1 else 'no_kid',
        'certainty_levels': probabilities.numpy().tolist()
    })

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
