In [1]:
import os
import PyPDF2
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm





Step 1: Extract Text from PDFs

In [3]:
def extract_text_from_pdf(pdf_path, max_chars=3000):
    """Return the leading text of a PDF, or "" if it cannot be read.

    Args:
        pdf_path: Path to the PDF file.
        max_chars: Maximum number of characters to return. Defaults to 3000,
            a rough cap that keeps the text near the classifier's input budget.

    Returns:
        Up to ``max_chars`` characters of concatenated page text. An empty
        string is returned when the file cannot be opened or parsed, or when
        no page yields any text.
    """
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            chunks = []
            collected = 0
            for page in reader.pages:
                # Guard against a falsy result from extract_text().
                page_text = page.extract_text() or ""
                chunks.append(page_text)
                collected += len(page_text)
                if collected >= max_chars:
                    # Enough text gathered; skip parsing the remaining pages.
                    break
            return "".join(chunks)[:max_chars]
    except Exception as e:
        # Best-effort: one unreadable PDF must not abort a batch run.
        print(f"Error reading {pdf_path}: {e}")
        return ""


Step 2: Create Sample Dataset (for training)

In [4]:
# Example documents (replace with real ones and correct labels).
# Label 1 is reported as "Relevant" and 0 as "Non-Relevant" at inference time.
docs = [
    ("Patent discusses OTA update mechanism in vehicles", 1),
    ("A new type of hydraulic gear", 0),
    ("Firmware download system using WiFi in vehicles", 1),
    ("Mechanical valve design", 0),
    ("Vehicle-to-cloud update system for ECUs", 1),
    ("Disc brake improvement", 0)
]

df = pd.DataFrame(docs, columns=["text", "label"])

# A fixed seed makes the split reproducible across kernel restarts, and
# stratifying on the label keeps both classes present in the tiny test set.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Keep the original per-column names available for any later cell that uses them.
train_texts, test_texts = train_df["text"], test_df["text"]
train_labels, test_labels = train_df["label"], test_df["label"]

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in the datasets.
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True), preserve_index=False)
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True), preserve_index=False)


Step 3: Tokenization

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with padding and truncation."""
    texts = batch['text']
    return tokenizer(texts, truncation=True, padding=True)


# Apply the tokenizer to both splits in batched mode.
train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)


Map: 100%|██████████| 4/4 [00:00<?, ? examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 299.19 examples/s]


 Step 4: Load Model & Fine-Tune

In [11]:
# Seed torch before building the model: the classification head is newly
# initialised rather than loaded from the checkpoint, so without a fixed seed
# every fresh run starts from different weights and gives different results.
torch.manual_seed(42)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` and Trainer's `tokenizer` to
# `processing_class` — confirm against the installed version before changing.
training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",  # evaluate on test_ds at the end of every epoch
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
 20%|██        | 2/10 [00:01<00:05,  1.43it/s]
 20%|██        | 2/10 [00:01<00:05,  1.43it/s]

{'eval_loss': 0.7609473466873169, 'eval_runtime': 0.0365, 'eval_samples_per_second': 54.725, 'eval_steps_per_second': 27.362, 'epoch': 1.0}


 40%|████      | 4/10 [00:02<00:03,  1.60it/s]
 40%|████      | 4/10 [00:02<00:03,  1.60it/s]

{'eval_loss': 0.812250018119812, 'eval_runtime': 0.0419, 'eval_samples_per_second': 47.76, 'eval_steps_per_second': 23.88, 'epoch': 2.0}


 60%|██████    | 6/10 [00:03<00:02,  1.64it/s]
 60%|██████    | 6/10 [00:03<00:02,  1.64it/s]

{'eval_loss': 0.8276886940002441, 'eval_runtime': 0.0368, 'eval_samples_per_second': 54.371, 'eval_steps_per_second': 27.185, 'epoch': 3.0}


 80%|████████  | 8/10 [00:04<00:01,  1.74it/s]
 80%|████████  | 8/10 [00:04<00:01,  1.74it/s]

{'eval_loss': 0.8414663076400757, 'eval_runtime': 0.0434, 'eval_samples_per_second': 46.101, 'eval_steps_per_second': 23.051, 'epoch': 4.0}


100%|██████████| 10/10 [00:06<00:00,  1.75it/s]

{'loss': 0.5828, 'grad_norm': 3.4380390644073486, 'learning_rate': 0.0, 'epoch': 5.0}



100%|██████████| 10/10 [00:08<00:00,  1.16it/s]

{'eval_loss': 0.8504704833030701, 'eval_runtime': 0.0472, 'eval_samples_per_second': 42.355, 'eval_steps_per_second': 21.177, 'epoch': 5.0}
{'train_runtime': 8.6135, 'train_samples_per_second': 2.322, 'train_steps_per_second': 1.161, 'train_loss': 0.5827792167663575, 'epoch': 5.0}





TrainOutput(global_step=10, training_loss=0.5827792167663575, metrics={'train_runtime': 8.6135, 'train_samples_per_second': 2.322, 'train_steps_per_second': 1.161, 'total_flos': 62094093120.0, 'train_loss': 0.5827792167663575, 'epoch': 5.0})

In [12]:
# Save the fine-tuned model into ./bert_results, the same directory that
# TrainingArguments uses as output_dir.
trainer.save_model("./bert_results")

Step 5: Classify Your PDF Documents

In [14]:
def classify_pdf_file(path):
    """Classify a single PDF as "Relevant" or "Non-Relevant".

    Args:
        path: Path to the PDF file to classify.

    Returns:
        "Relevant" when the model predicts label 1, "Non-Relevant" otherwise,
        or "Error" when no usable text could be extracted from the file.
    """
    text = extract_text_from_pdf(path)
    # Treat whitespace-only extraction the same as an unreadable file.
    if not text or not text.strip():
        return "Error"

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # The model may live on an accelerator after training; the input tensors
    # must be on the same device or the forward pass raises a device mismatch.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Disable dropout so the prediction for a given document is deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Relevant" if prediction == 1 else "Non-Relevant"


Step 6: Batch Classify Your Folder

In [None]:
PDF_FOLDER = "./IP_Documents"

# Filter first so the progress bar counts only PDFs, and sort because
# os.listdir returns entries in arbitrary order — sorting keeps the printed
# results and the CSV stable across runs and platforms.
pdf_filenames = sorted(
    name for name in os.listdir(PDF_FOLDER) if name.lower().endswith(".pdf")
)

results = []
for filename in tqdm(pdf_filenames):
    path = os.path.join(PDF_FOLDER, filename)
    label = classify_pdf_file(path)
    results.append((filename, label))

# Print or save
for filename, label in results:
    print(f"{filename}: {label}")

# Save to CSV
pd.DataFrame(results, columns=["Filename", "Classification"]).to_csv("output_classified_bert.csv", index=False)


100%|██████████| 6/6 [00:04<00:00,  1.43it/s]

US10220899_TW.pdf: Relevant
US11034404_TW.pdf: Relevant
US20220187870A1_NTW.pdf: Relevant
US8100214_TW.pdf: Relevant
US9335924_NTW.pdf: Relevant
USD748738_TW.pdf: Relevant





: 