Step 2: Import Dependencies

In [27]:
import os
import PyPDF2
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from tqdm import tqdm


Step 3: Extract Text from PDFs

In [28]:
def extract_text_from_pdf(pdf_path, max_chars=3000):
    """Extract the leading text of a PDF file.

    Pages are read in order and their text is concatenated. Reading stops as
    soon as ``max_chars`` characters have been collected, so long documents
    are not parsed in full only to be truncated afterwards.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return. Defaults to 3000,
            a rough character budget for the downstream 512-token model
            input. Pass ``None`` to return the text of the whole document.

    Returns:
        The extracted text (at most ``max_chars`` characters), or ``""`` when
        the file cannot be read or yields no extractable text. Read errors
        are printed rather than raised so a batch run can continue.
    """
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            chunks = []
            collected = 0
            for page in reader.pages:
                # extract_text() may return None for pages without a text layer.
                page_text = page.extract_text() or ""
                chunks.append(page_text)
                collected += len(page_text)
                # Stop early: later pages would be discarded by the slice below.
                if max_chars is not None and collected >= max_chars:
                    break
            text = "".join(chunks)
            return text if max_chars is None else text[:max_chars]
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""


Step 4: Create Sample Dataset (for training)

In [29]:
# Example documents (replace with real ones and correct labels).
# Each entry is (text, label); label 1 is treated as "Relevant" and 0 as
# "Non-Relevant" by the classification cells further down.
docs = [
    ("Patent discusses OTA update mechanism in vehicles", 1),
    ("A new type of hydraulic gear", 0),
    ("Firmware download system using WiFi in vehicles", 1),
    ("Mechanical valve design", 0),
    ("Vehicle-to-cloud update system for ECUs", 1),
    ("Disc brake improvement", 0)
]

df = pd.DataFrame(docs, columns=["text", "label"])

# A fixed random_state makes the split reproducible across kernel restarts;
# stratify keeps both classes present in the tiny train and test splits.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# preserve_index=False stops the shuffled pandas index from being carried
# into the datasets as an extra "__index_level_0__" column.
train_ds = Dataset.from_pandas(
    pd.DataFrame({'text': train_texts, 'label': train_labels}),
    preserve_index=False,
)
test_ds = Dataset.from_pandas(
    pd.DataFrame({'text': test_texts, 'label': test_labels}),
    preserve_index=False,
)


Step 5: Tokenization

In [30]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the ``text`` column of a batch with padding and truncation."""
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# Tokenize both splits in batched mode (train first, then test).
train_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, test_ds)
)


Map: 100%|██████████| 4/4 [00:00<?, ? examples/s]
Map: 100%|██████████| 2/2 [00:00<?, ? examples/s]


Step 6: Load Model & Fine-Tune

In [31]:
# Seed torch BEFORE loading the model: the classification head is not part of
# the base checkpoint and is randomly initialised inside from_pretrained, so
# without a seed every run starts from different weights.
SEED = 42
torch.manual_seed(SEED)

# Binary classifier (num_labels=2) on top of the pretrained DistilBERT encoder.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # NOTE(review): newer transformers releases reportedly rename this
    # argument to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    seed=SEED,  # explicit, so data shuffling is tied to the same seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # NOTE(review): the captured output shows a warning pointing at this call;
    # presumably `tokenizer=` is deprecated in the installed version — confirm.
    tokenizer=tokenizer
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
                                              
 20%|██        | 2/10 [00:01<00:06,  1.29it/s]

{'eval_loss': 0.6954444646835327, 'eval_runtime': 0.0519, 'eval_samples_per_second': 38.509, 'eval_steps_per_second': 19.255, 'epoch': 1.0}


                                              
 40%|████      | 4/10 [00:03<00:04,  1.31it/s]

{'eval_loss': 0.6784228682518005, 'eval_runtime': 0.0413, 'eval_samples_per_second': 48.452, 'eval_steps_per_second': 24.226, 'epoch': 2.0}


                                              
 60%|██████    | 6/10 [00:04<00:02,  1.37it/s]

{'eval_loss': 0.645668625831604, 'eval_runtime': 0.0404, 'eval_samples_per_second': 49.512, 'eval_steps_per_second': 24.756, 'epoch': 3.0}


                                              
 80%|████████  | 8/10 [00:05<00:01,  1.45it/s]

{'eval_loss': 0.6249417066574097, 'eval_runtime': 0.0332, 'eval_samples_per_second': 60.192, 'eval_steps_per_second': 30.096, 'epoch': 4.0}


100%|██████████| 10/10 [00:07<00:00,  1.58it/s]

{'loss': 0.5791, 'grad_norm': 7.236082553863525, 'learning_rate': 0.0, 'epoch': 5.0}


                                               
100%|██████████| 10/10 [00:08<00:00,  1.24it/s]

{'eval_loss': 0.6199725270271301, 'eval_runtime': 0.0367, 'eval_samples_per_second': 54.467, 'eval_steps_per_second': 27.234, 'epoch': 5.0}
{'train_runtime': 8.0613, 'train_samples_per_second': 2.481, 'train_steps_per_second': 1.241, 'train_loss': 0.5791090488433838, 'epoch': 5.0}





TrainOutput(global_step=10, training_loss=0.5791090488433838, metrics={'train_runtime': 8.0613, 'train_samples_per_second': 2.481, 'train_steps_per_second': 1.241, 'total_flos': 62094093120.0, 'train_loss': 0.5791090488433838, 'epoch': 5.0})

In [32]:
# Save the fine-tuned model to ./bert_results so it can be reloaded later
# without retraining.
# NOTE(review): the Trainer was constructed with the tokenizer, so it
# presumably writes the tokenizer files here as well — confirm.
trainer.save_model("./bert_results")

Step 7: Classify Your PDF Documents

In [33]:
def classify_pdf_file(path):
    """Classify a single PDF with the fine-tuned model.

    Args:
        path: Path of the PDF file to classify.

    Returns:
        ``"Relevant"`` when the model predicts class 1, ``"Non-Relevant"``
        otherwise, or ``"Error"`` when no text could be extracted from the
        file.
    """
    text = extract_text_from_pdf(path)
    if not text:
        return "Error"
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # The Trainer may have moved the model to a GPU; the inputs must live on
    # the same device or the forward pass fails with a device mismatch.
    inputs = inputs.to(model.device)
    # Make inference mode explicit (disables dropout) instead of relying on
    # whatever mode the training loop left the model in.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Relevant" if prediction == 1 else "Non-Relevant"


Step 8: Batch Classify Your Folder

In [34]:
PDF_FOLDER = "IP_Documents"

# os.listdir returns entries in arbitrary order; sorting makes the printed
# output and the CSV row order reproducible. Filtering before tqdm makes the
# progress bar count only the PDFs that are actually classified.
pdf_filenames = sorted(
    name for name in os.listdir(PDF_FOLDER) if name.lower().endswith(".pdf")
)

results = []
for filename in tqdm(pdf_filenames):
    path = os.path.join(PDF_FOLDER, filename)
    label = classify_pdf_file(path)
    results.append((filename, label))

# Print one "<filename>: <label>" line per document
for filename, label in results:
    print(f"{filename}: {label}")

# Save to CSV
pd.DataFrame(results, columns=["Filename", "Classification"]).to_csv("output_classified_bert.csv", index=False)


100%|██████████| 6/6 [00:09<00:00,  1.51s/it]

US10220899_TW.pdf: Relevant
US11034404_TW.pdf: Relevant
US20220187870A1_NTW.pdf: Relevant
US8100214_TW.pdf: Relevant
US9335924_NTW.pdf: Relevant
USD748738_TW.pdf: Relevant





In [35]:
############################################################################

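✅ Hybrid Classifier Architecture:
🔍 Keyword Rules + 🧠 BERT Classifier
Documents that match a keyword rule are marked relevant immediately; all other documents are passed to the fine-tuned BERT classifier.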


Keywords to Detect OTA Relevance (examples)

In [36]:
# Lower-case phrases used by the keyword rule to flag a document as
# OTA-relevant.
KEYWORDS = [
    "over the air",
    "ota update",
    "firmware update",
    "ecu",
    "telematics",
    "software update",
    "remote update",
    "cloud update",
    "v2x",
    "vehicle communication",
    "can bus",
    "ota system",
    "connected vehicle",
    "wireless update",
]


🔧 Step 1: Utility Functions

In [37]:
import re

# Keyword rule matcher (whole-word / whole-phrase, case-insensitive)
def keyword_rule_match(text, keywords=None):
    """Return True when ``text`` contains any keyword as a whole word/phrase.

    Matching is case-insensitive. Runs of whitespace and hyphens/dashes are
    treated as a single space on both sides, so "over-the-air" or a phrase
    split across a line break still matches "over the air". Keywords must sit
    on word boundaries, so "ecu" no longer fires on words such as "security"
    or "executed"; a trailing plural "s"/"es" is accepted ("ECUs").

    Args:
        text: Document text to search. Empty or ``None`` yields ``False``.
        keywords: Iterable of keyword phrases. Defaults to the module-level
            ``KEYWORDS`` list.

    Returns:
        ``True`` if at least one keyword is found, otherwise ``False``.
    """
    if keywords is None:
        keywords = KEYWORDS
    if not text:
        return False

    def _normalize(value):
        # Lower-case, and collapse whitespace / ASCII hyphen / Unicode dashes
        # (U+2010..U+2015) into single spaces.
        return re.sub(r"[\s\-\u2010-\u2015]+", " ", value.lower()).strip()

    haystack = _normalize(text)
    for kw in keywords:
        needle = _normalize(kw)
        if not needle:
            continue
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character would still be anchored correctly.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?:e?s)?(?!\w)"
        if re.search(pattern, haystack):
            return True
    return False


🔧 Step 2: Load BERT Model & Tokenizer

In [38]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Load the fine-tuned checkpoint written by the training section.
model_path = "./bert_results"  # Replace if different
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()  # inference only: make sure dropout is disabled

# Prefer the tokenizer stored alongside the checkpoint so model and tokenizer
# cannot drift apart (and no hub download is needed). Fall back to the base
# tokenizer only when the checkpoint directory has no tokenizer files.
try:
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
except OSError:
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def classify_with_bert(text):
    """Predict the class id of ``text`` with the loaded checkpoint.

    Args:
        text: Document text; anything beyond 512 tokens is truncated.

    Returns:
        The predicted class index as an int (1: Relevant, 0: Non-Relevant).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the inputs on the same device as the model in case it was moved.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction  # 1: Relevant, 0: Non-Relevant


🔧 Step 3: Hybrid Classification Function

In [39]:
def hybrid_classify_pdf(path):
    """Classify one PDF using the keyword rule first, then the BERT model.

    Args:
        path: Path of the PDF file to classify.

    Returns:
        ``"Error"`` when no text could be extracted, ``"Relevant (Rule)"``
        when a keyword matches, otherwise ``"Relevant (BERT)"`` or
        ``"Non-Relevant"`` depending on the model prediction.
    """
    document_text = extract_text_from_pdf(path)

    if document_text:
        # The cheap keyword rule short-circuits the model call.
        if keyword_rule_match(document_text):
            return "Relevant (Rule)"

        # No keyword hit: defer to the BERT prediction.
        if classify_with_bert(document_text) == 1:
            return "Relevant (BERT)"
        return "Non-Relevant"

    return "Error"


🔧 Step 4: Batch Run Across Folder

In [None]:
from tqdm import tqdm
import os
import pandas as pd

PDF_FOLDER = "IP_Documents"

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the output reproducible. Filtering before tqdm makes the progress bar
# count only the PDFs that are actually classified.
pdf_filenames = sorted(
    name for name in os.listdir(PDF_FOLDER) if name.lower().endswith('.pdf')
)

results = []
for filename in tqdm(pdf_filenames):
    path = os.path.join(PDF_FOLDER, filename)
    label = hybrid_classify_pdf(path)
    results.append((filename, label))

# Save results
df = pd.DataFrame(results, columns=["Filename", "Classification"])
df.to_csv("hybrid_classification_output.csv", index=False)
print(df)


100%|██████████| 6/6 [00:09<00:00,  1.51s/it]

                  Filename   Classification
0        US10220899_TW.pdf  Relevant (BERT)
1        US11034404_TW.pdf  Relevant (BERT)
2  US20220187870A1_NTW.pdf  Relevant (BERT)
3         US8100214_TW.pdf  Relevant (BERT)
4        US9335924_NTW.pdf  Relevant (BERT)
5         USD748738_TW.pdf  Relevant (BERT)





: 