In [1]:
# 1. Imports
import os
import torch
import pandas as pd
import PyPDF2
from tqdm import tqdm
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# 2. OTA keyword list for rule-based shortcut
KEYWORDS = [
    "ota", "over the air", "firmware update", "software update",
    "ecu", "electronic control unit", "connected vehicle", "v2x",
    "telematics", "can bus", "wireless update", "remote update", "ota update"
]

def keyword_match(text):
    text = text.lower()
    return any(keyword in text for keyword in KEYWORDS)


In [3]:
# 3. PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
            return text[:3000]  # truncate to 3000 chars for BERT input
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""


In [4]:
# 4. Create a small manually labeled dataset (add more for better accuracy)
# Label: 1 = Relevant (OTA), 0 = Non-Relevant
data = [
    ("This invention relates to OTA software updates for ECUs in vehicles.", 1),
    ("A mechanical gear system used in industrial automation.", 0),
    ("Firmware update protocol using telematics over the air.", 1),
    ("Hydraulic brake system with improved piston design.", 0),
    ("Cloud-controlled OTA management for vehicle systems.", 1),
    ("Improved shock absorbers for two-wheelers.", 0)
]

df = pd.DataFrame(data, columns=["text", "label"])
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2)

train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})


In [5]:
# 5. Tokenization
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)


Map: 100%|██████████| 4/4 [00:00<00:00, 199.92 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 183.83 examples/s]


In [6]:
# 6. Load and Fine-tune BERT
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./bert_ota_results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model("./bert_ota_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
                                              
 20%|██        | 2/10 [00:02<00:10,  1.31s/it]

{'eval_loss': 0.6878044605255127, 'eval_runtime': 0.0591, 'eval_samples_per_second': 33.855, 'eval_steps_per_second': 16.927, 'epoch': 1.0}


                                              
 40%|████      | 4/10 [00:04<00:06,  1.04s/it]

{'eval_loss': 0.6522800326347351, 'eval_runtime': 0.0684, 'eval_samples_per_second': 29.219, 'eval_steps_per_second': 14.609, 'epoch': 2.0}


 50%|█████     | 5/10 [00:05<00:05,  1.06s/it]

{'loss': 0.6174, 'grad_norm': 14.743284225463867, 'learning_rate': 2.5e-05, 'epoch': 2.5}


                                              
 60%|██████    | 6/10 [00:06<00:04,  1.04s/it]

{'eval_loss': 0.6284276247024536, 'eval_runtime': 0.064, 'eval_samples_per_second': 31.258, 'eval_steps_per_second': 15.629, 'epoch': 3.0}


                                              
 80%|████████  | 8/10 [00:08<00:01,  1.04it/s]

{'eval_loss': 0.6073078513145447, 'eval_runtime': 0.0716, 'eval_samples_per_second': 27.925, 'eval_steps_per_second': 13.963, 'epoch': 4.0}


100%|██████████| 10/10 [00:10<00:00,  1.08it/s]

{'loss': 0.4256, 'grad_norm': 9.87718391418457, 'learning_rate': 0.0, 'epoch': 5.0}


                                               
100%|██████████| 10/10 [00:12<00:00,  1.20s/it]


{'eval_loss': 0.5983909368515015, 'eval_runtime': 0.0771, 'eval_samples_per_second': 25.928, 'eval_steps_per_second': 12.964, 'epoch': 5.0}
{'train_runtime': 12.0233, 'train_samples_per_second': 1.663, 'train_steps_per_second': 0.832, 'train_loss': 0.5215099334716797, 'epoch': 5.0}


In [7]:
# 7. Hybrid Classification Function (rule + BERT)
def classify_pdf_file(path):
    text = extract_text_from_pdf(path)
    if not text:
        return "Unreadable"

    if keyword_match(text):
        return "Relevant (Rule)"

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Relevant (BERT)" if prediction == 1 else "Non-Relevant"


In [8]:
# 8. Classify all files in the folder
PDF_FOLDER = "IP_Documents"
results = []

for filename in tqdm(os.listdir(PDF_FOLDER)):
    if filename.lower().endswith(".pdf"):
        path = os.path.join(PDF_FOLDER, filename)
        label = classify_pdf_file(path)
        results.append((filename, label))

# Save results
df_out = pd.DataFrame(results, columns=["Filename", "Classification"])
df_out.to_csv("OTA_Classification_Results.csv", index=False)
print(df_out)


100%|██████████| 7/7 [00:05<00:00,  1.23it/s]

                  Filename   Classification
0        US10220899_TW.pdf  Relevant (BERT)
1        US11034404_TW.pdf     Non-Relevant
2  US20220187870A1_NTW.pdf  Relevant (BERT)
3         US8100214_TW.pdf     Non-Relevant
4        US9335924_NTW.pdf  Relevant (BERT)
5         USD748738_TW.pdf  Relevant (BERT)



