In [4]:
import inspect

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Read the dataset from disk
DATASET_PATH = "dataset.csv"
df = pd.read_csv(DATASET_PATH)

# Tokenization setup: the BERT tokenizer turns raw text into the tokens
# that BERT can understand
PRETRAINED_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)


# Custom Dataset class for handling text data
class TextDataset(Dataset):
    """Map-style dataset pairing tokenized texts with integer class labels.

    All texts are tokenized once, at construction time, and padded to the
    longest sequence in the collection (capped at ``max_length``).

    Args:
        texts: Sequence of strings (pandas Series, list, tuple, ...).
        labels: Sequence of class indices, same length as ``texts``. Values
            are coerced with ``int()``, so ``1.0`` becomes ``1``.
        text_tokenizer: Callable used to encode the texts. It is called as
            ``text_tokenizer(list_of_texts, truncation=..., padding=...,
            max_length=..., return_tensors="pt")`` and must return a mapping
            whose values can be indexed by sample position. Defaults to the
            module-level ``tokenizer``.
        max_length: Maximum sequence length passed to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or a label
            cannot be converted to ``int`` (e.g. NaN).
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=512):
        if text_tokenizer is None:
            # Fall back to the tokenizer created at module level.
            text_tokenizer = tokenizer

        self.texts = texts
        self.labels = labels

        # Materialise both inputs positionally so that plain lists work and a
        # shuffled pandas index (as produced by train_test_split) is irrelevant.
        text_list = list(texts)
        self._label_ids = [int(label) for label in labels]
        if len(text_list) != len(self._label_ids):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(text_list)} and {len(self._label_ids)})"
            )

        # Tokenize and encode the texts
        self.encodings = text_tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )

    def __len__(self):
        return len(self._label_ids)

    def __getitem__(self, idx):
        # Slice every encoded field at the requested sample position
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Class indices must be integer (long) tensors; a float label column
        # would otherwise produce float tensors here.
        item["labels"] = torch.tensor(self._label_ids[idx], dtype=torch.long)
        return item


# Split the data: 80 percent for training, 20 percent held out for validation.
# random_state pins the shuffle so the same split is produced on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

# Create PyTorch datasets
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy` and later stopped accepting the old spelling. Pick whichever
# keyword the installed version accepts so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model checkpoints
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay for regularization
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=10,  # Log every 10 steps
    **{eval_strategy_arg: "epoch"},  # Evaluate after each epoch
)


# Create a Trainer
trainer = Trainer(
    model=model,  # The BERT model to be trained
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=val_dataset,  # Evaluation dataset
)

# Train the model
trainer.train()

# Save the trained model and tokenizer
save_directory = "./saved_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\micha\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the tokenizer and the classifier from the local checkpoint directory
save_directory = "./saved_model"
tokenizer = BertTokenizer.from_pretrained(save_directory)
model = BertForSequenceClassification.from_pretrained(save_directory)


def get_sensitivity_score(text, classifier=None, text_tokenizer=None):
    """Score how likely ``text`` is to belong to the 'Sensitive' class.

    Args:
        text: The document text to classify.
        classifier: Callable taking the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute. Defaults to
            the module-level ``model``.
        text_tokenizer: Callable that encodes ``text`` into model inputs.
            Defaults to the module-level ``tokenizer``.

    Returns:
        float: Softmax probability of class index 1 (the 'Sensitive' class)
        for the first sequence in the batch, scaled to a 0-100 percentage.
    """
    if classifier is None:
        classifier = model
    if text_tokenizer is None:
        text_tokenizer = tokenizer

    # Tokenize the input text
    inputs = text_tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    )

    # Inference only: disable gradient tracking so no autograd graph is built
    # for the forward pass.
    with torch.no_grad():
        logits = classifier(**inputs).logits
        # Convert logits to probabilities using softmax
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Probability of class index 1, expressed as a percentage
    return probabilities[0][1].item() * 100


# Example usage: score a sample document (an office cleaning schedule).
# Presumably this is meant as a non-sensitive example — TODO confirm the
# expected score against real model output.
new_text = """Document Title: Office Cleaning Schedule

Content:
- Mondays: Kitchen and Common Areas
- Tuesdays: Conference Rooms and Hallways
- Wednesdays: Restrooms and Entryways
- Thursdays: Offices and Desks
- Fridays: Meeting Rooms and Lounge

Note: Please ensure that all personal items are removed from common areas before cleaning begins."""


# The printed score is whatever get_sensitivity_score returns for the sample.
score = get_sensitivity_score(new_text)
print(f"Sensitivity Score: {score}")

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\micha\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.