In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import json

# Load and preprocess the data
# The file is iterated as a sequence of records; each record's "c1" value is
# used as the input text and its "c0" value as the category label.
# JSON is UTF-8 by specification, so do not rely on the platform default.
with open("categorical_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Tokenize item names
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_texts = tokenizer(
    [item["c1"] for item in data],
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Convert category labels to numerical form
labels = [item["c0"] for item in data]
# dict.fromkeys() de-duplicates while keeping first-appearance order, so the
# label -> id assignment is the same on every run for the same data file.
# Enumerating a set() instead would give string labels a different id from
# one interpreter run to the next (hash randomization), which makes a saved
# model impossible to decode reliably.
label_map = {label: i for i, label in enumerate(dict.fromkeys(labels))}
labels = [label_map[label] for label in labels]

# Define a custom dataset
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer labels.

    ``tokenized_texts`` must support lookup by the keys ``"input_ids"`` and
    ``"attention_mask"``, each returning something indexable by sample
    position. ``labels`` must be indexable by the same positions.
    """

    # Keys copied per-sample out of the tokenizer output, in output order.
    _FEATURE_KEYS = ("input_ids", "attention_mask")

    def __init__(self, tokenized_texts, labels):
        self.labels = labels
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of samples, taken from the ``input_ids`` entry."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return one sample as a dict with input_ids, attention_mask, labels."""
        sample = {key: self.tokenized_texts[key][idx] for key in self._FEATURE_KEYS}
        # The label is wrapped in a fresh tensor on every access.
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

dataset = CustomDataset(tokenized_texts, labels)

# Define the model
# Record the category names in the model config so that the checkpoint written
# by save_pretrained() can translate predicted class ids back to categories.
# Without this the mapping lives only in this session's `label_map`.
id2label = {i: str(label) for label, i in label_map.items()}
label2id = {name: i for i, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label2id,
)

# Train on a GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define training parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# NOTE: loss_fn is not used by the loop below -- the loss is read from
# `outputs.loss`, which the model returns when `labels` is passed. The name
# is kept in case other cells refer to it.
loss_fn = torch.nn.CrossEntropyLoss()
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Train the model
model.train()
for epoch in range(3):  # Adjust the number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        # Inputs must live on the same device as the model.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_labels so the module-level `labels` list is not
        # overwritten by a single batch's tensor.
        batch_labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the trained model
model.save_pretrained("model_output")
tokenizer.save_pretrained("model_output")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 