In [2]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Locations of the processed parts table and of the copy that carries integer labels
raw_csv_path = "../../shared/data/processed/pc-parts-processed.csv"
labeled_csv_path = "../../shared/data/processed/pc-parts-processed-labeled.csv"

# Read the processed parts table with pandas
df = pd.read_csv(raw_csv_path)

# Build both directions of the category <-> integer id mapping.
# Category names are sorted first, so ids follow the sorted order of the names.
id2label = dict(enumerate(sorted(df['part_type'].unique())))
label2id = {name: idx for idx, name in id2label.items()}
df['label'] = df['part_type'].map(label2id)

# Persist the table with its new 'label' column (index not written)
df.to_csv(labeled_csv_path, index=False)

# Read the labeled CSV back as a HuggingFace dataset with a single 'train' split
dataset = load_dataset('csv', data_files={'train': labeled_csv_path})

# Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the 'part_name' entries of a batch, truncating to 32 tokens.

    No padding is applied here. Padding inside a batched ``map`` would pad to
    the longest row of each *map* batch rather than each training batch, and
    the stored columns would still be ragged across map batches. Padding is
    left to the Trainer's collator, which pads per training batch when it is
    given the tokenizer.
    """
    return tokenizer(batch['part_name'], truncation=True, max_length=32)

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.rename_column("label", "labels")  # use the label column we created
dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

# Fixed seed so the same rows land in the held-out split on every run;
# without it the validation numbers are not comparable between runs.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Model: load the checkpoint named by model_name with a sequence-classification
# head; the class count and both label mappings are passed to from_pretrained.
label_config = dict(
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **label_config)

# Training
# Single source of truth for where checkpoints and the final model/tokenizer go
output_dir = "./bert-pcparts"

args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",   # evaluate on the held-out split after every epoch
    save_strategy="epoch",   # and write a checkpoint at the same cadence
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and still gives the Trainer the tokenizer for batch padding.
    processing_class=tokenizer,
)

trainer.train()

# Save model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


Generating train split: 49200 examples [00:00, 548372.65 examples/s]
Map: 100%|██████████| 49200/49200 [00:01<00:00, 27098.52 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0026,0.002323
2,0.0044,0.000853
3,0.0,0.00099
4,0.0,0.000551
5,0.0,0.000523


('./bert-pcparts\\tokenizer_config.json',
 './bert-pcparts\\special_tokens_map.json',
 './bert-pcparts\\vocab.txt',
 './bert-pcparts\\added_tokens.json',
 './bert-pcparts\\tokenizer.json')

In [13]:
# Test the model
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the trained model + tokenizer
model_path = "./bert-pcparts"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Manual input
part_name = "Clés USB publicitaires ".lower()

# Tokenize
inputs = tokenizer(part_name, return_tensors="pt", truncation=True, padding='longest', max_length=32)

# Predict (no_grad: inference only, no gradients needed)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # argmax always selects one of the trained classes, even for unrelated text
    predicted_class_id = logits.argmax(dim=-1).item()

# Map back to label using the mapping stored in the loaded model's config.
# This keeps the cell runnable on a fresh kernel (no dependency on the
# `id2label` variable from the training cell) and guarantees the mapping
# matches the checkpoint that was actually loaded.
predicted_label = model.config.id2label[predicted_class_id]

print(f"Part Name: {part_name}")
print(f"Predicted Category: {predicted_label}")


Part Name: clés usb publicitaires 
Predicted Category: CASE
