In [66]:
from datasets import Dataset
import pandas as pd

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

import evaluate
import numpy as np
import torch

In [21]:
# Read the topic-annotated documents (path is relative to the working directory)
# and show the frame's dimensions as the cell output.
docs_path = "data/df_docs_with_topic.xlsx"
df = pd.read_excel(docs_path)
df.shape

(4741, 7)

In [7]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,id,text,generated_text,document,topics,probability,topic_number
0,1,"product_name_es:Salchichón, labels:Sin gluten,...",El Salchichón es un producto de origen español...,El Salchichón es un producto de origen español...,"sausage, meat, spices, charcuterie",0.65609,16
1,2,"countries:France, Suisse, brands:Nestlé, brand...",Rillettes de cabillaud is a delicious seafood ...,Rillettes de cabillaud is a delicious seafood ...,"rillettes, cabillaud, cod, fish",0.90186,1
2,3,"product_name:Hierba a romatica de, chamomile, ...",Hierba a romatica de chamomile is a delightful...,Hierba a romatica de chamomile is a delightful...,"tea, green, beverage, leaves",1.0,6
3,4,"product_name:Rillettes de cabillaud, _keywords...","Rillettes de cabillaud, also known as cod rill...","Rillettes de cabillaud, also known as cod rill...","rillettes, cabillaud, cod, fish",0.90186,1
4,5,"product_name:Thin & crispy pizza, pnns_groups_...",The Thin & Crispy Pizza is a delicious meal op...,The Thin & Crispy Pizza is a delicious meal op...,"pizzas, crust, crispy, dough",0.521115,32


In [45]:
# Keep only the model input (`text`) and the target (`topics`). The copy keeps
# later column assignments on `df_training` from touching `df`.
df_training = df[['text', 'topics']].copy()

# One class per distinct topic string, numbered in order of first appearance.
labels = df_training['topics'].unique().tolist()
id2label = dict(enumerate(labels))
label2id = {topic: idx for idx, topic in id2label.items()}

In [46]:
# Encode each topic string as its integer class id; an unknown topic raises KeyError.
topic_ids = df_training['topics'].apply(label2id.__getitem__)
df_training['label'] = topic_ids

In [49]:
# Spot-check five random rows to confirm the topic -> label encoding.
df_training.sample(n=5)

Unnamed: 0,text,topics,label
902,product_name:Vinaigre balsamique de modéne vie...,"wine, vinegar, white, red",29
399,"product_name:Nougat, _keywords:nougat, categor...","nuts, nutty, snack, pan",47
4634,"product_name:Cocktail vegetable Samosas, _keyw...","salad, dressing, fresh, dish",8
2695,"ingredients_debug:Riz thaï long grain, photogr...","aliments, question, plant, bread",4
1265,"product_name:Sirop Zero, _keywords:aproz,aroma...","chocolate, treat, product, snack",12


In [54]:
from datasets import Dataset
# Build the Hugging Face dataset from the two columns used for training:
# the raw text and its integer class label.
ds = Dataset.from_dict(
    {column: df_training[column].to_list() for column in ("text", "label")}
)

In [55]:
# Inspect a single example (index 3) to verify the text/label pairing.
ds[3]

{'text': 'product_name:Rillettes de cabillaud, _keywords:mer,tartiner,poisson,sale,rillette,produit,preparation,de,cabillaud,la, categories:Produits de la mer, Poissons et dérivés, Poissons, Produits à tartiner, Produits à tartiner salés, Rillettes, Préparations de poisson, Rillettes de poissons, Rillettes de cabillaud, categories_old:Produits de la mer, Poissons, Produits à tartiner, Produits à tartiner salés, Rillettes, Préparations de poisson, Rillettes de poissons, Rillettes de cabillaud, ecoscore_data.agribalyse.name_fr:Rillettes de poisson, ecoscore_data.agribalyse.name_en:Rillette, fish',
 'label': 1}

In [56]:
# Hold out 30% of the examples for evaluation. The fixed seed makes the split,
# and therefore the reported metrics, reproducible across kernel restarts.
# NOTE: this rebinds `ds` from a Dataset to a DatasetDict, so the cell must be
# run exactly once after the dataset is created.
ds = ds.train_test_split(test_size=0.3, seed=42)

In [57]:
# Display the resulting train/test splits and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3318
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1423
    })
})

In [58]:
# The tokenizer and the classifier are loaded from the same checkpoint.
checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One output class per topic; the id <-> topic mappings are passed to the model
# config so predictions can be translated back to topic strings.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    ``examples`` is a mapping holding the texts under the ``"text"`` key; the
    notebook-level ``tokenizer`` is applied to them and its result is returned
    unchanged. No padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [60]:
# Tokenize both splits; batched mode passes many examples per call to the function.
tokenized_ds = ds.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/3318 [00:00<?, ? examples/s]

Map:   0%|          | 0/1423 [00:00<?, ? examples/s]

In [61]:
# Collator that pads tokenized examples when batches are assembled
# (the tokenization step only truncates; it does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [62]:
# Accuracy metric object; `compute_metrics` calls its `compute` method.
accuracy = evaluate.load("accuracy")

In [63]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(scores, references)``. The predicted class
    of each row is the index of its highest score along axis 1, and those
    predictions are compared against ``references``.

    Returns whatever ``accuracy.compute`` returns.
    """
    logits, references = eval_pred
    return accuracy.compute(
        predictions=np.argmax(logits, axis=1),
        references=references,
    )

In [64]:
training_args = TrainingArguments(
    # Where checkpoints are written, and whether the result is pushed to the Hub.
    # NOTE(review): push_to_hub=True presumably needs an authenticated Hub
    # session set up outside this notebook — confirm before a clean re-run.
    output_dir="my_awesome_model",
    push_to_hub=True,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, data and metric together.
# NOTE(review): the "test" split doubles as the evaluation set used during
# training, so the reported accuracy is not measured on fully held-out data.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
)

# Fine-tune; the returned training summary is shown as the cell output.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,4.704757,0.127196
2,No log,4.552974,0.127196
3,4.702400,4.345029,0.134223
4,4.702400,4.214291,0.187632
5,4.184500,4.170931,0.2052


TrainOutput(global_step=1040, training_loss=4.427901561443623, metrics={'train_runtime': 451.2762, 'train_samples_per_second': 36.762, 'train_steps_per_second': 2.305, 'total_flos': 946921133613312.0, 'train_loss': 4.427901561443623, 'epoch': 5.0})

In [69]:
# Sample product record ("field:value" pairs, here a "Boudin Noir aux oignons"
# product) used below to spot-check the fine-tuned classifier.
text = """labels:Viande Française,Viande Porcine Française,Fabriqué en France,Transformé en France, product_name:Boudin Noir aux oignons, pnns_groups_2:Offals, _keywords:abat,aux,boudin,charcuterie,couenne,de,en,et,fabrique,frai,francaise,france,gra,noir,oeuf,oignon,poisson,porc,porcine,produit,sang,transforme,tripier,viande, labels_old:Viande Française, Viande Porcine Française, Fabriqué en France, Transformé en France
, traces_from_ingredients:gluten, lait, oeufs, moutarde, fruits à coque, mollusques, images.6.uploader:tenka, categories:Viandes et dérivés, Viandes, Charcuteries, Poissons et viandes et oeufs, Frais, Abats, Produits tripiers, Boudins, Boudins noirs, Boudins noirs aux oignons, categories_old:Viandes et dérivés, Viandes, Charcuteries, Poissons et viandes et oeufs, Frais, Abats, Boudins, Produits tripiers, Boudins noirs, Boudins noirs aux oignons, categories_imported:Viandes, Charcuteries, ecoscore_data.agribalyse.name_fr:Boudin noir, rayon frais, generic_name_fr:Boudins noirs aux oignons, origins_old:Sang et gras de porc,Couenne,France, origins:France,Couenne,Sang et gras de porc, stores:Super U,Magasins U, product_name_fr_imported:Boudin noir aux oignons, labels_imported:Viande Française, Viande Porcine Française, Transformé en France, traces_imported:Œufs, Gluten, Lait, Mollusques, Moutarde, Fruits à coque, packaging_imported:Frais"
"""
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Inference mode: disables dropout so the prediction is deterministic.
model.eval()

# Truncate exactly like the training-time preprocessing so that inputs longer
# than the model's maximum sequence length do not raise.
inputs = tokenizer(text, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits

# Take the arg-max over the class axis (a single example is scored here),
# then map the class id back to its topic string.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

'chocolate, treat, product, snack'