# Finetuning CamemBERT on Keywords Extraction Task using Custom Dataset and WikiNews French Keywords Dataset

In this notebook, we will see how to fine-tune CamemBERT on a Keywords Extraction task. We will use the CamemBERT model, fine-tuned on the Semantic Similarity task, as a starting point, then we will fine-tune it on the Keywords Extraction task. Alternatively, we will also see how to fine-tune CamemBERT on the Keywords Extraction task from scratch. 

## Loading and Preprocessing

In [1]:
import torch
import pandas as pd
from transformers import AutoModelForTokenClassification
import lightning.pytorch as pl
from transformers import AutoTokenizer
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
import os
import json

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Load config json
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(root_dir)

with open("config.json") as f:
    config = json.load(f)
    
data_config = config['data']
dataset = os.path.join(data_config['data_folder'], data_config['KE_DATASET']['name'])

data = {
    "train": os.path.join(dataset, "train.csv"),
    "val": os.path.join(dataset, "val.csv"),
    "test": os.path.join(dataset, "test.csv"),
}

def preprocess(data):
    data = data.dropna()
    data = data[data['keywords'] != '']
    data = data.drop_duplicates(subset=['keywords'])
    data['keywords'] = data['keywords'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').replace(',', '\t'))
    data['keywords'] = data['keywords'].apply(lambda x: x[:-1] if x[-1] == ' ' else x)
    return data


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "camembert-base"
max_length = 256
#"camembert/camembert-large"
# "yanekyuk/camembert-keyword-extractor"


class MyDataModule(pl.LightningDataModule):
    def __init__(self, train_csv, val_csv, test_csv, batch_size):
        super().__init__()
        self.train_csv = train_csv
        self.val_csv = val_csv
        self.test_csv = test_csv
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        self.train_data = MyDataset(self.train_csv, self.tokenizer)
        self.val_data = MyDataset(self.val_csv, self.tokenizer)
        self.test_data = MyDataset(self.test_csv, self.tokenizer)

    def train_dataloader(self):
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_data, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_data, batch_size=self.batch_size)

class MyDataset(Dataset):
    def __init__(self, csv_file, tokenizer):
        self.data = preprocess(pd.read_csv(csv_file))
        self.tokenizer = tokenizer
        self.label2id = {"B-KEY": 1, "I-KEY": 2, "O": 0}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        keywords = self.data.iloc[index]['keywords'].split('\t')
        
        tokens = self.tokenizer.tokenize(text,truncation=True, padding=True, max_length=max_length)
        labels = ["O"] * len(tokens)

        for keyword in keywords:
            keyword_tokens = self.tokenizer.tokenize(keyword,truncation=True, padding=True, max_length=max_length)
            # find the start index of the keyword in the tokens
            keyword_start_idx = -1
            for i in range(len(tokens) - len(keyword_tokens) + 1):
                if tokens[i:i+len(keyword_tokens)] == keyword_tokens:
                    keyword_start_idx = i
                    break
            if keyword_start_idx >= 0:
                labels[keyword_start_idx] = "B-KEY"
                for i in range(keyword_start_idx+1, keyword_start_idx+len(keyword_tokens)):
                    labels[i] = "I-KEY"

        label_ids = [self.label2id[label] for label in labels]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)
        padding_length = max_length - len(input_ids)

        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        label_ids = label_ids + ([0] * padding_length)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(label_ids, dtype=torch.long)
        }
    

# Create dataloaders

batch_size = 4
dm = MyDataModule(data['train'], data['val'], data['test'], batch_size)
dm.setup()

# Retrieve a batch of data
batch = next(iter(dm.train_dataloader()))
print(batch['input_ids'].shape)

torch.Size([4, 256])


In [5]:
class MyModel(pl.LightningModule):
    def __init__(self, num_labels):
        super().__init__()
        self.save_hyperparameters()
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)       
        self.model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=self.num_labels)
        

    def forward(self, input_ids, attention_mask, labels):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self.forward(batch['input_ids'], batch['attention_mask'], batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss)
        acc = (outputs.logits.argmax(-1) == batch['labels']).float().mean()
        self.log('train_acc', acc)
        f1 = f1_score(batch['labels'].cpu().numpy().flatten(), outputs.logits.argmax(-1).cpu().numpy().flatten(), average='macro')
        self.log('train_f1', f1)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(batch['input_ids'], batch['attention_mask'], batch['labels'])
        loss = outputs.loss
        acc = (outputs.logits.argmax(-1) == batch['labels']).float().mean()
        f1 = f1_score(batch['labels'].cpu().numpy().flatten(), outputs.logits.argmax(-1).cpu().numpy().flatten(), average='macro')
        self.log('val_loss', loss)
        self.log('val_acc', acc)
        self.log('val_f1', f1)
        
    
        return loss

    def test_step(self, batch, batch_idx):
        outputs = self.forward(batch['input_ids'], batch['attention_mask'], batch['labels'])
        loss = outputs.loss
        self.log('test_loss', loss)
        acc = (outputs.logits.argmax(-1) == batch['labels']).float().mean()
        self.log('test_acc', acc)
        f1 = f1_score(batch['labels'].cpu().numpy().flatten(), outputs.logits.argmax(-1).cpu().numpy().flatten(), average='macro')
        self.log('test_f1', f1)
        return loss

    def configure_optimizers(self):
        #return AdamW(self.parameters(), lr=5e-5)
        return Adam(self.parameters(), lr=2e-5, eps=1e-08, betas=(0.9, 0.999))
    
    def infer(self, text):
        tokens = self.tokenizer.tokenize(text,truncation=True, padding=True, max_length=max_length)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)
        outputs = self.model(torch.tensor([input_ids]), torch.tensor([attention_mask]))
        # return keyword tokens and labels
        return outputs.logits.argmax(-1)
    
    
    

# Create model
model = MyModel(num_labels=3)

# Create trainer 

trainer = pl.Trainer(accelerator='auto', max_epochs=7, devices=[0], accumulate_grad_batches=8)

# Train model

trainer.fit(model, dm)

# Test model

trainer.test(model, datamodule=dm)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream tas

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:  10%|█         | 185/1806 [00:52<07:40,  3.52it/s, v_num=1]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 227/227 [00:25<00:00,  9.01it/s]


[{'test_loss': 0.30427297949790955,
  'test_acc': 0.9807493090629578,
  'test_f1': 0.3410547851214217}]

In [6]:
def extract_keywords(text, model):
    # use infer method to get the labels
    labels = model.infer(text)[0]
    # get the tokens from the text
    tokens = model.tokenizer.tokenize(text,truncation=True, padding=True, max_length=max_length)
    # get the keywords from the tokens and labels
    keywords = []
    for i in range(len(tokens)):
        if labels[i] == 1:
            keyword = tokens[i]
            for j in range(i+1, len(tokens)):
                if labels[j] == 2:
                    keyword += " " + tokens[j]
                else:
                    break
            # convert the keyword to the original string
            keyword = model.tokenizer.convert_tokens_to_string(model.tokenizer.tokenize(keyword))
            keywords.append(keyword)

    return keywords


text2 = "Je vous avais préparé encore un autre exercice pour s'habituer un petit peu avec les techniques d'enregistrement comptable des opérations commerciales. Vous avez ici un bilan initial, construction 100 mèles euros, client 15 mèles euros, banque 13 mèles euros, caisse 2 mèles euros, au total 130 mèles euros, capital 100 mèles euros, fournisseur 30 mèles euros, total la même chose. Voilà, on a ici un bilan initial, tout simple, et on va essayer de voir un petit peu ce qui est demandé dans l'exercice. Donc ici vous avez les opérations, les opérations que l'entreprise a effectuées, et tout en bas on a un travail à faire. Le travail à faire c'est quoi ? On va effectuer l'ouverture des comptes, on va enregistrer les opérations dans les comptes en T, et puis on va calculer le résultat de l'entreprise, on va établir la balance définitive, et au final établir le bilan final. J'espère bien arriver à un bilan final équilibré. J'essaierai de me concentrer pour que je ne fasse pas d'erreurs, parce que dans ce genre d'exercice, une simple erreur quelque part peut aboutir à un bilan final déséquilibré, c'est-à-dire total actif et différent de total passif."

extract_keywords(text2, model)



[]