# Installation

### Configuration and imports

In [None]:
# Google Colab configuration
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install Kaggle
! pip install kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023

# Document importation
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

!pip install torch
!pip install transformers
!pip install pandas
!pip install sklearn
!pip install sentencepiece

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import sentencepiece as spm

Mounted at /content/drive
mkdir: cannot create directory ‘/root/.kaggle’: File exists
detecting-french-texts-difficulty-level-2023.zip: Skipping, found more recently modified local copy (use --force to force download)
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


### DataFrames visualisation

In [None]:
# Labelled training data: read the CSV, then promote the 'id' column to the index
df = pd.read_csv('training_data.csv').set_index('id')
df

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


In [None]:
# Unlabelled sentences to predict: read the CSV, then promote the 'id' column to the index
df_pred = pd.read_csv('unlabelled_test_data.csv').set_index('id')
df_pred

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,Nous dûmes nous excuser des propos que nous eû...
1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,"Et, paradoxalement, boire froid n'est pas la b..."
3,"Ce n'est pas étonnant, car c'est une saison my..."
4,"Le corps de Golo lui-même, d'une essence aussi..."
...,...
1195,C'est un phénomène qui trouve une accélération...
1196,Je vais parler au serveur et voir si on peut d...
1197,Il n'était pas comme tant de gens qui par pare...
1198,Ils deviennent dangereux pour notre économie.


# Coding part

### Model definition

In [None]:
# Mapping from difficulty label to the integer class id used for training
difficulty_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}

# Encode the 'difficulty' column of the training dataset.
# Values that are not keys of the mapping (i.e. integers produced by a previous run of this cell)
# are left untouched, so re-running the cell does not turn the whole column into NaN.
df['difficulty'] = df['difficulty'].map(lambda level: difficulty_mapping.get(level, level))

# Fail early, with an explicit message, if the column holds anything other than the known class ids
unexpected_levels = set(df['difficulty'].unique()) - set(difficulty_mapping.values())
if unexpected_levels:
    raise ValueError(f"Unexpected difficulty values in training data: {unexpected_levels}")

# Divide the dataframe into a training set (80%) and a held-out test set (20%)
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Define tokenizer and CamemBERT model.
# The number of output classes comes from the mapping itself, so it stays correct even if
# some class happens to be absent from the data.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=len(difficulty_mapping))

# Dataset wrapper that turns labelled dataframe rows into model-ready tensors
class CustomDataset(Dataset):
    """Serve tokenized sentences and their integer labels from a dataframe.

    The dataframe must provide a 'sentence' column and a 'difficulty' column
    whose values can be converted with int().
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        """Keep references to the dataframe, the tokenizer and the padded sequence length."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the 'input_ids', 'attention_mask' and 'labels' tensors for the row at position `index`."""
        row = self.data.iloc[index]
        text = str(row['sentence'])
        target = int(row['difficulty'])

        # Pad/truncate every sentence to max_length so that samples can be stacked into batches
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tokenizer output is flattened to a 1-D tensor per field
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training and evaluation

In [None]:
# Seed PyTorch so that batch shuffling and dropout are repeatable from one run to the next.
# NOTE: the classification head was randomly initialised when the model was loaded, so this
# seed does not cover that step.
torch.manual_seed(42)

# Create datasets using the custom dataset class and the tokenizer
train_dataset = CustomDataset(train_df, tokenizer)
test_dataset = CustomDataset(test_df, tokenizer)

# Create data loaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize the AdamW optimizer with the chosen learning rate and weight decay.
# The PyTorch implementation is used because transformers.AdamW is deprecated in its favour.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-05, weight_decay=0.01)
num_epochs = 3  # Number of passes over the training set

# Determine the device to use for training (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
model.to(device)

# Training loop over multiple epochs
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0.0
    # Iterate over batches in the training data
    for batch in train_loader:
        # Move batch data to the selected device
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Zero the gradients, run the forward pass, then back-propagate and update the weights.
        # Passing `labels` makes the model return the loss alongside the logits.
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean batch loss so that training progress is visible
    print(f"Epoch {epoch + 1}/{num_epochs} - mean training loss: {epoch_loss / max(len(train_loader), 1):.4f}")

# Switch to evaluation mode before scoring the held-out split
model.eval()

# Accumulators for the predicted and the reference class ids
predictions, true_labels = [], []

# Score the test set batch by batch; gradients are not needed for inference
with torch.no_grad():
    for eval_batch in test_loader:
        token_ids = eval_batch['input_ids'].to(device)
        mask = eval_batch['attention_mask'].to(device)
        batch_logits = model(token_ids, attention_mask=mask).logits
        # The predicted class is the index of the largest logit in each row
        predictions.extend(batch_logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(eval_batch['labels'].cpu().numpy())

# Compute the metrics on the test set (precision/recall/F1 are weighted by class support)
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

# Print the results
print(f"Accuracy on test set: {accuracy}")
print(f"Precision on test set: {precision}")
print(f"Recall on test set: {recall}")
print(f"F1 Score on test set: {f1}")



Accuracy on test set: 0.5635416666666667
Precision on test set: 0.5834116801531931
Recall on test set: 0.5635416666666667
F1 Score on test set: 0.5583648203275025


### Application of the model on the prediction dataset

In [None]:
# Set the model to evaluation mode before running inference on the unlabelled data
# (evaluation mode turns off training-only behaviour such as dropout)
model.eval()

# Dataset wrapper for unlabelled data: same tokenization as for training, but no 'labels' entry
class PredictionDataset(Dataset):
    """Serve tokenized sentences from a dataframe that has a 'sentence' column."""

    def __init__(self, dataframe, tokenizer, max_length=128):
        """Keep references to the dataframe, the tokenizer and the padded sequence length."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the 'input_ids' and 'attention_mask' tensors for the row at position `index`."""
        text = str(self.data.iloc[index]['sentence'])

        # Pad/truncate every sentence to max_length so that samples can be stacked into batches
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tokenizer output is flattened to a 1-D tensor per field
        return {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}

# Wrap the unlabelled dataframe and iterate over it in its original order
pred_dataset = PredictionDataset(df_pred, tokenizer)
pred_loader = DataLoader(pred_dataset, batch_size=8, shuffle=False)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Predicted class ids, collected in row order
predictions = []

# Inference loop; gradients are not needed
with torch.no_grad():
    for pred_batch in pred_loader:
        token_ids = pred_batch['input_ids'].to(device)
        mask = pred_batch['attention_mask'].to(device)
        batch_logits = model(token_ids, attention_mask=mask).logits
        # The predicted class is the index of the largest logit in each row
        predictions.extend(batch_logits.argmax(dim=1).cpu().numpy())

# Store the predicted class ids in a new 'difficulty' column and display the dataframe
df_pred['difficulty'] = predictions
df_pred

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Nous dûmes nous excuser des propos que nous eû...,5
1,Vous ne pouvez pas savoir le plaisir que j'ai ...,1
2,"Et, paradoxalement, boire froid n'est pas la b...",2
3,"Ce n'est pas étonnant, car c'est une saison my...",1
4,"Le corps de Golo lui-même, d'une essence aussi...",5
...,...,...
1195,C'est un phénomène qui trouve une accélération...,2
1196,Je vais parler au serveur et voir si on peut d...,1
1197,Il n'était pas comme tant de gens qui par pare...,5
1198,Ils deviennent dangereux pour notre économie.,3


In [None]:
# Build the inverse of difficulty_mapping (integer class id -> difficulty label)
inverse_difficulty_mapping = {v: k for k, v in difficulty_mapping.items()}

# Translate the predicted class ids in df_pred['difficulty'] back to their labels.
# Values that are not class ids (i.e. labels produced by a previous run of this cell) are
# left untouched, so re-running the cell does not turn the predictions into NaN.
df_pred['difficulty'] = df_pred['difficulty'].map(
    lambda code: inverse_difficulty_mapping.get(code, code)
)


In [None]:
# Keep only the index and the predicted 'difficulty' column for the submission.
# errors='ignore' makes the cell safe to re-run once 'sentence' has already been dropped.
df_pred = df_pred.drop(columns=['sentence'], errors='ignore')

In [None]:
# Write the predictions to 'submission.csv' (the dataframe index is written as the first column)
df_pred.to_csv('submission.csv')

# Upload the file to the Kaggle competition through the CLI; "UNIL_Rolex" is the submission message
! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "UNIL_Rolex"

100% 8.30k/8.30k [00:02<00:00, 3.64kB/s]
Successfully submitted to Detecting the difficulty level of French texts