# Free Attempt for the DMML Project, Ahmed and Joris
Inspired by: https://www.kaggle.com/code/houssemayed/camembert-for-french-tweets-classification/notebook

We will use CamemBERT to do the classification. Note: You will need to use a GPU on colab to run this computationally heavy model!

First, load packages and data

In [1]:
!pip install transformers
!pip install sentencepiece

# read data and import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")

!pip install -U spacy

# import some additional packages
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn. preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import torch.nn as nn
import torch
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# train data
df_train = pd.read_csv('training_data.csv')

# Rename Labeling
df_train['difficulty'] = df_train['difficulty'].replace(['A1','A2','B1', 'B2', 'C1', 'C2'],[0,1,2,3,4,5])

# test data
df_test = pd.read_csv('unlabelled_test_data.csv')


# Imports to use a pipeline
from tqdm import tqdm, trange
from tensorflow import keras 

# Imports for modelling
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertModel
from transformers import BertTokenizer, BertForSequenceClassification


# Evaluate the model
def evaluate(true, pred):
    precision = precision_score(true, pred, average = 'weighted')
    recall = recall_score(true, pred, average = 'weighted')
    f1 = f1_score(true, pred, average = 'weighted')
    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 33.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K   

# Set Parameters, load pretrained Tokenizer

In [8]:
# These parameters are not tuned, but we checked a few possiblities to get close to the ideal value
epochs = 5
MAX_LEN = 64
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Here we load the CamemBERT tokenizer and use lower_case
tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-large', do_lower_case=True)

# As list is required as input
text = df_train['sentence'].to_list()
labels = df_train['difficulty'].to_list()


# Preparing Input for the CamemBERT model


In [13]:
# CamamBERT tokenizer is used to transform input_ids'
input_ids  = [tokenizer.encode(sent,add_special_tokens=True,max_length=MAX_LEN) for sent in text]

# the input tokens are padded
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []

# Use 0s for padding and creating a mask of 1s
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]  
    attention_masks.append(seq_mask)


# Split 80/20 Train/Validation
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(input_ids, labels, attention_masks,
                                                            random_state=42, test_size=0.2)


# torch tensors is the required datatype for the BERT model, so change input
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create an iterator, as it will use less memory than a loop
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


# Load and prepare CamemBERT model



In [14]:
# Load model and set it to device
model = CamembertForSequenceClassification.from_pretrained("camembert/camembert-large", num_labels=6)
model.to(device)

# Parameter settings
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

# Use AdamW as optimizer
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

# Calculate the accuracy, simply assign the text difficulty to the one with the highest probability (argmax)
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Linear scheduler for warmup
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_labels)*epochs)

# Store loss
train_loss_set = []


Some weights of the model checkpoint at camembert/camembert-large were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['cl

# Train already pre-trained model

In [15]:
# for loop over each epoch (5)
for _ in trange(epochs, desc="Epoch"):  

    # Keep track of variables during the training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    # Starting trained the pre-trained BERT model on our data set
    model.train()
    for step, batch in enumerate(train_dataloader):

        # Assign batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Optain inputs from the previously made dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        # Obtain loss value
        loss = outputs[0]
        # Append to training loss set
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Step optimizer (gradient) and scheduler
        optimizer.step()
        scheduler.step()

        # Update variables that track results
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    

    # Check the results on the validation set
    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Put model in validation mode
    model.eval()
    # Evaluation results for one epoch
    for batch in validation_dataloader:
        # Batch assigned to GPU
        batch = tuple(t.to(device) for t in batch)
        # unpack inputs
        b_input_ids, b_input_mask, b_labels = batch
        # don't compute/store gradients
        with torch.no_grad():
            # Forward pass, and calculate predictions
            outputs =  model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = outputs[:2]
    
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    # Print resulting validation accuracy
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

# Note, the validation accuracy changes every time you re-run it

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 1.4416452964146933


Epoch:  20%|██        | 1/5 [02:44<10:56, 164.16s/it]

Validation Accuracy: 0.4239583333333333
Train loss: 1.0896249944965044


Epoch:  40%|████      | 2/5 [05:27<08:11, 163.70s/it]

Validation Accuracy: 0.49375
Train loss: 0.9461640218893687


Epoch:  60%|██████    | 3/5 [08:10<05:27, 163.57s/it]

Validation Accuracy: 0.5645833333333333
Train loss: 0.8505047939717769


Epoch:  80%|████████  | 4/5 [10:54<02:43, 163.42s/it]

Validation Accuracy: 0.5760416666666667
Train loss: 0.7996513730535905


Epoch: 100%|██████████| 5/5 [13:37<00:00, 163.51s/it]

Validation Accuracy: 0.528125





# Check performance on Validation set

In [12]:
# Evaluate on validation set
eval_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs =  model(validation_inputs.to(device),token_type_ids=None, attention_mask=validation_masks.to(device))
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    eval_pred.extend(np.argmax(logits, axis=1).flatten())

evaluate(validation_labels, eval_pred)

CONFUSION MATRIX:
[[ 90  73   3   0   0   0]
 [ 14 115  28   0   1   0]
 [  8  68  83   4   1   2]
 [  0   5  52  58  32   6]
 [  0   1  14  36  88  13]
 [  0   1   9  20  61  74]]
ACCURACY SCORE:
0.5292
CLASSIFICATION REPORT:
	Precision: 0.5752
	Recall: 0.5292
	F1_Score: 0.5320


# After Training the model, make predictions

Use the format from the sample submission.

In [7]:
comments = df_test['sentence'].to_list()

# Encode the comments
tokenized_comments_ids = [tokenizer.encode(comment,add_special_tokens=True,max_length=MAX_LEN) for comment in comments]
# Pad the resulted encoded comments
tokenized_comments_ids = pad_sequences(tokenized_comments_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks 
attention_masks = []
for seq in tokenized_comments_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

prediction_inputs = torch.tensor(tokenized_comments_ids)
prediction_masks = torch.tensor(attention_masks)

# Apply the finetuned model (Camembert)
flat_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs =  model(prediction_inputs.to(device),token_type_ids=None, attention_mask=prediction_masks.to(device))
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    flat_pred.extend(np.argmax(logits, axis=1).flatten())

df_samplesub = pd.read_csv('sample_submission.csv')
df_samplesub['difficulty'] = flat_pred

# Translate back
df_samplesub['difficulty'] = df_samplesub['difficulty'].replace([0,1,2,3,4,5],['A1','A2','B1', 'B2', 'C1', 'C2'])

# Upload pandas dataframe as csv to drive
# from google.colab import drive
# drive.mount('drive')

# df_samplesub.to_csv('FinalTest4.csv', index = False)
# !cp FinalTest4.csv "drive/My Drive/"
