<a href="https://colab.research.google.com/github/YUEWU1233/ML_French_Blancpain/blob/main/improved_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch


# Select the compute device: CUDA when a GPU is visible, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")


Using GPU: Tesla T4


In [None]:
import torch

# Report GPU availability and select the device.
# `device` is assigned on BOTH branches so the name is defined even when the
# runtime has no GPU (previously the CPU branch only printed a message).
if torch.cuda.is_available():
    print("GPU is available")
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU instead")
    device = torch.device("cpu")


GPU is available
Using GPU: Tesla T4


In [None]:
# Install necessary packages
!pip install transformers[torch] -U
!pip install accelerate -U
!pip install optuna
!pip install nltk

# Imports
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import optuna
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import wordnet
import random

# Fetch the WordNet corpora used by the synonym lookup, in the original order.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)


In [None]:

# Define the tokenize_data function to be used later in the script
def tokenize_data(tokenizer, texts, max_length=128):
    """Tokenize a batch of sentences with truncation and padding enabled.

    Args:
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``.
        texts: The sentences to encode, passed through to the tokenizer as-is.
        max_length: Maximum sequence length forwarded to the tokenizer.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length)

# Function to get synonyms
def get_synonyms(word, lang='fra'):
    """Return the WordNet synonyms of ``word`` in the given language.

    The lookup is done in ``lang`` (French by default) instead of WordNet's
    default English, so French words are matched against French lemmas.
    Non-English lookups need NLTK's multilingual WordNet data ('omw-1.4').

    Args:
        word: The word to look up.
        lang: Language code passed to ``wordnet.synsets`` and
            ``Synset.lemmas``. Defaults to ``'fra'``.

    Returns:
        A sorted list of distinct, lower-cased synonyms. Underscores and
        hyphens are turned into spaces and every other non-letter character
        is dropped. The word itself (compared case-insensitively) and
        synonyms that end up empty after cleaning are excluded.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word, lang=lang):
        for lemma in syn.lemmas(lang=lang):
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(ch for ch in candidate if ch.isalpha() or ch == ' ').strip()
            # Skip lemmas that were purely digits/punctuation and the word itself.
            if candidate and candidate != target:
                synonyms.add(candidate)
    # Sorted so the result does not depend on set iteration order.
    return sorted(synonyms)

# Data augmentation function
def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct words of ``sentence`` with a random synonym.

    Only whitespace-separated tokens made purely of letters are candidates,
    so tokens with attached punctuation or digits are left untouched. When a
    word is chosen, every occurrence of it in the sentence is replaced by the
    same synonym. Words for which ``get_synonyms`` returns nothing are
    skipped and do not count towards ``n``.

    Args:
        sentence: The input sentence.
        n: Maximum number of distinct words to replace. Values <= 0 leave the
            words unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    words = sentence.split()
    if n <= 0:
        # The limit used to be checked only after a replacement, so n <= 0
        # still replaced one word.
        return ' '.join(words)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below starts from a stable sequence (set order varies with
    # string hashing between interpreter runs).
    candidates = list(dict.fromkeys(word for word in words if word.isalpha()))
    random.shuffle(candidates)

    new_words = words
    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(list(synonyms))
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

def augment_data(data, augment_func, times=1, n=1):
    """Expand ``data`` with augmented variants of every sentence.

    For each sentence the output holds the sentence itself, immediately
    followed by ``times`` results of ``augment_func(sentence, n)``.

    Args:
        data: Iterable of sentences.
        augment_func: Callable taking ``(sentence, n)`` and returning a sentence.
        times: Number of augmented variants generated per sentence.
        n: Second argument forwarded to ``augment_func``.

    Returns:
        A list with ``times + 1`` entries per input sentence, in input order.
    """
    expanded = []
    for original in data:
        expanded.append(original)
        expanded.extend(augment_func(original, n) for _ in range(times))
    return expanded

# Read the training CSV and the unlabelled test CSV straight from GitHub
training_data_path = 'https://raw.githubusercontent.com/YUEWU1233/ML_French_Blancpain/main/training_data.csv'
training_data = pd.read_csv(training_data_path)

test_data_path = 'https://raw.githubusercontent.com/YUEWU1233/ML_French_Blancpain/main/unlabelled_test_data.csv'
test_data = pd.read_csv(test_data_path)

# One pretrained tokenizer per model of the ensemble
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer_camembert = CamembertTokenizer.from_pretrained('camembert-base')

# Preprocess, tokenize and augment data
AUGMENT_TIMES = 1   # augmented variants generated per original sentence
AUGMENT_WORDS = 1   # words replaced in each augmented variant
AUGMENT_SEED = 42   # seeds Python's RNG so the random draws repeat between runs
random.seed(AUGMENT_SEED)

train_sentences = list(training_data['sentence'])
augmented_sentences = augment_data(
    train_sentences, synonym_replacement, times=AUGMENT_TIMES, n=AUGMENT_WORDS
)
# augment_data emits each original followed by its AUGMENT_TIMES variants, so
# every label has to repeat AUGMENT_TIMES + 1 times (this was a hard-coded 2).
augmented_difficulties = [
    label for label in training_data['difficulty'] for _ in range(AUGMENT_TIMES + 1)
]
assert len(augmented_sentences) == len(augmented_difficulties), \
    "augmented sentences and labels are misaligned"

test_sentences = list(test_data['sentence'])

train_encodings_bert = tokenize_data(tokenizer_bert, augmented_sentences)
train_encodings_camembert = tokenize_data(tokenizer_camembert, augmented_sentences)
test_encodings_bert = tokenize_data(tokenizer_bert, test_sentences)
test_encodings_camembert = tokenize_data(tokenizer_camembert, test_sentences)

# Difficulty levels -> integer class ids, numbered in the order listed
difficulty_mapping = {
    level: class_id
    for class_id, level in enumerate(['A1', 'A2', 'B1', 'B2', 'C1', 'C2'])
}
y_train = list(map(difficulty_mapping.__getitem__, augmented_difficulties))

# Dataset class
class FrenchTextDataset(Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    must contain ``'input_ids'``, which defines the dataset length.
    ``labels`` is an optional per-sample sequence; when omitted, items carry
    no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested sample.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings of both tokenizers: training sets carry y_train, test sets have no labels
train_dataset_bert, train_dataset_camembert = (
    FrenchTextDataset(encodings, y_train)
    for encodings in (train_encodings_bert, train_encodings_camembert)
)
test_dataset_bert, test_dataset_camembert = (
    FrenchTextDataset(encodings)
    for encodings in (test_encodings_bert, test_encodings_camembert)
)

# Model initialization functions
def model_init_bert():
    """Load 'bert-base-multilingual-cased' with a 6-label classification head."""
    checkpoint = 'bert-base-multilingual-cased'
    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=6)
    return model

def model_init_camembert():
    """Load 'camembert-base' with a 6-label classification head."""
    checkpoint = 'camembert-base'
    model = CamembertForSequenceClassification.from_pretrained(checkpoint, num_labels=6)
    return model

# Use the same hyper-parameters for both models, but give each run its own
# output/logging directory so the two trainers do not write their checkpoint
# folders into the same place.
def make_training_args(run_name):
    """Build the shared TrainingArguments for one model of the ensemble.

    Args:
        run_name: Short identifier used as the sub-directory of ``./results``
            and ``./logs`` for this run.

    Returns:
        A ``TrainingArguments`` instance.
    """
    return TrainingArguments(
        output_dir=f'./results/{run_name}',
        num_train_epochs=30,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=64,  # also used by Trainer.predict
        warmup_steps=500,
        weight_decay=0.01,
        # No evaluation strategy is set: the only other dataset available here
        # is the unlabelled test set, which cannot yield a validation loss.
        # To monitor training, hold out a labelled split, pass it as
        # eval_dataset and enable per-epoch evaluation.
        save_strategy="epoch",
        save_total_limit=1,  # keep only the latest checkpoint; 30 epochs x 2 models would otherwise pile up on disk
        logging_dir=f'./logs/{run_name}',
        logging_steps=100,
    )

training_args_bert = make_training_args('bert')
training_args_camembert = make_training_args('camembert')
training_args = training_args_bert  # backward-compatible alias for the old shared name

# Initialize trainers for both models
trainer_bert = Trainer(
    model_init=model_init_bert,
    args=training_args_bert,
    train_dataset=train_dataset_bert,
)

trainer_camembert = Trainer(
    model_init=model_init_camembert,
    args=training_args_camembert,
    train_dataset=train_dataset_camembert,
)

# Fine-tune BERT first, then CamemBERT
for trainer in (trainer_bert, trainer_camembert):
    trainer.train()

# Run each fine-tuned model over its own encoding of the test set
predictions_bert = trainer_bert.predict(test_dataset_bert)
predictions_camembert = trainer_camembert.predict(test_dataset_camembert)

# Ensemble predictions by averaging
# Each model's outputs are turned into class probabilities with a softmax over
# the last axis, then the two probability arrays are averaged element-wise.
probs_bert = torch.nn.functional.softmax(torch.tensor(predictions_bert.predictions), dim=-1).numpy()
probs_camembert = torch.nn.functional.softmax(torch.tensor(predictions_camembert.predictions), dim=-1).numpy()
average_probs = (probs_bert + probs_camembert) / 2
predicted_labels = np.argmax(average_probs, axis=1)

# Invert the label mapping once, instead of rebuilding list(keys()) for every
# prediction and relying on dict insertion order matching the class ids.
index_to_difficulty = {index: name for name, index in difficulty_mapping.items()}
predicted_difficulties = [index_to_difficulty[int(label)] for label in predicted_labels]

# Build the two-column submission (id, difficulty) and write it without the index
submission = test_data[['id']].assign(difficulty=predicted_difficulties)
submission.to_csv('submission.csv', index=False)
print("Submission file created ok ! ;)")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably 

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Offer the submission file as a browser download when running on Colab.
# google.colab cannot be imported outside Colab, so the import is guarded
# instead of failing the cell after the file has already been written.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('submission.csv')
else:
    print("google.colab is not available; submission.csv was not downloaded.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>