In [4]:
import pandas as pd
from transformers import CamembertTokenizer, CamembertForSequenceClassification
import torch
import string
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

def cleanse_french_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords
    words = text.split()
    cleaned_words = [word for word in words if word not in fr_stop]
    cleaned_text = ' '.join(cleaned_words)

    return cleaned_text

# Load the tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForSequenceClassification.from_pretrained("camembert-base")

# Load your dataset
df = pd.read_csv('../data/esg_fr_classification.csv', encoding='utf-8', sep=',')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['text'] = df['text'].apply(cleanse_french_text)
df['label'] = df['esg_category'].factorize()[0] 


texts = df['text'].tolist()
labels = df['label'].tolist()


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

from tqdm.auto import tqdm


# Split dataset into training and testing
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2)

# Tokenize the test set
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)


# Convert your input data into PyTorch tensors
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Create a TensorDataset
dataset = TensorDataset(input_ids, attention_mask)

# Create a DataLoader
data_loader = DataLoader(dataset, batch_size=32) 

# Load your model
model = CamembertForSequenceClassification.from_pretrained("camembert-base")

# Predict in batches
model.eval()  # Set the model to evaluation mode
predictions = []
for batch in tqdm(data_loader, desc="Making Predictions"):
    input_ids, attention_mask = batch

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        batch_predictions = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(batch_predictions.cpu().numpy())


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.

Making Predictions:   0%|          | 0/531 [00:00<?, ?it/s]

KeyboardInterrupt: 