In [1]:
import os
import pandas as pd


# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload

# notebook will reload external python modules;
%autoreload 2 

In [3]:

df = pd.read_csv('../data/esg_fr_classification.csv', encoding='utf-8', sep=',')

# rename text_fr column to text and esg_category to label
df = df.rename(columns={'text_fr': 'text', 'esg_category': 'label'})
print(df.head())

df['label'].value_counts()

                                                text            label
0  Les niveaux de dioxyde de carbone intérieur po...  environnemental
1  Le prix de l'électricité est déroutant sanglan...  environnemental
2  La migration de masse n'est pas une crise: c'e...  environnemental
3  La rupture du climat arrive. Le Royaume-Uni a ...  environnemental
4  Le changement climatique forçant déjà les oise...  environnemental


label
environnemental    4428
non-esg            4306
social             4228
gouvernance        4024
Name: count, dtype: int64

In [4]:
def is_text_within_limit(text, max_position_embeddings=1024):
    tokens = text.split() # A simple whitespace tokenizer
    l = len(tokens)
    return len(tokens) <= max_position_embeddings

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split


# shuffles rows
dataset = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
train_size = 0.6
test_size = 0.2
validation_size = 0.2

# train, test and validation split
df_train, df_temp = train_test_split(df, train_size=train_size, stratify=df.label, random_state=42)

df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=df_temp.label, random_state=42)

print(f"train size: {df_train.shape} \t test size: {df_test.shape} \t validation size: {df_val.shape}")

train size: (10191, 2) 	 test size: (3398, 2) 	 validation size: (3397, 2)


In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForMaskedLM.from_pretrained("camembert-base")

In [8]:
import torch
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Convert the Pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
val_dataset = Dataset.from_pandas(df_val)


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Assuming df is your DataFrame with 'text' and 'label' columns
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/10191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3398 [00:00<?, ? examples/s]

Map:   0%|          | 0/3397 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    optim= "adamw_torch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset
)

trainer.train()

# Evaluate on validation dataset
validation_results = trainer.evaluate(eval_dataset=val_dataset)

# Evaluate on test dataset
test_results = trainer.evaluate(eval_dataset=test_dataset)

  0%|          | 0/1911 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
from transformers import CamembertConfig, CamembertModel

# Initializing a Camembert camembert-base style configuration
configuration = CamembertConfig( max_position_embeddings=1024)

# Initializing a model (with random weights) from the camembert-base style configuration
model = CamembertModel(configuration)

# Accessing the model configuration
configuration = model.config

model