In [14]:
import pandas as pd
import string
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

def cleanse_french_text(text):
    # Convert to lowercase
    text = text.lower()


    # Remove stopwords
    words = text.split()
    cleaned_words = [word for word in words if word not in fr_stop]
    cleaned_text = ' '.join(cleaned_words)

    return cleaned_text

df = pd.read_csv('../data/esg_fr_classification.csv', encoding='utf-8', sep=',')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df['text'] = df['text'].apply(cleanse_french_text)



In [20]:
import torch
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification,Trainer, TrainingArguments


def load_and_preprocess_data(df, sample_size=0.1):
    
    df_sampled = df.sample(frac=sample_size)
    texts = df_sampled['text'].tolist()
    labels = df_sampled['esg_category'].tolist()
        
    return texts, labels    


# Function to tokenize data
def tokenize_data(texts, labels, tokenizer):
    tokenized_data = tokenizer(texts, padding=True, truncation=True, max_length=512)
    dataset = torch.utils.data.TensorDataset(torch.tensor(tokenized_data['input_ids']), torch.tensor(labels))
    return dataset


def fine_tune_camembert(filepath, sample_size=0.1):
    texts, labels = load_and_preprocess_data(filepath, sample_size=sample_size)
    train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    train_dataset = tokenize_data(train_texts, train_labels, tokenizer)
    val_dataset = tokenize_data(val_texts, val_labels, tokenizer)

    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(set(labels)))

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,  # Reduced batch size for CPU
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        evaluate_during_training=True,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    model.save_pretrained('./camembert_finetuned_esg')

    return model

In [21]:
model = fine_tune_camembert(df, sample_size=0.1)

1699 1699


ValueError: too many dimensions 'str'