In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
import tensorflow as tf


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import pandas as pd 
data = pd.read_csv(r"C:\Users\carlf\Documents\GitHub\lyrics_generator\data\05_lyrics_genius_\lyrics_genius.csv")

In [2]:
df = data.query("Artist == 'BOOBA'| Artist == 'La Fouine'")
df = df[["Lyrics", "Artist"]]
print(df.columns,
    df.shape)

Index(['Lyrics', 'Artist'], dtype='object') (857, 2)


In [3]:
df = df.dropna(subset=["Lyrics"])
df.shape

(845, 2)

In [6]:
artists_to_classify = ["BOOBA", "La Fouine"]

In [8]:
# Encoder les labels
df['Label'] = df['Artist'].apply(lambda x: artists_to_classify.index(x))

In [33]:
# Diviser en ensembles d'entraînement et de test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Lyrics'].tolist(),
    df['Label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['Label'],  # Utiliser les labels pour stratifier
)


In [31]:
# Calculer les proportions dans les ensembles d'entraînement et de test
import numpy as np

train_distribution = np.unique(train_labels, return_counts=True)
test_distribution = np.unique(test_labels, return_counts=True)

print("Train distribution:", dict(zip(train_distribution[0], train_distribution[1])))
print("Test distribution:", dict(zip(test_distribution[0], test_distribution[1])))

# Vérifiez si les proportions sont similaires
total_train = sum(train_distribution[1])
total_test = sum(test_distribution[1])

for cls, count in zip(train_distribution[0], train_distribution[1]):
    train_ratio = count / total_train
    test_ratio = dict(zip(test_distribution[0], test_distribution[1]))[cls] / total_test
    print(f"Class {cls}: Train ratio: {train_ratio:.2f}, Test ratio: {test_ratio:.2f}")


Train distribution: {0: 350, 1: 326}
Test distribution: {0: 87, 1: 82}
Class 0: Train ratio: 0.52, Test ratio: 0.51
Class 1: Train ratio: 0.48, Test ratio: 0.49


In [None]:
# Charger le tokenizer et le modèle

# model_name = "distilbert-base-uncased"
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All PyTorch model weights were used when initializing TFCamembertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFCamembertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:

# Tokeniser les données
def tokenize_function(examples):
    return tokenizer(examples, truncation=True, padding=True, max_length=128)

train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

In [35]:
# Convertir en Dataset TensorFlow
def to_tf_dataset(encodings, labels):
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    return dataset

train_dataset = to_tf_dataset(train_encodings, train_labels).shuffle(1000).batch(16)
test_dataset = to_tf_dataset(test_encodings, test_labels).batch(16)

In [36]:
# Compiler et entraîner le modèle
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

In [37]:
model.fit(
    train_dataset, 
    validation_data=test_dataset, 
    epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x209809c6ff0>

In [38]:
# Évaluer le modèle
predictions = model.predict(test_dataset).logits
predicted_labels = tf.argmax(predictions, axis=1).numpy()



# Résultat

Scoring de la classification en utilisant CamemBert

In [39]:
print("Classification Report:")
print(classification_report(test_labels, predicted_labels, target_names=artists_to_classify))

Classification Report:
              precision    recall  f1-score   support

       BOOBA       0.94      0.68      0.79        87
   La Fouine       0.74      0.95      0.83        82

    accuracy                           0.81       169
   macro avg       0.84      0.81      0.81       169
weighted avg       0.84      0.81      0.81       169



# Résultat

Scoring de la classification en utilisant DistilBert

In [19]:
print("Classification Report:")
print(classification_report(test_labels, predicted_labels, target_names=artists_to_classify))

Classification Report:
              precision    recall  f1-score   support

       BOOBA       0.82      0.88      0.85        88
   La Fouine       0.85      0.79      0.82        81

    accuracy                           0.83       169
   macro avg       0.84      0.83      0.83       169
weighted avg       0.84      0.83      0.83       169

