In [1]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/My Drive/EMLyon/Engie

/content/drive/My Drive/EMLyon/Engie


In [3]:
#Load the file
file_path = 'Verbatims SAT Digitaux 2023-2024 - Feuille 1.csv'
df = pd.read_csv(file_path)

# Display the head
df.head()

Unnamed: 0,user ID,Device_type,date,VERBATIMS (NEGATIFS/POSITIFS),note,THEME,CODIFICATION
0,63fecea0f5273004f9004c69,Responsive,01/03/2023,Complexe / -,3,Ergonomie,Ergonomie
1,63fed20e8a1b4e04f6004b82,Desktop,01/03/2023,"- / site très clair, facile a comprendre et à ...",5,Ergonomie,
2,63fed74eadb28b04f9004b5b,Desktop,01/03/2023,- / Un tableau de bord pratique. Un suivi effi...,5,Suivi Conso,
3,63fee98c6f72f304fa005851,Responsive,01/03/2023,- / Ergonomie \nDonnées \nFactures dispo \nOn ...,4,Ergonomie,
4,63fef00a33f0fe04fb0055a0,Desktop,01/03/2023,- / clair et rapide pour évaluer la conso,4,Suivi Conso,


In [4]:
# Distribution of THEME
theme_counts = df['THEME'].value_counts().reset_index()
theme_counts.columns = ['THEME', 'Count']

theme_fig = px.bar(theme_counts, x='Count', y='THEME', orientation='h', title='Distribution of Themes')
theme_fig.update_layout(
    yaxis_title='Theme',
    xaxis_title='Count',
    yaxis=dict(tickmode='linear'),
    margin=dict(l=150, r=50, t=50, b=50),
    font=dict(size=12)
)
theme_fig.show()

# Distribution of note
note_fig = px.histogram(df[df["note"]!="note_num"], x='note', title='Distribution of Notes', nbins=10)
note_fig.update_layout(
    xaxis_title='Note',
    yaxis_title='Count',
    font=dict(size=12)
)
note_fig.show()

In [5]:
# Check rows with missing values in 'THEME' and 'VERBATIMS'
missing_theme = df['THEME'].isna().sum()
missing_verbatims = df['VERBATIMS (NEGATIFS/POSITIFS)'].isna().sum()
missing_notes = df['note'].isna().sum()
# Display the rows with missing values in 'THEME'
print("Rows with missing values in 'THEME':")
print(missing_theme)

# Display the rows with missing values in 'VERBATIMS (NEGATIFS/POSITIFS)'
print("\nRows with missing values in 'VERBATIMS (NEGATIFS/POSITIFS)':")
print(missing_verbatims)

# Display the rows with missing values in 'note'
print("\nRows with missing values in 'note':")
print(missing_notes)

Rows with missing values in 'THEME':
2206

Rows with missing values in 'VERBATIMS (NEGATIFS/POSITIFS)':
0

Rows with missing values in 'note':
0


In [6]:
#Dataset without missing values
df_filtered = df[~df['THEME'].isna()]

In [7]:
df_filtered.duplicated().sum()

1

In [8]:
df_filtered.drop_duplicates(inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
X = df_filtered['VERBATIMS (NEGATIFS/POSITIFS)']
y = df_filtered['THEME']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the train and test sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (17764,)
X_test shape: (4442,)
y_train shape: (17764,)
y_test shape: (4442,)


In [13]:
import emoji

emoji_list = []

def contains_emoji(text):
    emojis = [char for char in text if char in emoji.EMOJI_DATA]
    if emojis:
        emoji_list.extend(emojis)
        return True
    return False

# Check each tweet for emojis
emoji_presence = [contains_emoji(tweet) for tweet in X]

# Combine the tweets with their emoji presence status
tweets_with_emoji_status = list(zip(X, emoji_presence))

# Display the results
emoji_count = 0
for tweet, has_emoji in tweets_with_emoji_status:
    if has_emoji:
        emoji_count += 1
        #print(f"Tweet: {tweet}\nContains Emoji: {has_emoji}\n")

print(f"Total tweets with emojis: {emoji_count}")
print(f"Emojis found: {emoji_list}")


Total tweets with emojis: 8
Emojis found: ['™', '😢', '☺', '😏', '😁', '😡', '🥴', '🥴', '🥴', '🥴', '🥴', '😊', '😤']


In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import emoji

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Set of French stopwords
stop_words = set(stopwords.words('french'))
# Initialize French stemmer (as lemmatizer for French is less commonly used)
stemmer = FrenchStemmer()

# Dictionary to convert common emojis to French words
emoji_dict = {
    '™': 'marque',
    '😢': 'triste',
    '☺': 'sourire',
    '😏': 'sourire narquois',
    '😁': 'grand sourire',
    '😡': 'colère',
    '🥴': 'désorienté',
    '😊': 'sourire',
    '😤': 'frustré'
}

def convert_emojis(text):
    for emot in emoji_dict:
        text = text.replace(emot, emoji_dict[emot])
    return text

def clean_text(text):
    # Convert non-string to string
    text = str(text)

    # Parse HTML
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # Convert emojis to French words
    text = convert_emojis(text)

    # Lowercasing
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove hyperlinks
    text = re.sub(r'https?://\S+', '', text)

    # Remove user tags
    text = re.sub(r'@\w+', '', text)

    # Reduce multiple exclamations or question marks to one
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)

    # Tokenize (split text into a list of words) while preserving punctuation
    words = word_tokenize(text, language='french')

    # Ensure punctuation is separate tokens
    words = [word if word not in ['!', '?'] else f' {word} ' for word in words]
    words = ' '.join(words).split()

    # Remove stopwords and perform stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into one string
    text = ' '.join(Rwords)

    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ataavlar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ataavlar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
X_train = [clean_text(text) for text in X_train]
X_test = [clean_text(text) for text in X_test]


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [16]:
# Define the pipelines
pipelines = {
    'logistic_regression': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000))
    ]),
    'random_forest': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier())
    ])
}

# Train and evaluate the models
for name, pipeline in pipelines.items():
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train)
    y_pred_train = pipeline.predict(X_train)
    y_pred = pipeline.predict(X_test)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    print(f"Train Accuracy: {accuracy_train:.4f}")

    print(f"Classification report for {name}:")
    print(classification_report(y_test, y_pred))
    print("\n")


Training logistic_regression...
Train Accuracy: 0.7433
Classification report for logistic_regression:
                                           precision    recall  f1-score   support

                                        -       0.75      0.13      0.22        23
                                 Accès EC       0.65      0.35      0.45       121
                           Accès EC (AEC)       0.56      0.44      0.49        57
                       Application mobile       0.31      0.17      0.22        24
             Contact & Suivi des demandes       0.55      0.32      0.40       128
       Contact & Suivi des demandes (CSD)       0.00      0.00      0.00         9
                Dysfonctionnement général       0.46      0.16      0.24        37
          Dysfonctionnement général (DYS)       0.00      0.00      0.00        18
                         Engie en général       0.60      0.06      0.11        49
                                Ergonomie       0.76      0.93     


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Train Accuracy: 0.9627
Classification report for random_forest:
                                           precision    recall  f1-score   support

                                        -       0.25      0.78      0.38        23
                                 Accès EC       0.50      0.22      0.31       121
                           Accès EC (AEC)       0.44      0.54      0.49        57
                       Application mobile       0.22      0.08      0.12        24
             Contact & Suivi des demandes       0.51      0.34      0.40       128
       Contact & Suivi des demandes (CSD)       1.00      0.11      0.20         9
                Dysfonctionnement général       0.33      0.22      0.26        37
          Dysfonctionnement général (DYS)       0.33      0.06      0.10        18
                         Engie en général       0.33      0.12      0.18        49
                                Ergonomie       0.81      0.88      0.84      1858
              Factures


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [17]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-macosx_14_0_universal2.whl size=662657 sha256=b88f2b5a437e3fd335a34c3d6650fdeea8e0da089d868d073da19591fd21c8be
  Stored in directory: /Users/ataavlar/Library/Caches/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.12.0
Note: you may need to restart the kernel to use up

In [24]:
import fasttext
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load pre-trained FastText model for French
# Load the FastText model
fasttext_model = fasttext.load_model('cc.fr.300.bin')

import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator

class FastTextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.model.get_sentence_vector(text) for text in X])



In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Encode y_test and handle unseen labels
y_test_encoded = []
y_test_filtered = []
X_test_filtered = []
for i, label in enumerate(y_test):
    if label in label_encoder.classes_:
        y_test_encoded.append(label_encoder.transform([label])[0])
        y_test_filtered.append(label)
        X_test_filtered.append(X_test[i])

# Convert lists to numpy arrays
y_test_encoded = np.array(y_test_encoded)
X_test_filtered = np.array(X_test_filtered)

# Define the XGBoost pipeline
pipeline = Pipeline([
    ('fasttext', FastTextTransformer(fasttext_model)),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# Train and evaluate the model
print("Training XGBoost with FastText embeddings...")
pipeline.fit(X_train, y_train_encoded)
y_pred_train = pipeline.predict(X_train)
y_pred = pipeline.predict(X_test_filtered)
accuracy_train = accuracy_score(y_train_encoded, y_pred_train)
accuracy_test = accuracy_score(y_test_encoded, y_pred)

print(f"Train Accuracy: {accuracy_train:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}")

print("Classification report for XGBoost with FastText embeddings:")
print(classification_report(y_test_encoded, y_pred))


Training XGBoost with FastText embeddings...
Train Accuracy: 0.9754
Test Accuracy: 0.6700
Classification report for XGBoost with FastText embeddings:
              precision    recall  f1-score   support

           0       0.79      0.48      0.59        23
           1       0.55      0.24      0.33       121
           2       0.52      0.46      0.49        57
           3       0.12      0.04      0.06        24
           4       0.47      0.32      0.38       128
           5       1.00      0.11      0.20         9
           6       0.47      0.19      0.27        37
           7       0.00      0.00      0.00        18
           8       0.30      0.06      0.10        49
           9       0.77      0.91      0.83      1858
          10       0.43      0.22      0.29        89
          11       0.25      0.08      0.12        12
          12       0.46      0.44      0.45       297
          13       1.00      0.08      0.15        36
          14       0.45      0.42      


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [32]:
#Sample Cleaning
import random
import pandas as pd
from tabulate import tabulate

def display_random_cleaning(data, num_samples=5):
    # Ensure data is a pandas Series
    if isinstance(data, pd.Series):
        data = data.tolist()
    
    samples = random.sample(data, num_samples)
    cleaned_samples = [clean_text(sample) for sample in samples]

    df = pd.DataFrame({
        'Original': samples,
        'Cleaned': cleaned_samples
    })

    print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))


display_random_cleaning(X, num_samples=5)


+----------------------------------------------------+------------------------------------------+
|                      Original                      |                 Cleaned                  |
+----------------------------------------------------+------------------------------------------+
|                - / mise a jour gaz                 |            - / mis a jour gaz            |
|            Pas tout le temps à jour / -            |            tout temp jour / -            |
|                 - / Bien expliqué                  |             - / bien expliqu             |
| - / Facile à utiliser, paiement et suivie pratique |   - / facil utilis , pai suiv pratiqu    |
|        - / Application très intuitive ...          | - / appliqu tres intuit ... facil navigu |
|                 Facile à naviguer                  |                                          |
+----------------------------------------------------+------------------------------------------+



The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



## Category Mapping

In [43]:
#Distinct themes:
len(y.unique())

25

In [44]:
y.value_counts()

THEME
Ergonomie                                    9667
Suivi Conso                                  4474
Gestion des données & services               1457
Hors digital                                 1305
Suivi Conso \n (SC)                          1161
Contact & Suivi des demandes                  624
Accès EC                                      539
Programme d'engagement                        501
Factures & Autres documents                   465
Hors digital (HD)                             432
Accès EC (AEC)                                315
Engie en général                              228
Offres & Tarifs                               219
Gestion des données et des services (GDS)     185
Dysfonctionnement général                     155
Application mobile                            116
-                                             102
Dysfonctionnement général (DYS)                79
Factures & Autres documents (DOC)              78
Contact & Suivi des demandes (CSD)          

In [45]:
unique_values = y.unique()

array(['Ergonomie', 'Suivi Conso', 'Accès EC',
       'Gestion des données & services', 'Contact & Suivi des demandes',
       'Engie en général', 'Offres & Tarifs',
       'Factures & Autres documents', 'Application mobile',
       'Hors digital', "Programme d'engagement", 'Relève compteur',
       'Dysfonctionnement général', 'Ma Conso', '-',
       'Factures & Autres documentsrt',
       'Contact & Suivi des demandes (CSD)', 'Hors digital (HD)',
       'Gestion des données et des services (GDS)', 'Suivi Conso \n (SC)',
       'Relève compteur (RC)', 'Dysfonctionnement général (DYS)',
       'Accès EC (AEC)', 'Offres et Tarifs (OT)',
       'Factures & Autres documents (DOC)'], dtype=object)

In [47]:
category_mapping = {
    'Ergonomie': 'Ergonomie',
    'Suivi Conso': 'Suivi Conso',
    'Accès EC': 'Accès EC',
    'Gestion des données & services': 'Gestion des données & services',
    'Contact & Suivi des demandes': 'Contact & Suivi des demandes',
    'Engie en général': 'Engie en général',
    'Offres & Tarifs': 'Offres & Tarifs',
    'Factures & Autres documents': 'Factures & Autres documents',
    'Application mobile': 'Application mobile',
    'Hors digital': 'Hors digital',
    "Programme d'engagement": "Programme d'engagement",
    'Relève compteur': 'Relève compteur',
    'Dysfonctionnement général': 'Dysfonctionnement général',
    'Ma Conso': 'Ma Conso',
    '-': 'Autres',
    'Factures & Autres documentsrt': 'Factures & Autres documents',
    'Contact & Suivi des demandes (CSD)': 'Contact & Suivi des demandes',
    'Hors digital (HD)': 'Hors digital',
    'Gestion des données et des services (GDS)': 'Gestion des données & services',
    'Suivi Conso \n (SC)': 'Suivi Conso',
    'Relève compteur (RC)': 'Relève compteur',
    'Dysfonctionnement général (DYS)': 'Dysfonctionnement général',
    'Accès EC (AEC)': 'Accès EC',
    'Offres et Tarifs (OT)': 'Offres & Tarifs',
    'Factures & Autres documents (DOC)': 'Factures & Autres documents'
}

# Replace the values in the 'y' series
y_mapped = y.replace(category_mapping)

# Count the new categories
new_category_counts = y_mapped.value_counts()

# Display the results
print(new_category_counts)

THEME
Ergonomie                         9667
Suivi Conso                       5635
Hors digital                      1737
Gestion des données & services    1642
Accès EC                           854
Contact & Suivi des demandes       671
Factures & Autres documents        544
Programme d'engagement             501
Offres & Tarifs                    236
Dysfonctionnement général          234
Engie en général                   228
Application mobile                 116
Autres                             102
Relève compteur                     38
Ma Conso                             1
Name: count, dtype: int64


In [48]:
X_train, X_test, y_train_mapped, y_test_mapped = train_test_split(X, y_mapped, test_size=0.2, random_state=42)

In [49]:
X_train = [clean_text(text) for text in X_train]
X_test = [clean_text(text) for text in X_test]


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [50]:
# Define the pipelines
pipelines = {
    'logistic_regression': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000))
    ]),
    'random_forest': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier())
    ])
}

# Train and evaluate the models
for name, pipeline in pipelines.items():
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train_mapped)
    y_pred_train = pipeline.predict(X_train)
    y_pred = pipeline.predict(X_test)
    accuracy_train = accuracy_score(y_train_mapped, y_pred_train)
    print(f"Train Accuracy: {accuracy_train:.4f}")

    print(f"Classification report for {name}:")
    print(classification_report(y_test_mapped, y_pred))
    print("\n")


Training logistic_regression...
Train Accuracy: 0.8195
Classification report for logistic_regression:
                                precision    recall  f1-score   support

                      Accès EC       0.77      0.62      0.69       178
            Application mobile       0.33      0.12      0.18        24
                        Autres       0.75      0.13      0.22        23
  Contact & Suivi des demandes       0.59      0.33      0.42       137
     Dysfonctionnement général       0.79      0.35      0.48        55
              Engie en général       0.60      0.06      0.11        49
                     Ergonomie       0.78      0.91      0.84      1858
   Factures & Autres documents       0.63      0.41      0.50       102
Gestion des données & services       0.59      0.49      0.53       333
                  Hors digital       0.57      0.55      0.56       343
                      Ma Conso       0.00      0.00      0.00         1
               Offres & Tarifs   


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Train Accuracy: 0.9797
Classification report for random_forest:
                                precision    recall  f1-score   support

                      Accès EC       0.73      0.56      0.63       178
            Application mobile       0.30      0.12      0.18        24
                        Autres       0.26      0.78      0.39        23
  Contact & Suivi des demandes       0.57      0.34      0.42       137
     Dysfonctionnement général       0.57      0.36      0.44        55
              Engie en général       0.50      0.10      0.17        49
                     Ergonomie       0.82      0.88      0.85      1858
   Factures & Autres documents       0.63      0.37      0.47       102
Gestion des données & services       0.56      0.44      0.49       333
                  Hors digital       0.56      0.51      0.53       343
                      Ma Conso       0.00      0.00      0.00         1
               Offres & Tarifs       0.55      0.11      0.18        55


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [51]:
import fasttext
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load pre-trained FastText model for French
# Load the FastText model
fasttext_model = fasttext.load_model('cc.fr.300.bin')

import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator

class FastTextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.model.get_sentence_vector(text) for text in X])



In [53]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_mapped)

# Encode y_test and handle unseen labels
y_test_encoded = []
y_test_filtered = []
X_test_filtered = []
for i, label in enumerate(y_test_mapped):
    if label in label_encoder.classes_:
        y_test_encoded.append(label_encoder.transform([label])[0])
        y_test_filtered.append(label)
        X_test_filtered.append(X_test[i])

# Convert lists to numpy arrays
y_test_encoded = np.array(y_test_encoded)
X_test_filtered = np.array(X_test_filtered)

# Define the XGBoost pipeline
pipeline = Pipeline([
    ('fasttext', FastTextTransformer(fasttext_model)),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# Train and evaluate the model
print("Training XGBoost with FastText embeddings...")
pipeline.fit(X_train, y_train_encoded)
y_pred_train = pipeline.predict(X_train)
y_pred = pipeline.predict(X_test_filtered)
accuracy_train = accuracy_score(y_train_encoded, y_pred_train)
accuracy_test = accuracy_score(y_test_encoded, y_pred)

print(f"Train Accuracy: {accuracy_train:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}")

print("Classification report for XGBoost with FastText embeddings:")
print(classification_report(y_test_encoded, y_pred))


Training XGBoost with FastText embeddings...
Train Accuracy: 0.9858
Test Accuracy: 0.7390
Classification report for XGBoost with FastText embeddings:
              precision    recall  f1-score   support

           0       0.79      0.55      0.65       178
           1       0.29      0.08      0.13        24
           2       0.77      0.43      0.56        23
           3       0.54      0.32      0.40       137
           4       0.71      0.31      0.43        55
           5       0.33      0.08      0.13        49
           6       0.78      0.90      0.84      1858
           7       0.65      0.32      0.43       102
           8       0.55      0.46      0.50       333
           9       0.51      0.46      0.48       343
          10       0.46      0.11      0.18        55
          11       0.52      0.16      0.25       104
          12       0.00      0.00      0.00         7
          13       0.79      0.90      0.84      1173

    accuracy                          


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



## Camembert + Neural Networks

In [None]:
import torch
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Prepare the data
X = df_filtered['VERBATIMS (NEGATIFS/POSITIFS)'].astype(str).tolist()
y = df_filtered['THEME'].astype(str).tolist()

# Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to Hugging Face datasets format
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


In [None]:
# Load CamemBERT tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(label_encoder.classes_))

# Move model to GPU if available
model.to(device)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/17764 [00:00<?, ? examples/s]

Map:   0%|          | 0/4442 [00:00<?, ? examples/s]

In [None]:
# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)
    predictions = torch.argmax(logits, dim=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train and evaluate
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2888,1.223563,0.666367,0.595743,0.574077,0.666367


  _warn_prf(average, modifier, msg_start, len(result))


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2888,1.223563,0.666367,0.595743,0.574077,0.666367
2,1.0864,1.130007,0.679424,0.611728,0.581808,0.679424


  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.1300067901611328,
 'eval_accuracy': 0.6794236830256641,
 'eval_f1': 0.6117282834429539,
 'eval_precision': 0.5818077232455309,
 'eval_recall': 0.6794236830256641,
 'eval_runtime': 61.7589,
 'eval_samples_per_second': 71.925,
 'eval_steps_per_second': 4.501,
 'epoch': 2.0}