In [1]:
# ====== Importer les library ====== #
import pandas as pd
import numpy as np
import os

# ====== Déterminer les path ====== #
import os
cwd = os.getcwd()
parent = os.path.dirname(cwd)
data_path = parent + '\\data\\'

df =pd.read_hdf(cwd + '\\dataframe\\' + 'df_clean.h5')
df.head(2)

Unnamed: 0,IDENTITY_ATTACK,INSULT,PROFANITY,SEVERE_TOXICITY,THREAT,TOXICITY,comment_count,like_count,message_comments,mainTopic,message_posts,secondTopic,shares,title,time_difference,year,month,weekday
0,0.102216,0.651296,0.664565,0.350583,0.029933,0.588517,0,0,"Faux, ce ne sera jamais le temps pour lui car ...",composer-preview,HÉLÈNE BUZZETTI / Jagmeet Singh a soutenu que ...,7FJ4TUHKEFEXFIZI6DY2WAQE4E,0,Les incohérences de l’ingérence,3.0,2023,6,4
1,0.005476,0.167515,0.255499,0.006981,0.005647,0.176317,1,1,Mon dieu!! On nous prend sérieusement pour des...,composer-preview,HÉLÈNE BUZZETTI / Jagmeet Singh a soutenu que ...,7FJ4TUHKEFEXFIZI6DY2WAQE4E,0,Les incohérences de l’ingérence,18.0,2023,6,4


In [2]:
posts = pd.read_csv(data_path + 'Posts.csv')

In [48]:
post_test = posts[['title']].head(1000).copy().dropna()

***
## Classification des titres avec cammemBert
***

In [54]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset

# Assuming you have a dataframe df with a column 'title' for news titles
# and a list topics with your topics
topics = ['actualités', 'politique', 'sports', 'divertissement', 'santé', 'éducation',
                         'hockey', 'musique', 'auto', 'affaires-locales', 'environnement', 'covid-19',
                         'science', 'voyages', 'mode', 'arts', 'alimentation', 'cinéma', 'technologie',
                         'culture', 'jeunesse', 'expositions', 'techno', 'théâtre', 'livres', 'voyages',
                         'recettes', 'arts-visuels', 'sexe', 'estrie', 'remparts', 'sciences', 'société',
                         'oceanic', 'vin', 'guerre', 'canadiens', 'celebrites']

# topics = ['covid', 'politique', 'art', 'sport', 'environnement', 'autre']
df = post_test # your dataframe with news titles

# Load the BERT tokenizer and BERT model
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=len(topics),
    output_attentions=False,
    output_hidden_states=False,
)

# Tokenize all titles in the dataframe
input_ids = []
attention_masks = []

for title in df['title']:
    encoded_dict = tokenizer.encode_plus(
                        title,
                        add_special_tokens=True,
                        max_length=64,
                        pad_to_max_length=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert lists into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Create a DataLoader
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(
            dataset,
            sampler=SequentialSampler(dataset),
            batch_size=32
)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8
                )

epochs = 10

# Assume a multi-class classification problem
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Implement training loop, evaluation, etc.

# Save your model
model.save_pretrained('path_to_save_model')

# Load the model for inference
model = CamembertForSequenceClassification.from_pretrained('path_to_save_model')

# Predict function
def predict(title, model=model):
    encoded_dict = tokenizer.encode_plus(
                        title,
                        add_special_tokens=True,
                        max_length=64,
                        pad_to_max_length=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )
    
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    
    with torch.no_grad():
        outputs = model(input_id, token_type_ids=None, attention_mask=attention_mask)
    
    logits = outputs[0]
    index = logits.argmax()
    return topics[index]

# Classify all titles
df['predicted_topic'] = df['title'].apply(predict)
df.to_excel('TopicsClassification.xlsx')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


***
## Apiori
***

In [58]:
posts = pd.read_csv(data_path + 'Posts.csv')
post_test = posts[['title']].head(100).copy().dropna()

In [70]:
post_test['title'][0]

'Une électrification à « la mesure des besoins de remplacement »'

In [72]:
# Remove stop words
import spacy

nlp = spacy.load('fr_core_news_sm')
text = post_test['title'][0]
doc = nlp(text)

filtered_words = [token.text for token in doc if not token.is_stop]
clean_text = ' '.join(filtered_words)

print(text)
print(clean_text)

Une électrification à « la mesure des besoins de remplacement »
électrification « mesure besoins remplacement »


In [75]:
import spacy
import spacy_cleaner
from spacy_cleaner.processing import removers, replacers, mutators

model = spacy.load("fr_core_news_sm")
pipeline = spacy_cleaner.Pipeline(
    model,
    removers.remove_stopword_token,
    replacers.replace_punctuation_token,
    mutators.mutate_lemma_token,
)

text = post_test['title'][0]

text_clean = pipeline.clean(text)

print(text)
print(text_clean)

# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']

AttributeError: module 'spacy_cleaner' has no attribute 'Pipeline'

In [64]:
from apyori import apriori

text = post_test['title'][0]

results = list(apriori(text))
results

[RelationRecord(items=frozenset({' '}), support=0.15873015873015872, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' '}), confidence=0.15873015873015872, lift=1.0)]),
 RelationRecord(items=frozenset({'e'}), support=0.15873015873015872, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'e'}), confidence=0.15873015873015872, lift=1.0)])]