
# **Librairies**

In [1]:
# Installer les bibliothèques nécessaires
!pip install bitsandbytes
!pip install peft
!pip install datasets
!pip install accelerate

# Importer les bibliothèques requises pour le traitement, l'entraînement et l'accélération
import accelerate
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import DistilBertTokenizer, DistilBertModel, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
import re

# Définir une fonction pour obtenir le dispositif (GPU si disponible, sinon CPU)
def get_device_map() -> str:
    return 'cuda' if torch.cuda.is_available() else 'cpu'

# Définir le dispositif à utiliser pour l'entraînement et l'inférence
device = get_device_map() 

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1


2024-06-05 23:33:51.306875: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-05 23:33:51.307012: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-05 23:33:51.589789: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# **Loading the Base Model to Fine-tune**

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Charger le modèle pré-entraîné
### Si vous voulez travailler avec LLaMa2, décommentez les lignes suivantes
"""model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    cache_dir="/data/yash/base_models",
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          cache_dir="/data/yash/base_models"
                                         )"""

## Pour travailler avec DeBERTa, On utilise le code ci-dessous
model_name = "microsoft/deberta-v3-base"  
# Charger le tokenizer DeBERTa
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Charger le modèle DeBERTa pour la classification de séquences
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Processing des Données**

In [3]:
# Charger le jeu de données à partir du hub Hugging Face
data = load_dataset("ErfanMoosaviMonazzah/fake-news-detection-dataset-English", cache_dir = "/data/datasets")

Downloading readme:   0%|          | 0.00/487 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 78.4M/78.4M [00:01<00:00, 58.3MB/s]
Downloading data: 100%|██████████| 15.5M/15.5M [00:00<00:00, 30.0MB/s]
Downloading data: 100%|██████████| 22.0M/22.0M [00:00<00:00, 29.6MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label'],
        num_rows: 8267
    })
})

In [5]:
def process_data(example):
# Combinons le titre, le sujet et le texte pour créer un prompt unique.
  example ['prompt']="the news title is: " + example['title']+"\n\n"+"The subject of the news is: " +  example['subject']+"\n\n" + "The body text of the news is: " + example['text'] + '\n\n'
# Ajoutons une étiquette indiquant si la nouvelle est vraie ou fausse.
  example['complete_prompt'] = example['prompt']+ "this news is " + ("fake" if example['label']==0 else "real")
  return example

In [7]:
# Appliquons la fonction de traitement 
data = data.map(process_data)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8267 [00:00<?, ? examples/s]

In [8]:
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label', 'prompt', 'complete_prompt'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label', 'prompt', 'complete_prompt'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label', 'prompt', 'complete_prompt'],
        num_rows: 8267
    })
})

In [9]:
#visualisation d'exemple
print(data['train']['complete_prompt'][2])

the news title is: Federal Reserve governor Powell's policy views, in his own words

The subject of the news is: politicsNews

The body text of the news is: President Donald Trump on Thursday tapped Federal Reserve Governor Jerome Powell to become head of the U.S. central bank, promoting a soft-spoken centrist to replace Janet Yellen when her term expires in February 2018. In five years as a Fed Governor Jerome Powell has been a consistent, middle of the road voice, backing the consensus crafted by Fed chair Janet Yellen that interest rates should be raised slowly so labor markets could recover, that financial stability risks were muted, and that new regulations had made the economy safer. Following is a collection of quotes from select policy speeches he has delivered since 2015: On Rates:  “The financial crisis did significant damage to the productive capacity of our economy, and the damage was of a character, extent, and duration that cannot be fully known today…It seems plausible t

In [11]:
# Définition du jeton de padding comme le jeton de fin de séquence (EOS) du tokenizer.
tokenizer.pad_token = tokenizer.eos_token
# Ajout du jeton spécial [PAD] au tokenizer.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Fonction pour tokeniser chaque exemple du dataset en ajoutant un padding pour atteindre une longueur maximale de 512 tokens.
def tokenize_dataset(example):
    response = tokenizer(example['complete_prompt'], padding='max_length', truncation=True, max_length=512)
    return response

In [12]:
# Application de la fonction de tokenisation à l'ensemble du dataset (batched)
data = data.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8267 [00:00<?, ? examples/s]

In [13]:
# Suppression des colonnes inutiles du dataset et mise en forme des données restantes pour l'utilisation avec PyTorch.
data = data.remove_columns(['title', 'subject', 'text', 'Unnamed: 0'])
data.set_format('torch', columns=['input_ids','prompt', 'attention_mask', 'label','complete_prompt'])

In [14]:
data

DatasetDict({
    train: Dataset({
        features: ['date', 'label', 'prompt', 'complete_prompt', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['date', 'label', 'prompt', 'complete_prompt', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['date', 'label', 'prompt', 'complete_prompt', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8267
    })
})

# **Fine-tuning du Model**

In [15]:
def print_trainable_parameters(model):
     """
    Affiche le nombre de paramètres entraînables dans le modèle.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel() # Nombre total de paramètres
        if param.requires_grad:
            trainable_params += param.numel()  # Nombre de paramètres entraînables
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [16]:
print_trainable_parameters(model)

trainable params: 184423682 || all params: 184423682 || trainable%: 100.0


In [17]:
# Arguments de l'entraînement
training_args = TrainingArguments(
    output_dir="./results",  # Répertoire de sortie pour les résultats
    evaluation_strategy="epoch",  # Stratégie d'évaluation après chaque époque
    learning_rate=2e-5,   # Taux d'apprentissage
    per_device_train_batch_size=8,  # Taille du batch d'entraînement par appareil
    per_device_eval_batch_size=8,  # Taille du batch d'évaluation par appareil
    num_train_epochs=3,  # Nombre d'époques d'entraînement
    weight_decay=0.01,  # Décroissance de poids
    save_total_limit=1,  # Limite du nombre total de sauvegardes
    save_steps=1000,  # Sauvegarde tous les 1000 pas
    logging_dir='./logs',   # Répertoire de journalisation
    report_to="none", #désactiver le W&B
)

# Initialiser le Trainer
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
)

# Affiner l'ensemble du modèle
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
1,0.0,5e-06
2,0.0,1e-06
3,0.0,1e-06




TrainOutput(global_step=5625, training_loss=0.003989431050067975, metrics={'train_runtime': 7859.8613, 'train_samples_per_second': 11.451, 'train_steps_per_second': 0.716, 'total_flos': 2.368041965568e+16, 'train_loss': 0.003989431050067975, 'epoch': 3.0})

In [18]:
# Sauvegardons le modèle entraîné dans le répertoire 'outputs'
trainer.model.save_pretrained('outputs')
# Rechargeons le tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base" )



# Test de la sortie du modèle affiné

In [19]:
def classify_text(prompt):
    # Tokenizer l'entrée
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)

    # Obtenir les prédictions du modèle
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Obtenir la classe prédite
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    return predicted_class

# Tester la fonction de classification
n = 100  # Index de l'exemple de test
prompt = data['test']['prompt'][n]
predicted_class = classify_text(prompt)
print(f"Predicted class: {predicted_class}")

Predicted class: 0


In [66]:
#Vérification
data['test']['label'][100]

tensor(0)

In [32]:
#évaluer la performance du modèle
correct=0
for i in range(100):
  prompt = data['validation']['prompt'][i]
  if classify_text(prompt) == data['validation']['label'][i]:
    correct+=1
  if i%10==0:
    print(i)
# Afficher le pourcentage de précision
print(correct/100)

0
10
20
30
40
50
60
70
80
90
1.0


In [20]:
#teste de la fonction classify_text sur un prompt quelconque
prompt = 'usa is not a country'
predicted_class = classify_text(prompt)
print(f"Predicted class: {predicted_class}")

Predicted class: 0


# UI: Streamlit

In [30]:
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Definir le model afiné
model = AutoModelForSequenceClassification.from_pretrained("outputs")
# Definir le tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

#  interface Streamlit app
st.title("Fake News Detection with DeBERTa")

uploaded_text = st.text_area("Enter news text here...")

if st.button("Classify"):
    if uploaded_text:
        inputs = tokenizer(uploaded_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        class_name = "Fake" if predicted_class == 0 else "Real"
        st.write(f"Predicted class: {class_name}")
    else:
        st.write("Please enter some text to classify.")

Overwriting app.py


In [None]:
#installation de Node.js, npm et Streamlit, ainsi que le package localtunnel via npm
!apt-get install -y nodejs npm
!npm install -g localtunnel
!pip install streamlit

In [31]:
#On exécute l'application Streamlit localement via localtunnel sur le port 8501.
!streamlit run app.py & npx localtunnel --port 8501

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.19.2.2:8501[0m
[34m  External URL: [0m[1mhttp://35.226.65.76:8501[0m
[0m
your url is: https://old-bottles-accept.loca.lt
^C
[34m  Stopping...[0m
