In [53]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm


In [39]:
fake_path = 'data/Fake.csv'
true_path = 'data/True.csv'

In [40]:
fake_data = pd.read_csv(fake_path)
true_data = pd.read_csv(true_path)

fake_data['label'] = 'fake'
true_data['label'] = 'true'

data = pd.concat([fake_data, true_data], ignore_index=True)

In [41]:
data

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",true
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",true
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",true
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",true


In [42]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Télécharger les ressources nécessaires de NLTK
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialiser les stopwords et le lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les caractères spéciaux et les chiffres
    text = re.sub(r'[^a-z\s]', '', text)
    # Supprimer les stopwords et appliquer la lemmatisation
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Appliquer le prétraitement sur les colonnes 'title' et 'text'
data['cleaned_title'] = data['title'].apply(preprocess_text)
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Afficher un exemple avant et après le nettoyage
print("Exemple avant nettoyage :")
print(data['text'].iloc[0])
print("\nExemple après nettoyage :")
print(data['cleaned_text'].iloc[0])

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/onyxia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Exemple avant nettoyage :
Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialiser le CountVectorizer
bow_vectorizer = CountVectorizer(max_features=5000)  # Limiter à 5000 mots les plus fréquents

# Appliquer sur les textes nettoyés
bow_features = bow_vectorizer.fit_transform(data['cleaned_text'])

# Afficher la forme de la matrice BoW
print("Shape of Bag of Words matrix:", bow_features.shape)

Shape of Bag of Words matrix: (44898, 5000)


In [44]:

# 2. Vectorisation BoW
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text']).toarray()
y = (data['label'] == 'true').astype(int).values  # 1 pour true, 0 pour fake

# 3. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Dataset PyTorch
class FakeNewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = FakeNewsDataset(X_train, y_train)
test_dataset = FakeNewsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 5. Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 6. Modèle LSTM (input = BoW)
class LSTMBinaryClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Reshape x for LSTM: (batch, seq_len=1, input_size)
        x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out).squeeze(1)

# 7. Initialisation
model = LSTMBinaryClassifier(input_size=5000, hidden_size=128, num_layers=1).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 8. Entraînement
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# 9. Évaluation simple
model.eval()
correct = total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        predicted = (outputs >= 0.5).float()
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"Accuracy: {100 * correct / total:.2f}%")


Epoch [1/10], Loss: 0.0073
Epoch [2/10], Loss: 0.0010
Epoch [3/10], Loss: 0.0002
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0001
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000
Accuracy: 99.61%


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiter à 5000 mots les plus fréquents
X = tfidf_vectorizer.fit_transform(data['cleaned_text']).toarray()
y = (data['label'] == 'true').astype(int).values  # 1 pour true, 0 pour fake

# 3. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Dataset PyTorch
class FakeNewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = FakeNewsDataset(X_train, y_train)
test_dataset = FakeNewsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 5. Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 6. Modèle LSTM (input = BoW)
class LSTMBinaryClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Reshape x for LSTM: (batch, seq_len=1, input_size)
        x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out).squeeze(1)

# 7. Initialisation
model = LSTMBinaryClassifier(input_size=5000, hidden_size=128, num_layers=1).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 8. Entraînement
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# 9. Évaluation simple
model.eval()
correct = total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        predicted = (outputs >= 0.5).float()
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"Accuracy: {100 * correct / total:.2f}%")


Epoch [1/10], Loss: 0.0117
Epoch [2/10], Loss: 0.0017
Epoch [3/10], Loss: 0.0048
Epoch [4/10], Loss: 0.0003
Epoch [5/10], Loss: 0.0002
Epoch [6/10], Loss: 0.0038
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000
Accuracy: 99.31%


In [47]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class FakeNewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class LSTMBinaryClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.unsqueeze(1)  # (batch_size, seq_len=1, input_size)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out).squeeze(1)

class LSTMExperiment:
    def __init__(self, vectorizer, input_size, device):
        self.vectorizer = vectorizer
        self.device = device
        self.model = LSTMBinaryClassifier(input_size=input_size).to(device)
        self.criterion = nn.BCELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

    def prepare_data(self, data):
        X = self.vectorizer.fit_transform(data['cleaned_text']).toarray()
        y = (data['label'] == 'true').astype(int).values
        return train_test_split(X, y, test_size=0.2, random_state=42)

    def run(self, data, num_epochs=10):
        X_train, X_test, y_train, y_test = self.prepare_data(data)

        train_loader = DataLoader(FakeNewsDataset(X_train, y_train), batch_size=32, shuffle=True)
        test_loader = DataLoader(FakeNewsDataset(X_test, y_test), batch_size=32)

        for epoch in range(num_epochs):
            self.model.train()
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        # Évaluation
        self.model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch = X_batch.to(self.device)
                outputs = self.model(X_batch)
                preds = (outputs >= 0.5).float().cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(y_batch.numpy())

        acc = accuracy_score(all_labels, all_preds)
        return acc


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# TF-IDF
exp_tfidf = LSTMExperiment(TfidfVectorizer(max_features=5000), input_size=5000, device=device)
acc_tfidf = exp_tfidf.run(data)
print(f"TF-IDF Accuracy: {acc_tfidf:.2%}")

# BoW
exp_bow = LSTMExperiment(CountVectorizer(max_features=5000), input_size=5000, device=device)
acc_bow = exp_bow.run(data)
print(f"BoW Accuracy: {acc_bow:.2%}")


TF-IDF Accuracy: 99.33%
BoW Accuracy: 99.60%


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
from gensim.models import Word2Vec
import numpy as np
import tqdm

# Dataset
class FakeNewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Modèle LSTM
class LSTMBinaryClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out).squeeze(1)

# Classe générique pour BoW / TF-IDF
class LSTMExperimentBase:
    def __init__(self, input_size, device):
        self.device = device
        self.model = LSTMBinaryClassifier(input_size=input_size).to(device)
        self.criterion = nn.BCELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

    def run(self, X_train, X_test, y_train, y_test, batch_size=32, num_epochs=10):
        train_loader = DataLoader(FakeNewsDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(FakeNewsDataset(X_test, y_test), batch_size=batch_size)

        for epoch in range(num_epochs):
            self.model.train()
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        self.model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch = X_batch.to(self.device)
                outputs = self.model(X_batch)
                preds = (outputs >= 0.5).float().cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(y_batch.numpy())

        return accuracy_score(all_labels, all_preds)

# Word2Vec
class LSTMWord2VecExperiment(LSTMExperimentBase):
    def __init__(self, data, device, vector_size=100, max_len=50):
        self.max_len = max_len
        self.vector_size = vector_size
        self.device = device
        self.model = LSTMBinaryClassifier(input_size=vector_size).to(device)
        self.criterion = nn.BCELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

        tokenized = [text.lower().split() for text in data['cleaned_text']]
        self.w2v_model = Word2Vec(sentences=tokenized, vector_size=vector_size, window=5, min_count=1, workers=4)
        self.word_vectors = self.w2v_model.wv

    def encode(self, texts):
        encoded = []
        for sent in texts:
            tokens = sent.lower().split()
            vecs = [self.word_vectors[w] if w in self.word_vectors else np.zeros(self.vector_size) for w in tokens]
            vecs = vecs[:self.max_len] + [np.zeros(self.vector_size)] * (self.max_len - len(vecs)) if len(vecs) < self.max_len else vecs[:self.max_len]
            encoded.append(vecs)
        return np.array(encoded)

    def run_on_data(self, data):
        y = (data['label'] == 'true').astype(int).values
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(data['cleaned_text'], y, test_size=0.2, random_state=42)
        X_train = self.encode(X_train_raw)
        X_test = self.encode(X_test_raw)
        return super().run(X_train, X_test, y_train, y_test)

# BERT
class LSTMBERTExperiment(LSTMExperimentBase):
    def __init__(self, device, max_len=50):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
        self.bert_model.eval()
        self.device = device
        self.max_len = max_len
        super().__init__(input_size=768, device=device)

    def encode(self, texts):
        embeddings = []
        for text in tqdm.tqdm(texts, desc="BERT Embeddings"):
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_len)
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            with torch.no_grad():
                outputs = self.bert_model(input_ids, attention_mask=attention_mask)
                last_hidden_state = outputs.last_hidden_state.squeeze(0)  # (seq_len, 768)
            embeddings.append(last_hidden_state.cpu().numpy())
        return np.stack(embeddings)

    def run_on_data(self, data):
        y = (data['label'] == 'true').astype(int).values
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(data['cleaned_text'], y, test_size=0.2, random_state=42)
        X_train = self.encode(X_train_raw)
        X_test = self.encode(X_test_raw)
        return super().run(X_train, X_test, y_train, y_test)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 1. Détection GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. BoW
bow_vectorizer = CountVectorizer(max_features=5000)
X_bow = bow_vectorizer.fit_transform(data['cleaned_text']).toarray()
y = (data['label'] == 'true').astype(int).values
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.2, random_state=42)

exp_bow = LSTMExperimentBase(input_size=5000, device=device)
accuracy_bow = exp_bow.run(X_train_bow, X_test_bow, y_train_bow, y_test_bow)
print(f"BoW Accuracy: {accuracy_bow:.2%}")

# 3. TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(data['cleaned_text']).toarray()
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

exp_tfidf = LSTMExperimentBase(input_size=5000, device=device)
accuracy_tfidf = exp_tfidf.run(X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)
print(f"TF-IDF Accuracy: {accuracy_tfidf:.2%}")

# 4. Word2Vec
exp_w2v = LSTMWord2VecExperiment(data=data, device=device, vector_size=100, max_len=50)
accuracy_w2v = exp_w2v.run_on_data(data)
print(f"Word2Vec Accuracy: {accuracy_w2v:.2%}")

# 5. BERT
exp_bert = LSTMBERTExperiment(device=device, max_len=50)
accuracy_bert = exp_bert.run_on_data(data)
print(f"BERT Accuracy: {accuracy_bert:.2%}")
