In [13]:
# local
!pip install pandas numpy scikit-learn joblib

# optionnel pour BERT
!pip install transformers datasets accelerate evaluate tokenizers


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import os
    import random
    import re
    import string
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    import sklearn.ensemble as ensemble


In [10]:
df_fake = pd.read_csv('/content/drive/MyDrive/Data/Fake.csv')
df_true = pd.read_csv('/content/drive/MyDrive/Data/True.csv')

# CHARGEMENT ET CONCATENATION DES DONNEES

In [14]:
import pandas as pd
from sklearn.utils import shuffle

df_fake = pd.read_csv('/content/drive/MyDrive/Data/Fake.csv')
df_true = pd.read_csv('/content/drive/MyDrive/Data/True.csv')

df_fake['label'] = 0
df_true['label'] = 1

# s'assurer que title/text existent
for col in ['title','text','subject','date']:
    if col not in df_fake.columns: df_fake[col] = ""
    if col not in df_true.columns: df_true[col] = ""

df = pd.concat([df_fake, df_true], ignore_index=True)
df = shuffle(df, random_state=42).reset_index(drop=True)

print("Total rows:", df.shape[0])
print(df['label'].value_counts())
df.head()


Total rows: 44898
label
0    23481
1    21417
Name: count, dtype: int64


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


# ANALYSE EXPLORATOIRE DES DONNEES

In [15]:
# distributions et stats simples
df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')
df['n_words'] = df['content'].apply(lambda t: len(str(t).split()))
print(df['n_words'].describe())

# ex : afficher les 5 titres les plus longs
df.loc[df['n_words'].nlargest(5).index, ['title','n_words']]

# distribution label
print(df['label'].value_counts(normalize=True))


count    44898.000000
mean       417.735757
std        351.480777
min          2.000000
25%        216.000000
50%        375.000000
75%        526.000000
max       8148.000000
Name: n_words, dtype: float64
label
0    0.522985
1    0.477015
Name: proportion, dtype: float64


# NETTOYAGE ET PRETRAITEMENT

In [16]:
import re, string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

STOPWORDS = set(ENGLISH_STOP_WORDS)

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)          # URLs
    text = re.sub(r'<.*?>', ' ', text)                     # HTML mini
    text = text.translate(str.maketrans('', '', string.punctuation))  # punctuation
    text = re.sub(r'\d+', ' ', text)                       # chiffres
    text = re.sub(r'\s+', ' ', text).strip()               # espaces
    tokens = [w for w in text.split() if w not in STOPWORDS and len(w) > 1]
    return " ".join(tokens)

df['clean'] = df['content'].apply(clean_text)


# DIVISER LE DATASET CONCATENER EN DONNEES D' ENTRAINEMENT ET DE TEST

In [17]:
from sklearn.model_selection import train_test_split

X = df['clean'].values
y = df['label'].values

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))


Train: 33673 Val: 5612 Test: 5613


#  ENTRAINEMENT

#### Définition des modèles à tester

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
import numpy as np
import joblib

experiments = []

# TF-IDF — Logistic Regression
experiments.append((
    "TFIDF + Logistic Regression",
    Pipeline([
        ('tfidf', TfidfVectorizer(max_df=0.95, min_df=4, ngram_range=(1,2))),
        ('clf', LogisticRegression(max_iter=400, class_weight='balanced'))
    ])
))

# TF-IDF — SVM
experiments.append((
    "TFIDF + Linear SVM",
    Pipeline([
        ('tfidf', TfidfVectorizer(max_df=0.95, min_df=4, ngram_range=(1,2))),
        ('clf', LinearSVC())
    ])
))

# TF-IDF — Naive Bayes
experiments.append((
    "TFIDF + MultinomialNB",
    Pipeline([
        ('tfidf', TfidfVectorizer(max_df=0.95, min_df=5)),
        ('clf', MultinomialNB())
    ])
))

# CountVectorizer — Logistic Regression
experiments.append((
    "CountVectorizer + Logistic Regression",
    Pipeline([
        ('count', CountVectorizer(min_df=4)),
        ('clf', LogisticRegression(max_iter=400))
    ])
))

# CountVectorizer — Linear SVM
experiments.append((
    "CountVectorizer + Linear SVM",
    Pipeline([
        ('count', CountVectorizer(min_df=4)),
        ('clf', LinearSVC())
    ])
))

# TF-IDF — RandomForest
experiments.append((
    "TFIDF + RandomForest",
    Pipeline([
        ('tfidf', TfidfVectorizer(min_df=5)),
        ('clf', RandomForestClassifier(n_estimators=250))
    ])
))


In [21]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


#### Construction du vecteur Word2Vec

In [22]:
from gensim.models import Word2Vec

# préparer les tokens
tokenized = [text.split() for text in X_train]

# entraîner un Word2Vec rapide
w2v_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=3, workers=4)

def embed_text(text):
    words = [w for w in text.split() if w in w2v_model.wv]
    if len(words) == 0:
        return np.zeros(100)
    return np.mean(w2v_model.wv[words], axis=0)

# vectoriser datasets
X_train_w2v = np.array([embed_text(t) for t in X_train])
X_val_w2v   = np.array([embed_text(t) for t in X_val])
X_test_w2v  = np.array([embed_text(t) for t in X_test])


#### Ajouter Word2Vec + ML à la liste des modèles

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

experiments.append((
    "Word2Vec + Logistic Regression",
    LogisticRegression(max_iter=400)
))

experiments.append((
    "Word2Vec + RandomForest",
    RandomForestClassifier(n_estimators=300)
))


#### Boucle d’entraînement globale

In [24]:
results = []
best_f1 = 0
best_model = None
best_name = ""

for name, model in experiments:
    print("\n==============================")
    print("Training:", name)
    print("==============================")

    # cas Word2Vec (données numériques)
    if "Word2Vec" in name:
        model.fit(X_train_w2v, y_train)
        y_pred = model.predict(X_val_w2v)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

    f1 = f1_score(y_val, y_pred)
    print("F1-score:", f1)
    results.append((name, f1))

    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_name = name

print("\n\n=== Résultats finaux ===")
for name, f1 in results:
    print(name, "=> F1 =", f1)

print("\nMeilleur modèle :", best_name, " avec F1 =", best_f1)



Training: TFIDF + Logistic Regression
F1-score: 0.9892153216809223

Training: TFIDF + Linear SVM
F1-score: 0.9966430436404327

Training: TFIDF + MultinomialNB
F1-score: 0.9429049655941975

Training: CountVectorizer + Logistic Regression
F1-score: 0.997195737521032

Training: CountVectorizer + Linear SVM




F1-score: 0.996258885147774

Training: TFIDF + RandomForest
F1-score: 0.9968277663743236

Training: Word2Vec + Logistic Regression
F1-score: 0.9808585764727745

Training: Word2Vec + RandomForest
F1-score: 0.9746741154562384


=== Résultats finaux ===
TFIDF + Logistic Regression => F1 = 0.9892153216809223
TFIDF + Linear SVM => F1 = 0.9966430436404327
TFIDF + MultinomialNB => F1 = 0.9429049655941975
CountVectorizer + Logistic Regression => F1 = 0.997195737521032
CountVectorizer + Linear SVM => F1 = 0.996258885147774
TFIDF + RandomForest => F1 = 0.9968277663743236
Word2Vec + Logistic Regression => F1 = 0.9808585764727745
Word2Vec + RandomForest => F1 = 0.9746741154562384

Meilleur modèle : CountVectorizer + Logistic Regression  avec F1 = 0.997195737521032


In [25]:
!pip install tensorflow



In [26]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

max_words = 20000
max_len = 300

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_val_seq   = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_len)

model_lstm = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

history = model_lstm.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=3,
    batch_size=128
)

# prédiction
y_pred_lstm = (model_lstm.predict(X_val_seq) > 0.5).astype(int)
f1_lstm = f1_score(y_val, y_pred_lstm)
print("LSTM F1 =", f1_lstm)




Epoch 1/3
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 32ms/step - accuracy: 0.9139 - loss: 0.2347 - val_accuracy: 0.9847 - val_loss: 0.0553
Epoch 2/3
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.9859 - loss: 0.0467 - val_accuracy: 0.9824 - val_loss: 0.0512
Epoch 3/3
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - accuracy: 0.9943 - loss: 0.0210 - val_accuracy: 0.9852 - val_loss: 0.0554
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step
LSTM F1 = 0.984626782737544


In [27]:
joblib.dump(best_model, "best_fake_news_model.joblib")
print("Modèle sauvegardé :", best_name)


Modèle sauvegardé : CountVectorizer + Logistic Regression
