## Load Dataset and add label

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read each CSV and immediately discard the rows in which every column is empty
fake_ds = pd.read_csv('dataset/fake.csv').dropna(how='all')
true_ds = pd.read_csv('dataset/true.csv').dropna(how='all')


In [3]:
# Attach the target column: 1 marks the rows read from fake.csv, 0 the rows from true.csv
fake_ds = fake_ds.assign(label=1)
true_ds = true_ds.assign(label=0)

In [4]:
# Merge the fake and the true articles into one dataset
ds = pd.concat([fake_ds, true_ds], axis=0)

# Shuffle the rows with a fixed seed so that the seeded train/test splits made
# later pick the same rows on every run, then rebuild a clean 0..n-1 index
# (drop=True discards the old index instead of keeping it as an "index" column).
ds = ds.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Separate the texts (model input) from the labels (prediction target)
x = ds['text']
y = ds['label']

## Antrenam diferite modele pe setul de date

### 1. Training RandomForest

In [None]:
# Prepare the data for training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the target column
le = LabelEncoder()
Y = le.fit_transform(y)

# Turn the raw texts into a token-count matrix
# NOTE(review): the vocabulary is fitted on all texts, before the train/test
# split below — confirm that this is acceptable for the reported scores.
cv = CountVectorizer()
text_list = list(x)
X = cv.fit_transform(text_list)

In [None]:
# Split the data into a training set and a validation set (15% held out)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=41,
)

In [None]:
# Train the RandomForest classifier
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=100,
    random_state=41,
    verbose=10,
    n_jobs=-1,
)
clf.fit(x_train, y_train)

In [None]:
# Check the model's accuracy on the held-out split
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

y_pred = clf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy of the Random Forest classifier: {accuracy:.4f}")
print()
print(classification_report(y_test, y_pred))

In [None]:
# Save the trained model together with the fitted vectorizer and label encoder
import os
from joblib import dump

# dump() does not create missing parent directories, so make sure the target exists
os.makedirs('models/RandomForest', exist_ok=True)

dump(clf, 'models/RandomForest/trained_model.pkl')
dump(cv, 'models/RandomForest/count_vectorizer.pkl')
dump(le, 'models/RandomForest/label_encoder.pkl')

### 2. Training NaiveBayes

In [None]:
# Prepare the data for training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the target column
le = LabelEncoder()
Y = le.fit_transform(y)

# Turn the raw texts into a token-count matrix
# NOTE(review): the vocabulary is fitted on all texts, before the train/test
# split below — confirm that this is acceptable for the reported scores.
cv = CountVectorizer()
text_list = list(x)
X = cv.fit_transform(text_list)

In [None]:
# Split the data into a training set and a validation set (15% held out)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=41,
)

In [None]:
# Train the Naive Bayes classifier (ComplementNB variant) on the training split
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(x_train,y_train)

In [None]:
# Check the model's accuracy on the held-out split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# The message names the model evaluated in this section (it previously said
# "Random Forest", a leftover from the copied cell).
print(f"Accuracy of the Naive Bayes classifier: {accuracy:.4f}")
print()
print(classification_report(y_test, y_pred))

In [None]:
# Save the trained model together with the fitted vectorizer and label encoder
import os
from joblib import dump

# dump() does not create missing parent directories, so make sure the target exists
os.makedirs('models/NaiveBayes', exist_ok=True)

dump(clf, 'models/NaiveBayes/trained_model.pkl')
dump(cv, 'models/NaiveBayes/count_vectorizer.pkl')
dump(le, 'models/NaiveBayes/label_encoder.pkl')

### 3. Logistic Regression

In [None]:
# Prepare the data for training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the target column
le = LabelEncoder()
Y = le.fit_transform(y)

# Turn the raw texts into a token-count matrix
# NOTE(review): the vocabulary is fitted on all texts, before the train/test
# split below — confirm that this is acceptable for the reported scores.
cv = CountVectorizer()
text_list = list(x)
X = cv.fit_transform(text_list)

In [None]:
# Split the data into a training set and a validation set (15% held out)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=41,
)

In [None]:
# Train the LogisticRegression classifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
    max_iter=1500,
    verbose=10,
    n_jobs=-1,
)
clf.fit(x_train, y_train)

In [None]:
# Check the model's accuracy on the held-out split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# The message names the model evaluated in this section (it previously said
# "Random Forest", a leftover from the copied cell).
print(f"Accuracy of the Logistic Regression classifier: {accuracy:.4f}")
print()
print(classification_report(y_test, y_pred))

In [None]:
# Save the trained model together with the fitted vectorizer and label encoder
import os
from joblib import dump

# dump() does not create missing parent directories, so make sure the target exists
os.makedirs('models/LogisticRegression', exist_ok=True)

dump(clf, 'models/LogisticRegression/trained_model.pkl')
dump(cv, 'models/LogisticRegression/count_vectorizer.pkl')
dump(le, 'models/LogisticRegression/label_encoder.pkl')

### 4. Training SVM

In [None]:
# Prepare the data for training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the target column
le = LabelEncoder()
Y = le.fit_transform(y)

# Turn the raw texts into a token-count matrix
# NOTE(review): the vocabulary is fitted on all texts, before the train/test
# split below — confirm that this is acceptable for the reported scores.
cv = CountVectorizer()
text_list = list(x)
X = cv.fit_transform(text_list)

In [None]:
# Split the data into a training set and a validation set (15% held out)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=41,
)

In [None]:
# Train the linear SVM classifier
from sklearn.svm import LinearSVC

clf = LinearSVC(max_iter=20_000, dual=False)
clf.fit(x_train, y_train)

In [None]:
# Check the model's accuracy on the held-out split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# The message names the model evaluated in this section (it previously said
# "Random Forest", a leftover from the copied cell).
print(f"Accuracy of the SVM classifier: {accuracy:.4f}")
print()
print(classification_report(y_test, y_pred))

In [None]:
# Save the trained model together with the fitted vectorizer and label encoder
import os
from joblib import dump

# dump() does not create missing parent directories, so make sure the target exists
os.makedirs('models/SVM', exist_ok=True)

dump(clf, 'models/SVM/trained_model.pkl')
dump(cv, 'models/SVM/count_vectorizer.pkl')
dump(le, 'models/SVM/label_encoder.pkl')

### 5. FineTune MiniLM-L3-H384-uncased
MiniLM-L3-H384-uncased is a 3-layer version of microsoft/MiniLM-L12-H384-uncased.

In [5]:
# Convert the pandas DataFrame into a Hugging Face dataset and hold out 15% of it
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(ds)
splits = dataset.train_test_split(test_size=0.15, seed=42)

# Keep exactly the two splits used below, under the same names
dataset_dict = DatasetDict(
    {split_name: splits[split_name] for split_name in ('train', 'test')}
)

In [6]:
# Tokenize the data
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L3-H384-uncased")


def preprocess_function(date):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        date: Mapping that holds the raw texts under the ``"text"`` key
            (``map`` below calls this with ``batched=True``).

    Returns:
        The tokenizer output for those texts, with truncation enabled.
    """
    texts = date["text"]
    return tokenizer(texts, truncation=True)


tokenized_ds = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/66401 [00:00<?, ? examples/s]

Map:   0%|          | 0/11718 [00:00<?, ? examples/s]

In [None]:
# For efficiency: a collator that pads the examples and returns TensorFlow tensors
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

In [None]:
# Getting ready for training #1: label mappings, hyper-parameters and optimizer
from transformers import create_optimizer
import tensorflow as tf

# Class id <-> class name mappings handed to the model configuration
id2label = {0: "TRUE", 1: "FAKE"}
label2id = {name: class_id for class_id, name in id2label.items()}

BATCH_SIZE = 16
EPOCHS = 10
# Number of full batches in one pass over the training split
BATCHES_PER_EPOCH = len(tokenized_ds["train"]) // BATCH_SIZE
total_train_steps = int(BATCHES_PER_EPOCH * EPOCHS)

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
# Getting ready for training #2: load the model and convert the splits to tf.data datasets

# NOTE(review): the callbacks in this notebook are imported from `tf_keras`, while
# this policy is set through `tf.keras` — confirm both refer to the same Keras.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "nreimers/MiniLM-L3-H384-uncased",
    from_pt=True,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Both datasets use the shared BATCH_SIZE constant so they cannot drift away from
# BATCHES_PER_EPOCH, which is derived from it and drives the optimizer schedule
# and the checkpoint/backup frequency.
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["test"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [10]:
# Getting ready for training #3: compile the model
# NOTE(review): no loss is passed here — presumably the model falls back to its
# built-in loss; confirm against the transformers Keras documentation.
model.compile(optimizer=optimizer)

In [11]:
# getting ready for training #4: Callbacks
from transformers.keras_callbacks import KerasMetricCallback
from tf_keras.callbacks import TensorBoard
from tf_keras.callbacks import BackupAndRestore
from tf_keras.callbacks import ModelCheckpoint

def compute_metrics(eval_predictions):
    """Score a set of evaluation predictions.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair. The predictions are
            reduced to class ids by taking the argmax over axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        precision, recall and F1 are computed with ``average='weighted'``.
    """
    # Local import keeps the function self-contained
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    raw_scores, labels = eval_predictions
    # For classification tasks the predicted class is the argmax of the scores
    predicted_ids = np.argmax(raw_scores, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_ids, average='weighted'
    )

    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Computes the compute_metrics scores on the validation set
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_validation_set,
)

# Full-model checkpoint; save_freq is counted in batches (5 epochs' worth)
periodic_checkpoint = ModelCheckpoint(
    filepath="./checkpoint/model_epoch_{epoch:03d}.keras",
    save_freq=5 * BATCHES_PER_EPOCH,
    save_weights_only=False,
)

# Training backup written once per epoch's worth of batches
backup_callback = BackupAndRestore(
    backup_dir="./training_backups",
    save_freq=1 * BATCHES_PER_EPOCH,
)

tensorboard_callback = TensorBoard(log_dir="./tensorboard_logs/logs")

callbacks = [
    metric_callback,
    tensorboard_callback,
    backup_callback,
    periodic_checkpoint,
]

In [None]:
# Set memory growth to prevent OOM errors
# NOTE(review): this cell sits after the cell that loads the model; if the GPUs
# are already initialized by then, the RuntimeError branch below is what runs —
# confirm whether this has to move right after the TensorFlow import.
gpus = tf.config.list_physical_devices('GPU')
try:
    # Iterating an empty list is a no-op, so no separate "any GPU?" check is needed
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    print(e)

In [13]:
# Training
import os

# NOTE(review): this variable is set after TensorFlow has been imported and used;
# presumably it is only honoured when set before the GPU is initialized — confirm.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

model.fit(
    x=tf_train_set,
    epochs=EPOCHS,
    validation_data=tf_validation_set,
    callbacks=callbacks,
)

Epoch 1/10


I0000 00:00:1746469873.666898   34391 service.cc:152] XLA service 0x7efc08330c60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1746469873.667038   34391 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 OEM, Compute Capability 8.6
2025-05-05 21:31:13.751781: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1746469873.844954   34390 cuda_dnn.cc:529] Loaded cuDNN version 90501
I0000 00:00:1746469874.102499   34391 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10


2025-05-05 21:47:00.771188: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 3/10


2025-05-05 22:00:54.371954: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 4/10
Epoch 5/10


2025-05-05 22:28:20.309079: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence






Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


2025-05-05 23:22:49.246426: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 10/10





<tf_keras.src.callbacks.History at 0x7efc81f92690>

In [None]:
### predict on a text
## imports only for new session
# import tensorflow as tf
# import numpy as np
# import pandas as pd
# from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# 1. Tokenizer: same pretrained checkpoint name that was used for fine-tuning
tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L3-H384-uncased")

# 2. Model: reload one of the checkpoints written by the ModelCheckpoint callback
# NOTE(review): the checkpoint is written through `tf_keras` callbacks but loaded
# through `tf.keras` — confirm both refer to the same Keras implementation.
model_path = "checkpoint/model_epoch_005.keras"
model = tf.keras.models.load_model(model_path)

# 3. Function for prediction
def predict_news(text):
    """Classify a news text as TRUE or FAKE.

    Args:
        text: Text handed to the tokenizer as-is. Only the first row of the
            model output is read, so for a list input only the first text's
            result is returned.

    Returns:
        dict with ``label`` ("TRUE" or "FAKE"), ``confidence`` (probability of
        the predicted class) and ``probabilities`` (one float per class name).
    """
    # Class id -> class name
    label_names = {0: "TRUE", 1: "FAKE"}

    # Tokenize the input and run it through the model
    encoded = tokenizer(text, return_tensors="tf", truncation=True, padding=True)
    logits = model(encoded).logits

    # Softmax over the class axis; keep the first (only) row
    probs = tf.nn.softmax(logits, axis=-1).numpy()[0]
    best_class = np.argmax(probs)

    return {
        "label": label_names[best_class],
        "confidence": float(probs[best_class]),
        "probabilities": {
            "TRUE": float(probs[0]),
            "FAKE": float(probs[1]),
        },
    }

# 4. How to use this function:
# provide the inputs as a list of strings (wrap a single text in a one-element
# list, otherwise the loop below would walk over its characters)
example_texts = [
    "Scientists discover new treatment for cancer that shows promising results in clinical trials.",
    "Breaking: Famous celebrity secretly an alien, government documents reveal shocking truth!"
]
# classify every text in the list and report the outcome
for text in example_texts:
    result = predict_news(text)
    probabilities = result["probabilities"]
    print(f"Text: {text[:50]}...")
    print(f"Prediction: {result['label']} (confidence: {result['confidence']:.2f})")
    print(f"TRUE probability: {probabilities['TRUE']:.2f}")
    print(f"FAKE probability: {probabilities['FAKE']:.2f}")
    print("-" * 50)


# For other type of inputs:

# # 5. Batch prediction function for multiple texts
# def predict_batch(texts, batch_size=16):
#     # Tokenize all texts
#     inputs = tokenizer(texts, return_tensors="tf", truncation=True, padding=True)
    
#     # Make predictions
#     outputs = model(inputs)
#     logits = outputs.logits
#     probabilities = tf.nn.softmax(logits, axis=-1).numpy()
#     predicted_classes = np.argmax(probabilities, axis=1)
    
#     # Format results
#     id2label = {0: "TRUE", 1: "FAKE"}
#     results = []
    
#     for i, pred_class in enumerate(predicted_classes):
#         results.append({
#             "text": texts[i][:100] + "..." if len(texts[i]) > 100 else texts[i],
#             "label": id2label[pred_class],
#             "confidence": float(probabilities[i][pred_class]),
#             "true_prob": float(probabilities[i][0]),
#             "fake_prob": float(probabilities[i][1])
#         })
    
#     return results

# # 6. Example for loading and predicting on a CSV file
# def predict_from_csv(csv_file, text_column="text"):
#     # Load data
#     df = pd.read_csv(csv_file)
#     texts = df[text_column].tolist()
    
#     # Make predictions in batches
#     all_results = []
#     batch_size = 16
    
#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i:i+batch_size]
#         batch_results = predict_batch(batch_texts)
#         all_results.extend(batch_results)
    
#     # Convert to DataFrame and save
#     results_df = pd.DataFrame(all_results)
#     results_df.to_csv("prediction_results.csv", index=False)
#     print(f"Saved predictions for {len(results_df)} texts to prediction_results.csv")
    
#     return results_df

# df = predict_from_csv("your_news_file.csv", text_column="text")

In [None]:
# Model FineTuned after 5 epochs
# Text: Scientists discover new treatment for cancer that ...
# Prediction: FAKE (confidence: 0.91)
# TRUE probability: 0.09
# FAKE probability: 0.91
# --------------------------------------------------
# Text: Breaking: Famous celebrity secretly an alien, gove...
# Prediction: FAKE (confidence: 1.00)
# TRUE probability: 0.00
# FAKE probability: 1.00
# --------------------------------------------------

# Model FineTuned after 10 epochs
# Text: Scientists discover new treatment for cancer that ...
# Prediction: FAKE (confidence: 0.99)
# TRUE probability: 0.01
# FAKE probability: 0.99
# --------------------------------------------------
# Text: Breaking: Famous celebrity secretly an alien, gove...
# Prediction: FAKE (confidence: 1.00)
# TRUE probability: 0.00
# FAKE probability: 1.00
# --------------------------------------------------