In [39]:
import os
import random

import numpy as np
import pandas as pd

from joblib import load, dump
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier

In [40]:
# Restore the previously fitted label encoder from disk.
# NOTE: joblib.load unpickles arbitrary objects, so this file must come from a trusted source.
label_encoder = load("label_encoder.pkl")

# Class names in encoded order: the name at position i belongs to encoded label i.
LABEL_CLASSES = [class_name for class_name in label_encoder.classes_]
print("Label classes:", LABEL_CLASSES)

Label classes: ['fake', 'real']


In [41]:
# Read the meta-level dataset: one row per article, carrying the two base models'
# class probabilities and predicted labels next to the true label
# (the rf_* / bert_* prefixes presumably mean random forest and BERT -- confirm upstream).
meta_df = pd.read_csv("WELFake_meta_dataset.csv")

# Structural overview first, then the first five rows as the cell's rich output.
print("Meta dataset shape:", meta_df.shape)
print("Columns:", list(meta_df.columns))
meta_df.head(5)

Meta dataset shape: (72095, 9)
Columns: ['text_clean', 'rf_prob_fake', 'rf_prob_real', 'bert_prob_fake', 'bert_prob_real', 'rf_pred_label', 'bert_pred_label', 'true_label', 'true_label_enc']


Unnamed: 0,text_clean,rf_prob_fake,rf_prob_real,bert_prob_fake,bert_prob_real,rf_pred_label,bert_pred_label,true_label,true_label_enc
0,no comment is expected from barack obama membe...,0.936667,0.063333,0.998145,0.001855,fake,fake,real,1
1,did they post their votes for hillary already,0.993333,0.006667,2.7e-05,0.999973,fake,real,real,1
2,now most of the demonstrators gathered last ni...,0.966667,0.033333,0.000132,0.999868,fake,real,real,1
3,a dozen politically active pastors came here f...,0.73,0.27,0.999655,0.000345,fake,fake,fake,0
4,the rs 28 sarmat missile dubbed satan 2 will r...,0.583333,0.416667,7.4e-05,0.999926,fake,real,real,1


In [42]:
# Missing values per column. Only the numeric probability columns feed the
# meta-classifier, so gaps in text_clean do not affect the feature matrix.
meta_df.isna().sum()

Unnamed: 0,0
text_clean,841
rf_prob_fake,0
rf_prob_real,0
bert_prob_fake,0
bert_prob_real,0
rf_pred_label,0
bert_pred_label,0
true_label,0
true_label_enc,0


In [43]:
# Meta-features: the fake/real probabilities emitted by each of the two base models.
FEATURE_COLS = ["rf_prob_fake", "rf_prob_real", "bert_prob_fake", "bert_prob_real"]

# Plain NumPy arrays: the feature matrix (one column per entry of FEATURE_COLS)
# and the integer-encoded target.
X = meta_df.loc[:, FEATURE_COLS].to_numpy()
y = meta_df["true_label_enc"].to_numpy()

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (72095, 4)
y shape: (72095,)


In [44]:
# Hold out 20% of the rows for evaluation. stratify=y asks scikit-learn to keep
# the class proportions of y in both splits; random_state pins the shuffle.
# The row index labels are split alongside X and y so that split membership
# can be audited afterwards.
(
    X_train, X_test,
    y_train, y_test,
    idx_train, idx_test,
) = train_test_split(
    X, y, meta_df.index.values,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train samples:", len(X_train))
print("Test samples:", len(X_test))


Train samples: 57676
Test samples: 14419


In [45]:
# Verify that no row index landed in both the training and the test split.
# NOTE(review): this compares index labels only. It cannot detect the same
# article appearing twice under different indices, nor leakage introduced when
# the base-model probabilities were generated -- confirm those separately.
train_index_set, test_index_set = set(idx_train), set(idx_test)
intersection = train_index_set & test_index_set

print("Common samples between train & test:", len(intersection))

# An empty set is falsy, so this fails exactly when at least one index is shared.
assert not intersection
print("No data leakage: Train and test sets are completely disjoint.")


Common samples between train & test: 0
No data leakage: Train and test sets are completely disjoint.


In [46]:
# Hyperparameters for the stacking meta-classifier, gathered in one place:
# many trees (800) paired with a small learning rate (0.02), 90% row subsampling
# per tree, and all feature columns available to every tree.
xgb_params = {
    "n_estimators": 800,
    "max_depth": 6,
    "learning_rate": 0.02,
    "subsample": 0.9,
    "colsample_bytree": 1.0,
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}
meta_clf = XGBClassifier(**xgb_params)

# fit() returns the estimator itself, so the fitted model is shown as the cell output.
meta_clf.fit(X_train, y_train)


In [47]:
# Score the fitted meta-classifier on the held-out split.
y_pred = meta_clf.predict(X_test)

# Compute every metric up front, then print them together.
acc = accuracy_score(y_test, y_pred)
report_text = classification_report(
    y_test, y_pred, target_names=LABEL_CLASSES, zero_division=0
)
conf_mat = confusion_matrix(y_test, y_pred)

print(f"\nMeta-Classifier Accuracy (Leakage-Safe): {acc*100:.2f}%\n")

print("Classification Report:")
print(report_text)

# Rows are true classes and columns are predicted classes (scikit-learn convention).
print("Confusion Matrix:")
print(conf_mat)



Meta-Classifier Accuracy (Leakage-Safe): 94.90%

Classification Report:
              precision    recall  f1-score   support

        fake       0.95      0.95      0.95      7006
        real       0.95      0.95      0.95      7413

    accuracy                           0.95     14419
   macro avg       0.95      0.95      0.95     14419
weighted avg       0.95      0.95      0.95     14419

Confusion Matrix:
[[6623  383]
 [ 353 7060]]


In [48]:
# Persist the fitted meta-classifier to disk with joblib so it can be reloaded later.
dump(value=meta_clf, filename="meta_classifier.pkl")
print("Meta-classifier saved as meta_classifier.pkl")

Meta-classifier saved as meta_classifier.pkl


In [49]:
# Sanity check: inspect the meta-classifier's prediction for a single test row.
# The row is drawn from a dedicated fixed-seed generator, so re-running the
# notebook examines the same sample and the global `random` state is untouched.
# The seed matches the random_state used for the split and the model.
SANITY_CHECK_SEED = 42
rand_pos = random.Random(SANITY_CHECK_SEED).randrange(len(X_test))

# predict / predict_proba take a 2-D array, so the single row is reshaped to (1, n_features).
sample_X = X_test[rand_pos].reshape(1, -1)

# Encoded labels double as positions in LABEL_CLASSES.
true_label_enc = y_test[rand_pos]
true_label = LABEL_CLASSES[true_label_enc]

pred_enc = meta_clf.predict(sample_X)[0]
pred_label = LABEL_CLASSES[pred_enc]

# Confidence is the predicted probability of the class the model actually chose.
proba = meta_clf.predict_proba(sample_X)[0]
confidence = proba[pred_enc]

print("RANDOM SAMPLE EVALUATION")
print("True Label       :", true_label)
print("Predicted Label  :", pred_label)
print(f"Confidence       : {confidence:.4f}")

if pred_label == true_label:
    print("Prediction is CORRECT")
else:
    print("Prediction is WRONG")


RANDOM SAMPLE EVALUATION
True Label       : fake
Predicted Label  : fake
Confidence       : 0.9990
Prediction is CORRECT
