In [53]:
import os
import sys
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    DistilBertConfig,
    get_linear_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump
from tqdm.auto import tqdm
import torch.nn.functional as F
import torch.optim as optim

from preprocess import clean_text  # from ml_backend/preprocess.py

In [54]:
true_df = pd.read_csv("../../data/True.csv")
fake_df = pd.read_csv("../../data/Fake.csv")

print("True shape:", true_df.shape)
print("Fake shape:", fake_df.shape)
print("True columns:", true_df.columns.tolist())
print("Fake columns:", fake_df.columns.tolist())

True shape: (21417, 4)
Fake shape: (23481, 4)
True columns: ['title', 'text', 'subject', 'date']
Fake columns: ['title', 'text', 'subject', 'date']


In [55]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [56]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [57]:
true_df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0


In [58]:
fake_df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0


In [59]:
true_df["label"] = "real"
fake_df["label"] = "fake"

# For text, we concatenate title + text for richer context
def combine_title_text(df):
    # handle missing safely
    title = df["title"].fillna("")
    text = df["text"].fillna("")
    return (title + ". " + text).str.strip()

true_df["statement"] = combine_title_text(true_df)
fake_df["statement"] = combine_title_text(fake_df)

# Keep only what we need
true_df = true_df[["statement", "label"]]
fake_df = fake_df[["statement", "label"]]

# Concatenate
full_df = pd.concat([true_df, fake_df], axis=0).reset_index(drop=True)

# Shuffle (for safety)
full_df = full_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

print("Full dataset shape:", full_df.shape)
full_df.head()


Full dataset shape: (44898, 2)


Unnamed: 0,statement,label
0,BREAKING: GOP Chairman Grassley Has Had Enough...,fake
1,Failed GOP Candidates Remembered In Hilarious ...,fake
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY ...,fake
3,California AG pledges to defend birth control ...,real
4,AZ RANCHERS Living On US-Mexico Border Destroy...,fake


In [60]:
full_df = full_df.dropna(subset=["statement", "label"]).reset_index(drop=True)
print("After dropna:", full_df.shape)

print("Label distribution (overall):")
print(full_df["label"].value_counts(normalize=True))

# Train / Valid / Test split: 70 / 15 / 15 (stratified)
train_df, temp_df = train_test_split(
    full_df,
    test_size=0.30,
    random_state=42,
    stratify=full_df["label"]
)

valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=temp_df["label"]
)

print("Train:", train_df.shape, "Valid:", valid_df.shape, "Test:", test_df.shape)
print("Train label dist:")
print(train_df["label"].value_counts(normalize=True))


After dropna: (44898, 2)
Label distribution (overall):
label
fake    0.522985
real    0.477015
Name: proportion, dtype: float64
Train: (31428, 2) Valid: (6735, 2) Test: (6735, 2)
Train label dist:
label
fake    0.522973
real    0.477027
Name: proportion, dtype: float64


In [23]:
le = LabelEncoder()
y_train = le.fit_transform(train_df["label"].astype(str))
y_valid = le.transform(valid_df["label"].astype(str))
y_test  = le.transform(test_df["label"].astype(str))

print("Label classes:", list(le.classes_))  # expect ['fake', 'real']
num_labels = len(le.classes_)

Label classes: ['fake', 'real']


In [25]:
train_df["statement_clean"] = train_df["statement"].astype(str).apply(clean_text)
valid_df["statement_clean"] = valid_df["statement"].astype(str).apply(clean_text)
test_df["statement_clean"]  = test_df["statement"].astype(str).apply(clean_text)

print("Sample original:", train_df["statement"].iloc[0][:300], "...")
print("Sample cleaned :", train_df["statement_clean"].iloc[0][:300], "...")


Sample original: Trump ‘Diversity Council’ Member Threatens to Quit If Trump Ends DACA…Bye, Bye! [Video]. A member of President Trump s  Diversity Council  is threatening to quit because he opposes Trump s cancelation of DACA. Bye Bye!Trump diversity council member tells @Acosta he may quit the council if Trump move ...
Sample cleaned : trump diversity council member threatens to quit if trump ends daca bye bye video a member of president trump s diversity council is threatening to quit because he opposes trump s cancelation of daca bye bye trump diversity council member tells he may quit the council if trump moves ahead to end dac ...


In [26]:
MAX_LEN = 256
BATCH_SIZE = 8

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = NewsDataset(train_df["statement_clean"], y_train, tokenizer, MAX_LEN)
valid_dataset = NewsDataset(valid_df["statement_clean"], y_valid, tokenizer, MAX_LEN)
test_dataset  = NewsDataset(test_df["statement_clean"],  y_test,  tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE)

len(train_loader), len(valid_loader), len(test_loader)


(3929, 842, 842)

In [27]:
from transformers import DistilBertConfig, get_linear_schedule_with_warmup
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model_name = "distilbert-base-uncased"

# compute class weights (for fake/real imbalance)
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class weights:", class_weights)

# config with our num_labels
config = DistilBertConfig.from_pretrained(
    base_model_name,
    num_labels=num_labels
)

# load base DistilBERT (likely already cached from earlier work)
model = DistilBertForSequenceClassification.from_pretrained(
    base_model_name,
    config=config
)
model.to(device)

EPOCHS = 4
LEARNING_RATE = 3e-5

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

num_training_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

print("Model initialized from:", base_model_name)
print("Total training steps:", num_training_steps)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: tensor([0.9561, 1.0482], device='cuda:0')
Model initialized from: distilbert-base-uncased
Total training steps: 15716


In [28]:
def train_epoch(model, data_loader, optimizer, scheduler, device, class_weights):
    model.train()
    total_loss = 0.0

    for batch in tqdm(data_loader, desc="Training", leave=False):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits

        # weighted cross-entropy loss
        loss = F.cross_entropy(logits, labels, weight=class_weights)

        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    return avg_loss


def eval_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    report = classification_report(
        all_labels,
        all_preds,
        target_names=le.classes_,
        zero_division=0
    )
    return acc, report


In [29]:
best_val_acc = 0.0
best_state_dict = None

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, class_weights)
    val_acc, val_report = eval_model(model, valid_loader, device)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")
    print("Validation report:")
    print(val_report)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state_dict = model.state_dict()
        print(f"New best model saved (val acc = {best_val_acc:.4f})")

# load best model
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print(f"\n Loaded best model with val acc = {best_val_acc:.4f}")
else:
    print("\n No improvement detected; using last epoch model.")



Epoch 1/4


Training:   0%|          | 0/3929 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/842 [00:00<?, ?it/s]

Train Loss: 0.0540
Validation Accuracy: 0.9994
Validation report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3522
        real       1.00      1.00      1.00      3213

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735

New best model saved (val acc = 0.9994)

Epoch 2/4


Training:   0%|          | 0/3929 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/842 [00:00<?, ?it/s]

Train Loss: 0.0078
Validation Accuracy: 0.9984
Validation report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3522
        real       1.00      1.00      1.00      3213

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735


Epoch 3/4


Training:   0%|          | 0/3929 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/842 [00:00<?, ?it/s]

Train Loss: 0.0022
Validation Accuracy: 0.9994
Validation report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3522
        real       1.00      1.00      1.00      3213

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735


Epoch 4/4


Training:   0%|          | 0/3929 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/842 [00:00<?, ?it/s]

Train Loss: 0.0003
Validation Accuracy: 0.9994
Validation report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3522
        real       1.00      1.00      1.00      3213

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735


 Loaded best model with val acc = 0.9994


In [30]:
test_acc, test_report = eval_model(model, test_loader, device)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")
print("Test Classification Report:")
print(test_report)

Evaluating:   0%|          | 0/842 [00:00<?, ?it/s]


Final Test Accuracy: 0.9994
Test Classification Report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3523
        real       1.00      1.00      1.00      3212

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735



In [31]:
df = pd.read_csv("../../data/WELFake_Dataset.csv")
print("Dataset Loaded:", df.shape)
print(df.head())
print(df.columns)

Dataset Loaded: (72134, 4)
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  
Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')


In [32]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [33]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,558
text,39
label,0


In [34]:
df.dropna(inplace=True)

In [35]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [36]:
TEXT_COL = "text"
LABEL_COL = "label"

print("Text column:", TEXT_COL)
print("Label column:", LABEL_COL)

# Standardize labels to match training encoder
df["label"] = df[LABEL_COL].astype(str).str.lower()
df["label"] = df["label"].replace({
    "fake": "fake",
    "real": "real",
    "0": "fake",
    "1": "real"
})

df = df[df["label"].isin(le.classes_)]
df = df.dropna(subset=[TEXT_COL]).reset_index(drop=True)

print("After cleaning:", df.shape)
print(df["label"].value_counts())


Text column: text
Label column: label
After cleaning: (71537, 4)
label
real    36509
fake    35028
Name: count, dtype: int64


In [37]:
df.iloc[0]

Unnamed: 0,0
Unnamed: 0,0
title,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
text,No comment is expected from Barack Obama Membe...
label,real


In [38]:
def combine_title_text(df):
    # handle missing safely
    title = df["title"].fillna("")
    text = df["text"].fillna("")
    return (title + ". " + text).str.strip()
df["text"] = combine_title_text(df)

In [39]:
df.iloc[0]

Unnamed: 0,0
Unnamed: 0,0
title,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
text,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
label,real


In [40]:
df=df.drop(['title', 'Unnamed: 0'], axis=1)

In [41]:
df.columns

Index(['text', 'label'], dtype='object')

In [42]:
df.head()

Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,real
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,real
2,"Bobby Jindal, raised Hindu, uses story of Chri...",fake
3,SATAN 2: Russia unvelis an image of its terrif...,real
4,About Time! Christian Group Sues Amazon and SP...,real


In [43]:
# Apply cleaning
df["text_clean"] = df[TEXT_COL].astype(str).apply(clean_text)

# Encode labels using the saved encoder
y_true = le.transform(df["label"])


In [44]:
max_len = 256
BATCH_SIZE = 8
class TestDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[index], dtype=torch.long)
        }

test_dataset = TestDataset(df["text_clean"], y_true, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=8)


In [45]:
def evaluate_model(model, data_loader):
    model.eval()
    preds_all = []
    labels_all = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Testing"):
            inputs = batch["input_ids"].to(device)
            masks = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(inputs, attention_mask=masks)
            preds = torch.argmax(outputs.logits, dim=1)

            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    acc = accuracy_score(labels_all, preds_all)
    report = classification_report(labels_all, preds_all, target_names=le.classes_)

    return acc, report


acc, report = evaluate_model(model, test_loader)
print("\n📌 Test Accuracy on NEW Dataset:", round(acc*100, 2), "%")
print("\nClassification Report:\n", report)


Testing:   0%|          | 0/8943 [00:00<?, ?it/s]


📌 Test Accuracy on NEW Dataset: 18.75 %

Classification Report:
               precision    recall  f1-score   support

        fake       0.27      0.38      0.31     35028
        real       0.01      0.00      0.00     36509

    accuracy                           0.19     71537
   macro avg       0.14      0.19      0.16     71537
weighted avg       0.13      0.19      0.16     71537



In [46]:
from sklearn.model_selection import train_test_split

print("New dataset size:", df.shape)
print("Label distribution:")
print(df["label"].value_counts(normalize=True))

train_df_new, val_df_new = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

print("Train new:", train_df_new.shape, "Val new:", val_df_new.shape)

y_train_new = le.transform(train_df_new["label"])
y_val_new   = le.transform(val_df_new["label"])


New dataset size: (71537, 3)
Label distribution:
label
real    0.510351
fake    0.489649
Name: proportion, dtype: float64
Train new: (57229, 3) Val new: (14308, 3)


In [47]:
from torch.utils.data import Dataset, DataLoader

class NewsDatasetNew(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].flatten(),
            "attention_mask": enc["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset_new = NewsDatasetNew(
    train_df_new["text_clean"], y_train_new, tokenizer, max_len
)
val_dataset_new = NewsDatasetNew(
    val_df_new["text_clean"], y_val_new, tokenizer, max_len
)

train_loader_new = DataLoader(train_dataset_new, batch_size=8, shuffle=True)
val_loader_new   = DataLoader(val_dataset_new, batch_size=8)

len(train_loader_new), len(val_loader_new)


(7154, 1789)

In [48]:
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F

classes_new = np.unique(y_train_new)
cw = compute_class_weight(class_weight="balanced", classes=classes_new, y=y_train_new)
class_weights_new = torch.tensor(cw, dtype=torch.float).to(device)
print("New class weights:", class_weights_new)

EPOCHS_NEW = 2
LR_NEW = 2e-5

optimizer_new = optim.Adam(model.parameters(), lr=LR_NEW)

num_training_steps_new = EPOCHS_NEW * len(train_loader_new)
scheduler_new = get_linear_schedule_with_warmup(
    optimizer_new,
    num_warmup_steps=int(0.1 * num_training_steps_new),
    num_training_steps=num_training_steps_new
)

print("Training steps (new):", num_training_steps_new)


New class weights: tensor([1.0211, 0.9797], device='cuda:0')
Training steps (new): 14308


In [49]:
from tqdm.auto import tqdm

def train_one_epoch_new(model, data_loader, optimizer, scheduler, device, class_weights):
    model.train()
    total_loss = 0.0
    for batch in tqdm(data_loader, desc="Fine-tuning on new data"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = F.cross_entropy(logits, labels, weight=class_weights)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


def eval_model_new(model, data_loader, device):
    model.eval()
    preds_all, labels_all = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating on new val"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    acc = accuracy_score(labels_all, preds_all)
    rep = classification_report(labels_all, preds_all, target_names=le.classes_, zero_division=0)
    return acc, rep


In [50]:
best_val_acc_new = 0.0
best_state_dict_new = None

for epoch in range(EPOCHS_NEW):
    print(f"\nEpoch {epoch+1}/{EPOCHS_NEW} (domain adaptation)")
    train_loss = train_one_epoch_new(model, train_loader_new, optimizer_new, scheduler_new, device, class_weights_new)
    val_acc_new, val_rep_new = eval_model_new(model, val_loader_new, device)

    print(f"Train loss (new data): {train_loss:.4f}")
    print(f"Val Accuracy (new data): {val_acc_new:.4f}")
    print(val_rep_new)

    if val_acc_new > best_val_acc_new:
        best_val_acc_new = val_acc_new
        best_state_dict_new = model.state_dict()
        print("New best model on new dataset saved.")

if best_state_dict_new is not None:
    model.load_state_dict(best_state_dict_new)
    print(f"\nLoaded best adapted model (val acc on new = {best_val_acc_new:.4f})")



Epoch 1/2 (domain adaptation)


Fine-tuning on new data:   0%|          | 0/7154 [00:00<?, ?it/s]

Evaluating on new val:   0%|          | 0/1789 [00:00<?, ?it/s]

Train loss (new data): 0.4016
Val Accuracy (new data): 0.9718
              precision    recall  f1-score   support

        fake       0.99      0.96      0.97      7006
        real       0.96      0.99      0.97      7302

    accuracy                           0.97     14308
   macro avg       0.97      0.97      0.97     14308
weighted avg       0.97      0.97      0.97     14308

New best model on new dataset saved.

Epoch 2/2 (domain adaptation)


Fine-tuning on new data:   0%|          | 0/7154 [00:00<?, ?it/s]

Evaluating on new val:   0%|          | 0/1789 [00:00<?, ?it/s]

Train loss (new data): 0.0586
Val Accuracy (new data): 0.9774
              precision    recall  f1-score   support

        fake       0.98      0.98      0.98      7006
        real       0.98      0.98      0.98      7302

    accuracy                           0.98     14308
   macro avg       0.98      0.98      0.98     14308
weighted avg       0.98      0.98      0.98     14308

New best model on new dataset saved.

Loaded best adapted model (val acc on new = 0.9774)


In [51]:
# Reuse earlier evaluate_model_new on full new dataset loader
test_acc_new_after, test_rep_new_after = eval_model_new(model, test_loader, device)

print("\nAccuracy on NEW dataset After adaptation:", round(test_acc_new_after*100, 2), "%")
print(test_rep_new_after)


Evaluating on new val:   0%|          | 0/8943 [00:00<?, ?it/s]


Accuracy on NEW dataset After adaptation: 99.15 %
              precision    recall  f1-score   support

        fake       0.99      0.99      0.99     35028
        real       0.99      0.99      0.99     36509

    accuracy                           0.99     71537
   macro avg       0.99      0.99      0.99     71537
weighted avg       0.99      0.99      0.99     71537



In [52]:
save_obj_adapted = {
    "model_state_dict": model.state_dict(),
    "model_name": "distilbert-base-uncased",
    "num_labels": num_labels,
    "max_len": MAX_LEN
}
dump(save_obj_adapted, "transformer_model.pkl")


['transformer_model.pkl']