In [11]:
import wandb
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer
import sentencepiece
from transformers import XLNetForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score
import torch

In [2]:
# Load the WELFake CSV, discard rows with any missing value, and fold the
# headline into the body so the classifier sees "title text" as one string.
df = (
    pd.read_csv("./dataset/WELFake_Dataset.csv")
    .dropna()
    .assign(text=lambda frame: frame['title'] + " " + frame['text'])
)


In [3]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


In [4]:
# Requires the `sentencepiece` package (pip install sentencepiece).
# Use the same hub repository id as the model cell so the tokenizer and the
# model weights are resolved from (and cached under) the same checkpoint.
tokenizer = XLNetTokenizer.from_pretrained(
    'xlnet/xlnet-base-cased',
    cache_dir='./model',
)

def preprocess(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        The tokenizer output, with every sequence truncated/padded to
        exactly 256 tokens.
    """
    # No ``return_tensors`` here: the result is written back to the dataset's
    # Arrow storage, so building torch tensors at this stage is wasted work.
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=256,
    )

# Convert the pandas splits into Hugging Face datasets, keeping only the model
# input and the target. preserve_index=False stops the shuffled pandas index
# from being carried along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['text', 'label']], preserve_index=False)

In [5]:
# Run the tokenizer over both splits, a batch of rows at a time.
train_dataset = train_dataset.map(
    function=preprocess,
    batched=True,
)
val_dataset = val_dataset.map(
    function=preprocess,
    batched=True,
)

Map: 100%|██████████| 57229/57229 [00:57<00:00, 997.28 examples/s] 
Map: 100%|██████████| 14308/14308 [00:14<00:00, 1002.44 examples/s]


In [7]:
# Load pretrained XLNet with a two-class sequence-classification head.
# The head weights are freshly initialized, so the model must be fine-tuned
# before its predictions mean anything.
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet/xlnet-base-cased',
    cache_dir='./model',
    problem_type="single_label_classification",
    num_labels=2,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [9]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimization schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch so the checkpoint with the best
    # accuracy can be reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {'accuracy': accuracy}

# Wire the model, both data splits and the metric function into a Trainer,
# then run fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [16]:
def predict(text):
    """Return class probabilities for ``text`` from the classifier.

    Args:
        text: The text to classify, passed straight to the tokenizer.

    Returns:
        A 2-D NumPy array of softmax probabilities, one row per input and
        one column per class.
    """
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )

    # The model may live on a GPU (e.g. after training); the encoded inputs
    # must be on the same device or the forward pass raises.
    device = next(model.parameters()).device
    inputs = inputs.to(device)

    # Inference must run in eval mode (dropout disabled) and without building
    # an autograd graph. Restore the previous mode afterwards so calling this
    # function does not silently change the model's training state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    probs = torch.nn.functional.softmax(logits, dim=-1)
    # .cpu() is required before .numpy() when the tensor is on an accelerator.
    return probs.detach().cpu().numpy()


# Headline to classify. Alternative example to try:
# sample_text = "Breaking: NASA announces discovery of alien life on Mars"
sample_text = (
    "Mark Zuckerberg defends Meta’s social media acquisitions "
    "in first day of antitrust trial"
)
probabilities = predict(sample_text)
# NOTE(review): assumes class index 0 = fake and 1 = real — confirm against
# the dataset's label definition.
print(f"Fake: {probabilities[0][0]:.4f}, Real: {probabilities[0][1]:.4f}")

Fake: 0.4491, Real: 0.5509
