In [17]:
!pip install -U transformers datasets scikit-learn




In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load only top 100 rows from each
fake = pd.read_csv('Fake.csv').head(100)
true = pd.read_csv('True.csv').head(100)

fake['label'] = 0
true['label'] = 1

df = pd.concat([fake, true])
df = df.sample(frac=1).reset_index(drop=True)

X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [19]:
from datasets import Dataset
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'label': y_train}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_val, 'label': y_val}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'label': y_test}))

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Remove text column (model doesn't need it)
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [20]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [21]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


TrainOutput(global_step=60, training_loss=0.11676035722096761, metrics={'train_runtime': 1611.7446, 'train_samples_per_second': 0.298, 'train_steps_per_second': 0.037, 'total_flos': 63584351354880.0, 'train_loss': 0.11676035722096761, 'epoch': 3.0})

In [22]:
val_metrics = trainer.evaluate()
print("✅ Validation Accuracy:", val_metrics)

test_metrics = trainer.evaluate(test_dataset)
print("✅ Test Accuracy:", test_metrics)

# Save model for Streamlit app
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")


✅ Validation Accuracy: {'eval_loss': 0.005911082029342651, 'eval_accuracy': 1.0, 'eval_runtime': 16.4617, 'eval_samples_per_second': 1.215, 'eval_steps_per_second': 0.182, 'epoch': 3.0}
✅ Test Accuracy: {'eval_loss': 0.005349916405975819, 'eval_accuracy': 1.0, 'eval_runtime': 17.3746, 'eval_samples_per_second': 1.151, 'eval_steps_per_second': 0.173, 'epoch': 3.0}


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

In [None]:
!zip -r model.zip model/


In [None]:
from google.colab import files
files.download("model.zip")
