In [1]:
!pip install -q transformers datasets torch accelerate


In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "FAKE", 1: "REAL"},
    label2id={"FAKE": 0, "REAL": 1}
).to(device)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Fine tuning**

In [3]:
import pandas as pd
from datasets import Dataset

fake = pd.read_csv("/Fake.csv")
true = pd.read_csv("/True.csv")

fake["label"] = 0
true["label"] = 1

df = pd.concat([fake, true], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df = df[["text", "label"]]
dataset = Dataset.from_pandas(df)
dataset


Dataset({
    features: ['text', 'label'],
    num_rows: 44898
})

In [4]:
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 35918
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8980
    })
})

**Tokenization**

In [5]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.remove_columns(["text"])
tokenized.set_format("torch")


Map:   0%|          | 0/35918 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

In [6]:
from transformers import Trainer, TrainingArguments

In [7]:
import inspect
from transformers import TrainingArguments

print(inspect.signature(TrainingArguments.__init__))




In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    report_to="none"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer
)


  trainer = Trainer(


In [9]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0023,0.005829
2,0.0,0.003692


TrainOutput(global_step=4490, training_loss=0.005378150807606483, metrics={'train_runtime': 1694.579, 'train_samples_per_second': 42.392, 'train_steps_per_second': 2.65, 'total_flos': 4757964024926208.0, 'train_loss': 0.005378150807606483, 'epoch': 2.0})

In [10]:
model.save_pretrained("distilbert-fake-news")
tokenizer.save_pretrained("distilbert-fake-news")


('distilbert-fake-news/tokenizer_config.json',
 'distilbert-fake-news/special_tokens_map.json',
 'distilbert-fake-news/vocab.txt',
 'distilbert-fake-news/added_tokens.json',
 'distilbert-fake-news/tokenizer.json')

In [11]:
!zip -r distilbert-fake-news.zip distilbert-fake-news


updating: distilbert-fake-news/ (stored 0%)
updating: distilbert-fake-news/vocab.txt (deflated 53%)
updating: distilbert-fake-news/special_tokens_map.json (deflated 42%)
updating: distilbert-fake-news/model.safetensors (deflated 8%)
updating: distilbert-fake-news/config.json (deflated 46%)
updating: distilbert-fake-news/tokenizer.json (deflated 71%)
updating: distilbert-fake-news/tokenizer_config.json (deflated 75%)
