### Kaggle fake news dataset

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('../data/raw/kaggle_fake_news/train.csv')
test = pd.read_csv('../data/raw/kaggle_fake_news/test.csv')

In [3]:
train.shape

(20800, 5)

In [4]:
test.shape

(5200, 4)

In [5]:
train = train.dropna()
X_train, X_val, y_train, y_val = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)

#### Preprocessing the dataset

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [7]:
X_train_encoded = tokenizer(
    list(X_train.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)
X_val_encoded = tokenizer(
    list(X_val.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

In [10]:
list(X_train.values)

 'The   polls are beginning to trickle in, and it already seems clear that Hillary Clinton has received a bump after the first presidential debate. A wave of   battleground state surveys released on Friday showed her with a comfortable advantage in New Hampshire, Nevada, Florida and Michigan. The gains suggest she might lead by as much as five percentage points nationwide, up from about two to three points before last Monday’s debate. It’s hard to know whether the shift will last. If you’ve been following The Upshot’s coverage of polling over the last two years, you know that we’re pretty circumspect about shifts in the polls. But no matter how you cut it, the debate is bad news for Donald J. Trump. As we wrote ahead of the debate, it has been hard to make sense of the polls over the last few months. Mrs. Clinton’s lead has bobbed up and down, between two and eight percentage points since the spring. To oversimplify, there are two basic ways to interpret it. One possibility is that eve

In [9]:
X_train_encoded['input_ids']

tensor([[  101,  2009,  1521,  ...,  9264,  2001,   102],
        [  101,  1996, 14592,  ...,  1037,  2695,   102],
        [  101,  2137,  9974,  ...,  2293,  2466,   102],
        ...,
        [  101,  4068,  1517,  ...,  1524,  2002,   102],
        [  101,  2899,  1010,  ...,  1012, 10474,   102],
        [  101,  6734,  1010,  ...,     0,     0,     0]])

In [6]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
train_dataset = FakeNewsDataset(X_train_encoded, y_train.values)
val_dataset = FakeNewsDataset(X_val_encoded, y_val.values)
train_loader = DataLoader(train_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

## Training

**TODO**: How to add metric calculation during evaluation in trainer (compute_metric()) : https://huggingface.co/course/chapter3/3?fw=pt

In [10]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification
import evaluate
import numpy as np
import os
os.environ['HF_MLFLOW_LOG_ARTIFACTS'] = "1" # save models as artifact for the expirment

In [11]:
def compute_metrics(eval_preds):
    metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels) 

In [12]:
training_args = TrainingArguments(
    output_dir='../models/kaggle_dataset_models/distilbert/',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=400,
    evaluation_strategy='steps',
    eval_steps=400,
    load_best_model_at_end=True,
    save_total_limit=3,
    save_steps=400

)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [13]:
trainer.train()

***** Running training *****
  Num examples = 14628
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9145
  Number of trainable parameters = 66955010
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  4%|▍         | 400/9145 [02:11<47:09,  3.09it/s]  ***** Running Evaluation *****
  Num examples = 3657
  Batch size = 8


{'loss': 0.3353, 'learning_rate': 4e-05, 'epoch': 0.22}


                                                  
  4%|▍         | 400/9145 [03:04<47:09,  3.09it/s]Saving model checkpoint to ../models/kaggle_dataset_models/distilbert/checkpoint-400
Configuration saved in ../models/kaggle_dataset_models/distilbert/checkpoint-400\config.json


{'eval_loss': 0.22528265416622162, 'eval_accuracy': 0.9384741591468416, 'eval_f1': 0.9238578680203046, 'eval_precision': 0.9891304347826086, 'eval_recall': 0.8666666666666667, 'eval_runtime': 53.1886, 'eval_samples_per_second': 68.755, 'eval_steps_per_second': 8.611, 'epoch': 0.22}


Model weights saved in ../models/kaggle_dataset_models/distilbert/checkpoint-400\pytorch_model.bin
Logging checkpoint artifacts in checkpoint-400. This may take time.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  9%|▊         | 800/9145 [05:35<52:04,  2.67it/s]   ***** Running Evaluation *****
  Num examples = 3657
  Batch size = 8


{'loss': 0.0988, 'learning_rate': 4.826489300173511e-05, 'epoch': 0.44}


                                                  
  9%|▊         | 800/9145 [06:34<52:04,  2.67it/s]Saving model checkpoint to ../models/kaggle_dataset_models/distilbert/checkpoint-800
Configuration saved in ../models/kaggle_dataset_models/distilbert/checkpoint-800\config.json


{'eval_loss': 0.038192447274923325, 'eval_accuracy': 0.9937106918238994, 'eval_f1': 0.9926914521766761, 'eval_precision': 0.9936386768447837, 'eval_recall': 0.9917460317460317, 'eval_runtime': 58.6185, 'eval_samples_per_second': 62.386, 'eval_steps_per_second': 7.813, 'epoch': 0.44}


Model weights saved in ../models/kaggle_dataset_models/distilbert/checkpoint-800\pytorch_model.bin
Logging checkpoint artifacts in checkpoint-800. This may take time.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 13%|█▎        | 1200/9145 [09:16<55:23,  2.39it/s]  ***** Running Evaluation *****
  Num examples = 3657
  Batch size = 8


{'loss': 0.0443, 'learning_rate': 4.595141700404859e-05, 'epoch': 0.66}


                                                   
 13%|█▎        | 1200/9145 [10:19<55:23,  2.39it/s]Saving model checkpoint to ../models/kaggle_dataset_models/distilbert/checkpoint-1200
Configuration saved in ../models/kaggle_dataset_models/distilbert/checkpoint-1200\config.json


{'eval_loss': 0.03136672079563141, 'eval_accuracy': 0.9915231063713427, 'eval_f1': 0.9902177343010414, 'eval_precision': 0.9843161856963614, 'eval_recall': 0.9961904761904762, 'eval_runtime': 62.5644, 'eval_samples_per_second': 58.452, 'eval_steps_per_second': 7.32, 'epoch': 0.66}


Model weights saved in ../models/kaggle_dataset_models/distilbert/checkpoint-1200\pytorch_model.bin
Logging checkpoint artifacts in checkpoint-1200. This may take time.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 17%|█▋        | 1564/9145 [12:47<50:09,  2.52it/s]   

KeyboardInterrupt: 

## Load best model

In [20]:
model = DistilBertForSequenceClassification.from_pretrained('mlruns/0/01c9a40387ba4c33a455d1963c9099e8/artifacts/checkpoint-1200/artifacts/checkpoint-1200')

loading configuration file mlruns/0/01c9a40387ba4c33a455d1963c9099e8/artifacts/checkpoint-1200/artifacts/checkpoint-1200\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file mlruns/0/01c9a40387ba4c33a455d1963c9099e8/artifacts/checkpoint-1200/artifacts/checkpoint-1200\pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassifica

In [21]:
model = model.to('cuda')

### Calculate performance

In [22]:
# calculate accuracy
from tqdm import tqdm
acc = 0.0
with torch.no_grad():
    for data in tqdm(val_loader):
        input_ids, labels = data['input_ids'].to('cuda'), data['labels'].to('cuda')
        out = torch.softmax(model(input_ids).logits, dim=1)
        acc += torch.sum(torch.argmax(out, dim=1) == labels) / len(input_ids)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 115/115 [00:46<00:00,  2.46it/s]


In [23]:
print(f"Accuracy: {acc / len(val_loader)}")

Accuracy: 0.9926630258560181


### Pipeline test

In [4]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

In [7]:
classifier = pipeline(
    'text-classification',
    model='../mlruns/0/0d522e876c104bf1a08ad1d908c4e612/artifacts/checkpoint-4800/artifacts/checkpoint-4800',
    tokenizer='distilbert-base-uncased'
)

In [8]:
classifier('Donald trump decided to go out and have fun!')

[{'label': 'LABEL_1', 'score': 0.9997836947441101}]