In [20]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

#### Twitter fake news dataset

In [21]:
train = pd.read_csv('../data/raw/twitter/tweetstrain2015.csv')
train.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tweetId,tweetText,label
0,0,0,263046056240115712,se acuerdan de la pel cula el despu de ana rec...,fake
1,1,1,262995061304852481,milenagimon miren sandi en ny tremenda imagen ...,fake
2,2,2,262979898002534400,buena la foto del hurac n sandi recuerda la pe...,fake
3,3,3,262996108400271360,scari shit hurrican ny http co e jlbufh,fake
4,4,4,263018881839411200,fave place world nyc hurrican sandi statueofli...,fake


In [22]:
test = pd.read_csv('../data/raw/twitter/tweetstest2015.csv')
test.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tweetId,tweetText,label
0,0,0,578854927457349632,kereeen rt shyman eclips iss http co je hcfpvfn,fake
1,1,1,578874632670953472,absolut beauti rt shyman eclips iss http co oq...,fake
2,2,2,578891261353984000,shyman eclips iss http co c vfboscrj wow amaz,fake
3,3,3,578846612312748032,eclips iss http co en otvsu,fake
4,4,4,578975333841551360,ebonfigli clips vue de l iss autr chose http c...,fake


In [4]:
#encode output
label_mapping = {'fake': 1, 'real': 0}
train.label = train.label.map(label_mapping)
test.label = test.label.map(label_mapping)

### Train val split

In [5]:
train = train.dropna()
X_train, X_val, y_train, y_val = train_test_split(train['tweetText'], train['label'], test_size=0.2, random_state=42)
X_test, y_test = test['tweetText'], test.label

#### Preprocessing the dataset

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [7]:
X_train_encoded = tokenizer(
    list(X_train.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)
X_val_encoded = tokenizer(
    list(X_val.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)
X_test_encoded = tokenizer(
    list(X_test),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

In [8]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = FakeNewsDataset(X_train_encoded, y_train.values)
val_dataset = FakeNewsDataset(X_val_encoded, y_val.values)
test_dataset = FakeNewsDataset(X_test_encoded, y_test.values)
train_loader = DataLoader(train_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

## Training

In [11]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification
import evaluate
import numpy as np
import os
os.environ['HF_MLFLOW_LOG_ARTIFACTS'] = "1" # save models as artifact for the expirment

In [11]:
def compute_metrics(eval_preds):
    metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels) 

In [None]:
training_args = TrainingArguments(
    output_dir='../models/twitter_dataset_models/distilbert/',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=400,
    evaluation_strategy='steps',
    eval_steps=400,
    load_best_model_at_end=True,
    save_total_limit=3,
    save_steps=400

)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)


In [18]:
trainer.train()

***** Running training *****
  Num examples = 11421
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4284
  Number of trainable parameters = 66955010
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 12%|█▏        | 500/4284 [02:35<19:41,  3.20it/s]***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.5091, 'learning_rate': 5e-05, 'epoch': 0.35}


                                                  
 12%|█▏        | 500/4284 [03:03<19:41,  3.20it/s]Saving model checkpoint to ../models\checkpoint-500
Configuration saved in ../models\checkpoint-500\config.json


{'eval_loss': 0.3523949384689331, 'eval_accuracy': 0.8662464985994398, 'eval_f1': 0.9028978139298424, 'eval_precision': 0.8538461538461538, 'eval_recall': 0.9579288025889967, 'eval_runtime': 28.5341, 'eval_samples_per_second': 100.091, 'eval_steps_per_second': 12.511, 'epoch': 0.35}


Model weights saved in ../models\checkpoint-500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 23%|██▎       | 1000/4284 [06:00<21:14,  2.58it/s] ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.3571, 'learning_rate': 4.339323467230444e-05, 'epoch': 0.7}


                                                   
 23%|██▎       | 1000/4284 [06:32<21:14,  2.58it/s]Saving model checkpoint to ../models\checkpoint-1000
Configuration saved in ../models\checkpoint-1000\config.json


{'eval_loss': 0.3548908531665802, 'eval_accuracy': 0.882703081232493, 'eval_f1': 0.9104995992519369, 'eval_precision': 0.9020645844362096, 'eval_recall': 0.919093851132686, 'eval_runtime': 31.9964, 'eval_samples_per_second': 89.26, 'eval_steps_per_second': 11.157, 'epoch': 0.7}


Model weights saved in ../models\checkpoint-1000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 35%|███▌      | 1500/4284 [09:40<17:36,  2.64it/s]  ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.3033, 'learning_rate': 3.678646934460888e-05, 'epoch': 1.05}


                                                   
 35%|███▌      | 1500/4284 [10:12<17:36,  2.64it/s]Saving model checkpoint to ../models\checkpoint-1500
Configuration saved in ../models\checkpoint-1500\config.json


{'eval_loss': 0.32016709446907043, 'eval_accuracy': 0.8998599439775911, 'eval_f1': 0.9270780214176441, 'eval_precision': 0.879110251450677, 'eval_recall': 0.9805825242718447, 'eval_runtime': 32.3245, 'eval_samples_per_second': 88.354, 'eval_steps_per_second': 11.044, 'epoch': 1.05}


Model weights saved in ../models\checkpoint-1500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 47%|████▋     | 2000/4284 [13:21<14:00,  2.72it/s]  ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.2608, 'learning_rate': 3.017970401691332e-05, 'epoch': 1.4}


                                                   
 47%|████▋     | 2000/4284 [13:53<14:00,  2.72it/s]Saving model checkpoint to ../models\checkpoint-2000
Configuration saved in ../models\checkpoint-2000\config.json


{'eval_loss': 0.3136407434940338, 'eval_accuracy': 0.898109243697479, 'eval_f1': 0.9223793011469726, 'eval_precision': 0.912401055408971, 'eval_recall': 0.9325782092772384, 'eval_runtime': 31.9028, 'eval_samples_per_second': 89.522, 'eval_steps_per_second': 11.19, 'epoch': 1.4}


Model weights saved in ../models\checkpoint-2000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 58%|█████▊    | 2500/4284 [17:05<11:15,  2.64it/s]  ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.2584, 'learning_rate': 2.357293868921776e-05, 'epoch': 1.75}


                                                   
 58%|█████▊    | 2500/4284 [17:37<11:15,  2.64it/s]Saving model checkpoint to ../models\checkpoint-2500
Configuration saved in ../models\checkpoint-2500\config.json


{'eval_loss': 0.2928997576236725, 'eval_accuracy': 0.9117647058823529, 'eval_f1': 0.9337539432176656, 'eval_precision': 0.9107692307692308, 'eval_recall': 0.9579288025889967, 'eval_runtime': 31.8895, 'eval_samples_per_second': 89.559, 'eval_steps_per_second': 11.195, 'epoch': 1.75}


Model weights saved in ../models\checkpoint-2500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 70%|███████   | 3000/4284 [20:48<08:13,  2.60it/s]  ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.2239, 'learning_rate': 1.69661733615222e-05, 'epoch': 2.1}


                                                   
 70%|███████   | 3000/4284 [21:20<08:13,  2.60it/s]Saving model checkpoint to ../models\checkpoint-3000
Configuration saved in ../models\checkpoint-3000\config.json


{'eval_loss': 0.4574283957481384, 'eval_accuracy': 0.8991596638655462, 'eval_f1': 0.9215258855585832, 'eval_precision': 0.9311674008810573, 'eval_recall': 0.9120819848975189, 'eval_runtime': 31.9216, 'eval_samples_per_second': 89.469, 'eval_steps_per_second': 11.184, 'epoch': 2.1}


Model weights saved in ../models\checkpoint-3000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 82%|████████▏ | 3500/4284 [24:27<04:41,  2.78it/s]  ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.1693, 'learning_rate': 1.0359408033826638e-05, 'epoch': 2.45}


                                                   
 82%|████████▏ | 3500/4284 [24:59<04:41,  2.78it/s]Saving model checkpoint to ../models\checkpoint-3500
Configuration saved in ../models\checkpoint-3500\config.json


{'eval_loss': 0.3657529950141907, 'eval_accuracy': 0.9110644257703081, 'eval_f1': 0.9327330508474575, 'eval_precision': 0.9162330905306972, 'eval_recall': 0.9498381877022654, 'eval_runtime': 31.8117, 'eval_samples_per_second': 89.778, 'eval_steps_per_second': 11.222, 'epoch': 2.45}


Model weights saved in ../models\checkpoint-3500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 93%|█████████▎| 4000/4284 [28:09<01:48,  2.63it/s]  ***** Running Evaluation *****
  Num examples = 2856
  Batch size = 8


{'loss': 0.1665, 'learning_rate': 3.7526427061310788e-06, 'epoch': 2.8}


                                                   
 93%|█████████▎| 4000/4284 [28:42<01:48,  2.63it/s]Saving model checkpoint to ../models\checkpoint-4000
Configuration saved in ../models\checkpoint-4000\config.json


{'eval_loss': 0.39100727438926697, 'eval_accuracy': 0.9093137254901961, 'eval_f1': 0.930914910642838, 'eval_precision': 0.920844327176781, 'eval_recall': 0.9412081984897519, 'eval_runtime': 32.3478, 'eval_samples_per_second': 88.29, 'eval_steps_per_second': 11.036, 'epoch': 2.8}


Model weights saved in ../models\checkpoint-4000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 4284/4284 [30:36<00:00,  2.87it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 4284/4284 [30:36<00:00,  2.33it/s]

{'train_runtime': 1836.8729, 'train_samples_per_second': 18.653, 'train_steps_per_second': 2.332, 'train_loss': 0.2736295999265185, 'epoch': 3.0}





TrainOutput(global_step=4284, training_loss=0.2736295999265185, metrics={'train_runtime': 1836.8729, 'train_samples_per_second': 18.653, 'train_steps_per_second': 2.332, 'train_loss': 0.2736295999265185, 'epoch': 3.0})

## Load best model

In [12]:
model = DistilBertForSequenceClassification.from_pretrained('../models/twitter_dataset_models/checkpoint-4000/')

In [13]:
model = model.to('cuda')

### Calculate performance

In [18]:
# calculate accuracy
from tqdm import tqdm
acc = 0.0
with torch.no_grad():
    for data in tqdm(test_loader):
        input_ids, labels = data['input_ids'].to('cuda'), data['labels'].to('cuda')
        out = torch.softmax(model(input_ids).logits, dim=1)
        acc += torch.sum(torch.argmax(out, dim=1) == labels) / len(input_ids)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
100%|██████████| 118/118 [00:35<00:00,  3.35it/s]


In [19]:
print(f"Accuracy: {acc / len(test_loader)}")

Accuracy: 0.8676087856292725
