In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from datasets import Dataset, load_dataset, Features, Value, ClassLabel

import torch
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AdamW, Adafactor, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix
from transformers import EvalPrediction

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
TEXT_LABELS = {0: 'Satire', 1: 'Hoax', 2: 'Propaganda', 3: 'Reliable'}

In [3]:
class_names = list(TEXT_LABELS.values())
num_labels = len(class_names)
features = Features({'text': Value('string'), 'label': ClassLabel(num_classes=len(class_names), names=class_names)})

In [4]:
data_files = {"train": "/kaggle/input/labeled-unreliable-news-lun/train.csv", 
              "test": "/kaggle/input/labeled-unreliable-news-lun/test.csv"}
dataset = load_dataset("csv", data_files=data_files)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-2b6a7fe4721662f1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-2b6a7fe4721662f1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
hf_model_name = "xlnet-base-cased"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
  encoding["labels"] = examples["label"]  
    
  return encoding

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [7]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names) # Remove the original columns as they are already encoded.
encoded_dataset.set_format("torch")

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(hf_model_name,
                                                           ignore_mismatched_sizes=True,
                                                           num_labels=num_labels)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [9]:
batch_size = 8
metric_name = "f1"
epochs = 3

train_data_size = dataset['train'].num_rows
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

# optimizer = AdamW(model.parameters(), lr=3e-5)
optimizer = Adafactor(model.parameters(), lr=3e-5, relative_step=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

In [10]:
args = TrainingArguments(
    f"xlnet_lun",
    logging_strategy='epoch',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    num_train_epochs=epochs,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to="none",
    #push_to_hub=True,
)

In [11]:
def multi_class_metrics(predictions, labels):
    y_pred = np.argmax(predictions, axis=1)
    y_true = labels
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'f1': f1_macro_average,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_class_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

Training

In [12]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [13]:
trainer.train()

***** Running training *****
  Num examples = 48854
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18321
  Number of trainable parameters = 117312004
You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3874,2.655768,0.615638,0.639
2,0.0541,2.989231,0.623022,0.648
3,0.0385,2.937502,0.636626,0.663667


***** Running Evaluation *****
  Num examples = 3000
  Batch size = 8
Saving model checkpoint to xlnet_lun/checkpoint-6107
Configuration saved in xlnet_lun/checkpoint-6107/config.json
Model weights saved in xlnet_lun/checkpoint-6107/pytorch_model.bin
tokenizer config file saved in xlnet_lun/checkpoint-6107/tokenizer_config.json
Special tokens file saved in xlnet_lun/checkpoint-6107/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 8
Saving model checkpoint to xlnet_lun/checkpoint-12214
Configuration saved in xlnet_lun/checkpoint-12214/config.json
Model weights saved in xlnet_lun/checkpoint-12214/pytorch_model.bin
tokenizer config file saved in xlnet_lun/checkpoint-12214/tokenizer_config.json
Special tokens file saved in xlnet_lun/checkpoint-12214/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 8
Saving model checkpoint to xlnet_lun/checkpoint-18321
Configuration saved in xlnet_lun/checkpoint-18321/con

TrainOutput(global_step=18321, training_loss=0.16001373052818274, metrics={'train_runtime': 17479.5016, 'train_samples_per_second': 8.385, 'train_steps_per_second': 1.048, 'total_flos': 4.175331599845786e+16, 'train_loss': 0.16001373052818274, 'epoch': 3.0})

In [14]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 3000
  Batch size = 8


{'eval_loss': 2.937501907348633,
 'eval_f1': 0.6366261723175463,
 'eval_accuracy': 0.6636666666666666,
 'eval_runtime': 145.0911,
 'eval_samples_per_second': 20.677,
 'eval_steps_per_second': 2.585,
 'epoch': 3.0}

## Predictions

In [15]:
y_logits = trainer.predict(encoded_dataset["test"]).predictions
y_pred = np.argmax(y_logits, axis=1)
y_true = dataset['test']['label']
print(classification_report(y_pred, y_true, target_names=class_names))
print(confusion_matrix(y_pred, y_true))

***** Running Prediction *****
  Num examples = 3000
  Batch size = 8


              precision    recall  f1-score   support

      Satire       0.95      0.90      0.92       796
        Hoax       0.53      0.98      0.69       406
  Propaganda       0.17      0.28      0.21       469
    Reliable       1.00      0.56      0.72      1329

    accuracy                           0.66      3000
   macro avg       0.66      0.68      0.64      3000
weighted avg       0.79      0.66      0.69      3000

[[715  12  67   2]
 [  7 399   0   0]
 [  8 332 129   0]
 [ 20   7 554 748]]
