In [None]:
# Mount Google Drive into the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder; the dataset and
# checkpoint paths defined later are built from os.getcwd().
%cd /content/drive/MyDrive/News_Classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/News_Classification


In [None]:
!pip install torch
!pip install transformers



In [None]:
import os
import time
import json
import logging

import torch
from torch import nn 
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import pipeline
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
# Use the first CUDA GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Training hyperparameters shared by the cells below.
epochs = 50  # number of passes over the training DataLoader
batch_size = 1  # samples per batch for both the train and the test DataLoader
learning_rate = 1e-5  # learning rate handed to the AdamW optimizer
max_token_len = 1024  # maximum tokenized sequence length (same value as CovidNewsDataset's default)

In [None]:
# All project paths hang off the current working directory.
_project_root = os.getcwd()

# Folder that receives the model checkpoints written during training.
checkpoint_path = os.path.join(_project_root, "checkpoints")

# Folder holding the pre-processed train / test JSON files.
dataset_path = os.path.join(_project_root, 'dataset')
train_data_path, test_data_path = (
    os.path.join(dataset_path, file_name)
    for file_name in ('processed_train.json', 'processed_test.json')
)

In [None]:
# Reference from: https://github.com/yinwenpeng/BenchmarkingZeroShot/blob/master/src/train_yahoo.py

# Maps every policy category (the 'choice' field of a data record) to three
# natural-language hypotheses. CovidNewsDataset pairs a news text (premise)
# with these hypotheses to build entailment / contradiction training samples.
#
# NOTE(review): a few hypotheses contain typos ("internel", "compaigns",
# "policy that do not fit"). They are deliberately left untouched here because
# these strings are fed to the tokenizer as model input, so editing them
# changes what the model is trained and evaluated on.
choice_to_hypothesis = {
    "Containment and Closure Policies": [
        'it is related with containment and closure policy from governments in the pandemic',
        'this news is talking about a government coronavirus policy for containment and closure',
        'this text describes a government policy about school closing, workspace closing, public event cancellation, restrictions on gatherings, public transport closing, stay at home requirement, restrictions on internel movement, and international travel control in the pandemic.'
    ],
    "Economic Policies": [
        'it is related with economic policy from governments in the pandemic', 
        'this news is talking about a government coronavirus policy for economy', 
        'this text describes a government policy about income support, debt or contract relief, fiscal measurements, and international support in the pandemic.'
    ],
    "Health System Policies": [
        'it is related with health system policy from governments in the pandemic', 
        'this news is talking about a government coronavirus policy for health system', 
        'this text describes a government policy about public health compaigns, testing policy, contact tracing, emergency investment in health care, investment in vaccines, facial coverings, and vaccination policy in the pandemic.'
    ],
    "Miscellaneous Policies": [
        'it is related with miscellaneous policy from governments in the pandemic', 
        'this news is talking about a government policy irrelevant to coronavirus', 
        'this text describes a government policy that do not fit anywhere else in the pandemic.'
    ]
}


# Load the pretrained Model

In [None]:
# Zero-shot classification pipeline backed by the BART-large MNLI checkpoint;
# its tokenizer and underlying model are reused by the cells that follow.
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartModel: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification m

# Prepare Data

In [None]:
class CovidNewsData:
    """One premise/hypothesis pair together with its record id and NLI label.

    Attributes:
        id: Identifier of the source record the pair was built from.
        premise: Text the hypothesis is judged against.
        hypothesis: Candidate statement about the premise.
        label: Name of the NLI class assigned to the pair.
    """

    def __init__(self, id, premise=None, hypothesis=None, label=None):
        self.id, self.premise = id, premise
        self.hypothesis, self.label = hypothesis, label
        

In [None]:
class CovidNewsDataset(Dataset):
    """
    Premise/hypothesis pairs for fine-tuning an NLI model on policy news.

    Every record of the JSON file is expanded into several samples: each
    hypothesis of the record's own choice is labelled "Entailment", and one
    randomly drawn hypothesis of every other choice is labelled "Contradiction".

    Reference from https://huggingface.co/transformers/_modules/transformers/pipelines/zero_shot_classification.html#ZeroShotClassificationPipeline.__call__
    """
    def __init__(
        self,
        data_path=None,
        *,
        tokenizer=None,
        choice_to_hypothesis=None,
        target_text='summary',
        max_token_len=1024,
        device="cpu",
        transform=None,
    ):
        """
        Args:
            data_path (str): The full path of the dataset (a JSON list of records
                with the keys 'id', 'choice' and the selected target_text). Required.
            tokenizer: The model's tokenizer. Must expose `model_max_length`. Required.
            choice_to_hypothesis (dict): Maps each choice to its list of hypotheses. Required.
            target_text (str): Either use 'summary' or 'article' as inputs of the model. Default: 'summary'.
            max_token_len (int): Requested sequence length; capped at `tokenizer.model_max_length`.
            device: Device every returned tensor is placed on. Default: "cpu".
            transform (callable, optional): Applied to each sample dict before it is returned.

        Raises:
            AssertionError: If a required argument is missing or has an invalid value.
        """
        cls_name = self.__class__.__name__
        assert data_path is not None, f"[{cls_name}] Please specify a data path."
        assert tokenizer is not None, f"[{cls_name}] Please give a tokenizer."
        assert isinstance(choice_to_hypothesis, dict), f"[{cls_name}] Please give a dictionary for choices to hypothesis."
        assert target_text in ['summary', 'article'], f"[{cls_name}] Please pick a target_text from either 'summary' or 'article'."
        self.data_path = data_path
        self.target_text = target_text
        self.tokenizer = tokenizer
        self.choice_to_hypothesis = choice_to_hypothesis
        self.class_to_id = {
            "Contradiction": 0,
            "Neutral": 1,
            "Entailment": 2,
        }
        # Never ask the tokenizer for more tokens than the model supports.
        self.max_token_len = min(max_token_len, tokenizer.model_max_length)
        self.device = device
        self.transform = transform

        # Init data
        self.data_list = self._get_data()

    def __len__(self):
        """Return the number of premise/hypothesis pairs."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Tokenize the pair at `idx` and return it as a dict with 'id', 'inputs' and 'label'."""
        if torch.is_tensor(idx):
            idx = idx.item()

        data = self.data_list[idx]
        encoding = self.tokenizer(
            self._get_sentence_pair(data),
            return_tensors='pt',
            padding='max_length',
            add_special_tokens=True,
            truncation='only_first',  # prevent from truncating the hypothesis
            max_length=self.max_token_len,
        )
        # The tokenizer receives a single pair, so each tensor carries a leading
        # batch dimension of size 1. Drop exactly that dimension (and nothing
        # else) so the DataLoader can stack samples into (batch, seq_len).
        inputs = {name: tensor.squeeze(0) for name, tensor in encoding.items()}

        sample = {
            'id': data.id,
            'inputs': self._ensure_tensor_on_device(**inputs),
            # Use the dataset's own device rather than a notebook-level global.
            'label': torch.tensor(self.class_to_id[data.label], dtype=torch.long, device=self.device),
        }

        if self.transform is not None:
            sample = self.transform(sample)

        return sample

    def _get_sentence_pair(self, data):
        """Wrap premise and hypothesis as a one-element batch of sentence pairs."""
        return [[data.premise, data.hypothesis]]

    def _ensure_tensor_on_device(self, **inputs):
        """Return a dict with every given tensor moved to `self.device`."""
        return {name: tensor.to(self.device) for name, tensor in inputs.items()}

    def _get_data(self):
        """Read the JSON file and expand every record into labelled pairs."""
        with open(self.data_path, encoding="utf-8") as f:
            raw_data_list = json.load(f)

        data_list = []
        for data in raw_data_list:
            current_choice = data['choice']
            sample_id = data['id']
            premise = data[self.target_text]

            # Positive samples: every hypothesis of the record's own choice.
            for hypothesis in self.choice_to_hypothesis[current_choice]:
                data_list.append(CovidNewsData(
                    id=sample_id,
                    premise=premise,
                    hypothesis=hypothesis,
                    label="Entailment",
                ))

            # Negative samples: one random hypothesis from each other choice.
            for other_choice, candidates in self.choice_to_hypothesis.items():
                if other_choice == current_choice or not candidates:
                    continue

                # Size the permutation by the actual number of candidates
                # instead of assuming exactly three hypotheses per choice.
                rand_idx = torch.randperm(len(candidates))[0].item()
                data_list.append(CovidNewsData(
                    id=sample_id,
                    premise=premise,
                    hypothesis=candidates[rand_idx],
                    label="Contradiction",  # Contradiction or Not Entailment
                ))

        return data_list


In [None]:
# Training split: premise/hypothesis pairs built from the processed train file.
# The max_token_len hyperparameter is passed explicitly so that changing it in
# the configuration cell actually affects tokenization.
train_dataset = CovidNewsDataset(
    train_data_path,
    tokenizer=classifier.tokenizer,
    choice_to_hypothesis=choice_to_hypothesis,
    max_token_len=max_token_len,
    device=device,
)
# Shuffle so every epoch visits the pairs in a different order.
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Test split: premise/hypothesis pairs built from the processed test file.
# The max_token_len hyperparameter is passed explicitly so that changing it in
# the configuration cell actually affects tokenization.
# NOTE(review): the dataset draws its negative hypotheses at random when it is
# constructed, so the test pairs differ between runs unless torch is seeded.
test_dataset = CovidNewsDataset(
    test_data_path,
    tokenizer=classifier.tokenizer,
    choice_to_hypothesis=choice_to_hypothesis,
    max_token_len=max_token_len,
    device=device,
)
# Keep a fixed order for evaluation.
test_data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Report how many premise/hypothesis pairs each split contains.
print(
    f"Length of the training dataset: {len(train_dataset)}",
    f"Length of the testing dataset: {len(test_dataset)}",
    sep="\n",
)

Length of the training dataset: 720
Length of the testing dataset: 2616


# Prepare for training

In [None]:
log_interval = 32  # unit: step
# Ask the DataLoader for the real number of batches per epoch. The previous
# `len(train_dataset)//batch_size + 1` counted one step too many whenever the
# dataset size is an exact multiple of the batch size.
num_steps_per_epoch = len(train_data_loader)
store_interval = num_steps_per_epoch//2 + 1  # checkpoint roughly twice per epoch
num_train_steps = epochs*num_steps_per_epoch  # total optimizer steps, used by the LR scheduler
num_warmup_steps = 1*num_steps_per_epoch  # 1 epoch

In [None]:
# Name fragments of parameters that must not receive weight decay.
# BART names its normalisation modules `*_layer_norm` / `layernorm_embedding`
# (see the printed model), so the BERT-style 'LayerNorm.*' patterns alone never
# match and would leave the layer-norm weights in the decayed group.
no_decay = [
    'bias',
    'LayerNorm.weight',
    'LayerNorm.bias',
    'layer_norm.weight',
    'layernorm_embedding.weight',
]

# Split the parameters in a single pass over the model.
decay_params, no_decay_params = [], []
for n, p in classifier.model.named_parameters():
    if any(nd in n for nd in no_decay):
        no_decay_params.append(p)
    else:
        decay_params.append(p)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]
# NOTE(review): transformers' AdamW has been deprecated upstream in favour of
# torch.optim.AdamW; confirm against the installed transformers version.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [None]:
# Learning-rate schedule: linear warmup for num_warmup_steps, then linear decay
# until num_train_steps is reached.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps,
)

In [None]:
# Cross-entropy loss; the training loop applies it to the model's output logits
# and the integer class ids produced by CovidNewsDataset.
criterion = nn.CrossEntropyLoss()

In [None]:
# TensorBoard writer; the training loop logs the per-step training loss to it.
writer = SummaryWriter()

In [None]:
# Move the pipeline's underlying model to the selected device before training.
# As the last expression of the cell, the returned model is also displayed.
classifier.model.to(device)

BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_

# Training

In [None]:
# Fine-tune the model: one optimizer step and one scheduler step per batch.
# The running loss is printed every `log_interval` batches and a checkpoint is
# written every `store_interval` global steps.

# Real number of batches per epoch, so the global step counter stays contiguous
# across epochs.
steps_per_epoch = len(train_data_loader)

for epoch in range(epochs):
    classifier.model.train()
    losses = 0.
    num_trained_seq = 0
    start_time = time.time()

    for batch_idx, batch in enumerate(train_data_loader):
        optimizer.zero_grad()

        inputs = batch['inputs']
        label = batch['label']

        outputs = classifier.model(**inputs)
        loss = criterion(outputs.logits, label)

        loss.backward()
        optimizer.step()

        # Detach the scalar once; logging must not keep the autograd graph alive.
        loss_value = loss.item()

        current_step = epoch*steps_per_epoch + (batch_idx + 1)
        writer.add_scalar("Loss/train", loss_value, current_step)

        if current_step % store_interval == 0:
            # save_pretrained() is given this path as the save location for the checkpoint.
            checkpoint_full_path = os.path.join(checkpoint_path, f"checkpoint_{current_step}.bin")
            classifier.model.save_pretrained(checkpoint_full_path)

        # `batch` is a dict, so len(batch) would be its number of keys; the
        # number of sequences in the batch is the length of the label tensor.
        current_batch_size = label.size(0)
        num_trained_seq += current_batch_size
        losses += current_batch_size*loss_value

        if (batch_idx + 1) % log_interval == 0:
            current_loss = losses / num_trained_seq
            elapsed = time.time() - start_time
            print('epoch: {:3d} | step: {:5d} | batch: {:5d} | lr: {:5.6f} | ms/batch: {:5.2f} | loss: {:5.3f}'.format(
                epoch,
                current_step,
                (batch_idx + 1),
                optimizer.param_groups[0]['lr'],
                elapsed * 1000 / log_interval,
                current_loss
            ))

            # Start a fresh averaging window.
            losses = 0.
            num_trained_seq = 0
            start_time = time.time()

        # Advance the learning-rate schedule once per optimizer step.
        scheduler.step()

writer.flush()
writer.close()

epoch:   0 | step:    32 | batch:    32 | lr: 0.000000 | ms/batch: 1044.04 | loss: 3.116
epoch:   0 | step:    64 | batch:    64 | lr: 0.000001 | ms/batch: 1049.02 | loss: 1.399
epoch:   0 | step:    96 | batch:    96 | lr: 0.000001 | ms/batch: 1023.86 | loss: 0.565
epoch:   0 | step:   128 | batch:   128 | lr: 0.000002 | ms/batch: 1037.67 | loss: 0.982
epoch:   0 | step:   160 | batch:   160 | lr: 0.000002 | ms/batch: 1034.52 | loss: 0.703
epoch:   0 | step:   192 | batch:   192 | lr: 0.000003 | ms/batch: 1029.82 | loss: 0.762
epoch:   0 | step:   224 | batch:   224 | lr: 0.000003 | ms/batch: 1032.81 | loss: 0.808
epoch:   0 | step:   256 | batch:   256 | lr: 0.000004 | ms/batch: 1035.05 | loss: 0.743
epoch:   0 | step:   288 | batch:   288 | lr: 0.000004 | ms/batch: 1033.07 | loss: 0.898
epoch:   0 | step:   320 | batch:   320 | lr: 0.000004 | ms/batch: 1034.50 | loss: 0.763
epoch:   0 | step:   352 | batch:   352 | lr: 0.000005 | ms/batch: 1034.14 | loss: 0.721
epoch:   0 | step:   