# Importing Libraries and Dataset

In [132]:
!pip install transformers
!pip install pytorch-lightning


import os
from google.colab import drive


from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pytorch_lightning as pl


import numpy as np
import pandas as pd


import math



In [None]:
drive.mount('/content/drive')
articles_path = '/content/drive/MyDrive/combined_data.csv'
labeled_articles = pd.read_csv(articles_path)

# EDA and Data Pre-Processing

In [133]:
## gut check - should be about 37k articles and 3 columns

labeled_articles = labeled_articles[['title', 'content_original', 'bias']]

print(f"shape: {labeled_articles.shape}")
print(f"columns: {list(labeled_articles.columns)}")

## relevant columns: source, bias, content, title

# labeled_articles_head = labeled_articles.head()
# labeled_articles_head.to_csv("head.csv", index=False)

print(f"counts:\n\n{labeled_articles['bias'].value_counts()}\n")
print(f"percentages:\n\n{labeled_articles['bias'].value_counts(normalize=True)}\n")


# base model: always predict conservative - most common class: should have 36.5% accuracy (easy to beat)

shape: (37554, 3)
columns: ['title', 'content_original', 'bias']
counts:

2    13734
0    13005
1    10815
Name: bias, dtype: int64

percentages:

2    0.365713
0    0.346301
1    0.287985
Name: bias, dtype: float64



Big problem: RoBERTa can only take in so many tokens and the average political article is way too big for the encoder. This is where we could do more pre-processing (i.e. breaking down the article to its "essential components).

also, this is where i would split the articles into training, validation, and test

# Dataset

In [100]:
class ArticleDataSet(Dataset):
  def __init__(self, article_df, label_df, tokenizer, max_token_len: int = 128):
    '''
    takes feature dataframe (article text + title) and the label dataframe (numeric political bias)
    training, validation, test splits must be done BEFORE this step
    '''

    self.article_df = article_df
    self.tokenizer = tokenizer
    self.label_df = label_df
    self.max_token_len = max_token_len


  def __len__(self):
    return len(self.article_df)

  def __getitem__(self, index):
    item = self.article_df.iloc[index]
    label = self.label_df.iloc[index]

    text_content = str(item.content_original)




    ## vectorize / encode
    tokens = self.tokenizer.encode_plus(text_content,
                                        add_special_tokens=True,
                                        return_tensors='pt',
                                        truncation=True,
                                        max_length=self.max_token_len,
                                        padding='max_length',
                                        return_attention_mask=True)

    return {'input_ids': tokens.input_ids.flatten(),
            'attention_mask': tokens.attention_mask.flatten(),
            'label': label}


In [102]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
features = labeled_articles[['content_original']]
labels = labeled_articles[['bias']]

train_ds = ArticleDataSet(features, labels, tokenizer)
## gut check
## train_ds.__getitem__(0)

In [103]:
len(train_ds)

37554

# Data Module

The purpose of a PyTorch data module is to separate data-related code from model-related code

In [105]:
class Article_Data_Module(pl.LightningDataModule):

  def __init__(self, train_ds, labels, val_ds=None, test_ds=None, batch_size: int = 16, max_token_len=128, model_name = 'roberta-base'):
    '''
    train_ds, val_ds, labels should all be data frames
    '''
    super().__init__()

    self.train_ds = train_ds
    self.val_ds = val_ds
    self.test_ds = test_ds
    self.labels = labels
    self.batch_size = batch_size
    self.max_token_len = max_token_len
    self.model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)



  def setup(self, stage=None):
    if stage in (None, "fit"):
      self.train_dataset = ArticleDataSet(self.train_ds, self.tokenizer, self.labels)
      if self.val_ds is not None:
        self.val_dataset = ArticleDataSet(self.val_ds, self.tokenizer, self.labels)
    if stage in (None, "predict", "test"):
      self.test_dataset = ArticleDataSet(self.test_ds, self.tokenizer, self.labels)


  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size = self.batch_size, num_workers=4, shuffle=True)

  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)

  def predict_dataloader(self):
    return DataLoader(self.val_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)

In [None]:
article_dm = Article_Data_Module(features, labels)

In [None]:
article_dm.setup()

In [None]:
article_dl = article_dm.train_dataloader()



In [None]:
len(article_dl)

2348

# Model

In [139]:
class Article_Classifier(pl.LightningModule):
  def __init__(self, config: dict):
    super().__init__()
    self.config = config
    self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict = True)

    ## add in hidden layer and final layer

    self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
    self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])

    torch.nn.init.xavier_uniform_(self.classifier.weight)
    torch.nn.init.xavier_uniform_(self.hidden.weight)
    self.loss_func = nn.CrossEntropyLoss()
    self.dropout = nn.Dropout()

  def forward(self, input_ids, attention_mask, label=None):
    '''
    label is None during prediction
    '''
    print(f"label: {label}")

    # roberta model
    output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
    pooled_output = torch.mean(output.last_hidden_state, 1)

    # neural network classification layers

    pooled_output = self.hidden(pooled_output)
    pooled_output = self.dropout(pooled_output)
    pooled_output = F.relu(pooled_output)

    logits = self.classifier(pooled_output)


    loss = 0
    if label is not None:
      loss = self.loss_func(logits, label)
      return loss, logits

    return logits

  def training_step(self, batch, batch_index):
    loss, logits = self(**batch)  # unpack - will call forward pass
    self.log("train loss", loss, prog_bar = True, logger = True)
    return {"loss": loss, "predictions": logits, "labels": batch['labels']}

  def validation_step(self, batch, batch_index):
    loss, logits = self(**batch)  # unpack - will call forward pass
    self.log("validation loss", loss, prog_bar = True, logger = True)
    return {"val_loss": loss, "predictions": logits, "labels": batch['labels']}

  def predict_step(self, batch, batch_index):
    _, logits = self(**batch)  # unpack - will call forward pass
    return logits

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])
    total_steps = self.config['train_size'] / self.config['batch_size']
    warmup_steps = math.floor(total_steps * self.config['warmup'])
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    return [optimizer], [scheduler]



In [140]:
# could just use regular roberta
config = {
  'model_name' : 'distilroberta-base',
  'n_labels': 3,
  'batch_size': 128,
  'lr': 1.5e-6,
  'warmup': 0.2,
  'train_size': len(article_dl),
  'w_decay': 0.001,
  'n_epochs': 100
}

model = Article_Classifier(config)

In [147]:
idx = 0
input_ids = train_ds.__getitem__(idx)['input_ids']
am = train_ds.__getitem__(idx)['attention_mask']
lbl_value = train_ds.__getitem__(idx)['label'].item()
lbl_tensor = torch.tensor([lbl_value], dtype=torch.long)
#print(f"label: {lbl_value}")

loss, output = model(input_ids.unsqueeze(dim=0), am.unsqueeze(dim=0), lbl_tensor)  # need to fix

label: tensor([0])


In [148]:
print(loss)
print(output)

tensor(0.7043, grad_fn=<NllLossBackward0>)
tensor([[1.1278, 0.4321, 0.4809]], grad_fn=<AddmmBackward0>)


# Training