# Importing Libraries and Dataset

In [1]:
%%capture
!pip install transformers
!pip install pytorch-lightning


from google.colab import drive
import re


from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import math

import gc

gc.collect()

torch.cuda.empty_cache()

In [2]:
drive.mount('/content/drive')
articles_path = '(/content/drive/MyDrive/combined_data.csv)'
labeled_articles = pd.read_csv(articles_path)

Mounted at /content/drive


FileNotFoundError: ignored

# EDA and Data Pre-Processing

In [None]:
## gut check - should be about 37k articles and 3 columns

labeled_articles = labeled_articles[['content_original', 'bias', 'title', 'source']]

# b = labeled_articles.tail()
# b.to_csv("pre-cleaning.csv", encoding='utf-8', index=False)

def clean_column(df, column_name):
    terms_to_remove = {
        'source': ["Web News", "Online", "Opinion", "News"],
        'content_original': ["JUST WATCHED", "MUST WATCH", "Replay More Videos ...", "Advertisement:", "SHARE THIS ARTICLE",
                             "LISTEN TO ARTICLE", "Share Tweet Post Email", "Story Continued Below", "ADVERTISEMENT"]
    }


    if column_name not in terms_to_remove:
        return df


    for term in terms_to_remove[column_name]:
        df[column_name] = df[column_name].str.replace(f"(?i){re.escape(term)}", "", regex=True)

    if column_name == 'source':
        df[column_name] = df[column_name].str.replace(" - ", " ", regex=False)
        df[column_name] = df[column_name].str.replace("-", " ", regex=False)

    df[column_name] = df[column_name].str.strip()

    return df

## CLEANING DATA
labeled_articles = clean_column(labeled_articles, 'source')
labeled_articles = clean_column(labeled_articles, 'content_original')
labeled_articles['content_original'] = labeled_articles['title'] + ". " + labeled_articles['content_original']
labeled_articles['content_original'] = labeled_articles.apply(lambda row: row['content_original'].replace(row['source'], ''), axis=1)



# a = labeled_articles.tail()
# a.to_csv("post-cleaning.csv", encoding='utf-8', index=False)


labeled_articles = labeled_articles[['content_original', 'bias']]

print(f"shape: {labeled_articles.shape}")
print(f"columns: {list(labeled_articles.columns)}")

## relevant columns: source, bias, content, title

print(f"counts:\n\n{labeled_articles['bias'].value_counts()}\n")
print(f"percentages:\n\n{labeled_articles['bias'].value_counts(normalize=True)}\n")


# base model: always predict conservative - most common class: should have 36.5% accuracy (easy to beat)

shape: (37554, 2)
columns: ['content_original', 'bias']
counts:

2    13734
0    13005
1    10815
Name: bias, dtype: int64

percentages:

2    0.365713
0    0.346301
1    0.287985
Name: bias, dtype: float64



Big problem: RoBERTa can only take in so many tokens and the average political article is way too big for the encoder. This is where we could do more pre-processing (i.e. breaking down the article to its "essential components).

Split articles into train, validation, test, remove souc

In [None]:
## split into training, validation, and test - APPEND title and REMOVE source


NUM_NOT_TEST = 35500
NUM_TEST = 1500

NUM_TRAIN = 34000
NUM_VAL = 1500

not_test_df, test_df = train_test_split(labeled_articles, train_size=NUM_NOT_TEST, test_size=NUM_TEST, random_state=42)
train_df, val_df = train_test_split(not_test_df, train_size=NUM_TRAIN, test_size=NUM_VAL, random_state = 38)

print(f"shape train: {train_df.shape}")
print(f"shape val: {val_df.shape}")
print(f"shape test: {test_df.shape}")

print(f"columns: {list(train_df.columns)}")

print(f"training counts:\n\n{train_df['bias'].value_counts()}\n")
print(f"training percents:\n\n{train_df['bias'].value_counts(normalize=True)}\n")

print(f"validation counts:\n\n{val_df['bias'].value_counts()}\n")
print(f"validation percents:\n\n{val_df['bias'].value_counts(normalize=True)}\n")

print(f"test counts:\n\n{test_df['bias'].value_counts()}\n")
print(f"test percents:\n\n{test_df['bias'].value_counts(normalize=True)}\n")

test_df.to_csv('/content/drive/MyDrive/test.csv', encoding='utf-8', index=False)
train_df.to_csv('/content/drive/MyDrive/train.csv', encoding='utf-8', index=False)
val_df.to_csv('/content/drive/MyDrive/val.csv', encoding='utf-8', index=False)

shape train: (34000, 2)
shape val: (1500, 2)
shape test: (1500, 2)
columns: ['content_original', 'bias']
training counts:

2    12430
0    11747
1     9823
Name: bias, dtype: int64

training percents:

2    0.365588
0    0.345500
1    0.288912
Name: bias, dtype: float64

validation counts:

2    540
0    503
1    457
Name: bias, dtype: int64

validation percents:

2    0.360000
0    0.335333
1    0.304667
Name: bias, dtype: float64

test counts:

2    566
0    550
1    384
Name: bias, dtype: int64

test percents:

2    0.377333
0    0.366667
1    0.256000
Name: bias, dtype: float64



# Dataset

In [None]:
class ArticleDataSet(Dataset):
  def __init__(self, article_df, label_name, tokenizer, max_token_len: int = 512):
    '''
    takes feature dataframe (article text + title) and the label dataframe (numeric political bias)
    training, validation, test splits must be done BEFORE this step
    '''
    self.article_df = article_df
    self.tokenizer = tokenizer
    self.label_name = label_name
    self.max_token_len = max_token_len



  def __len__(self):
    return len(self.article_df)

  def __getitem__(self, index):
    item = self.article_df.iloc[index]
    label = item['bias']

    text_content = str(item.content_original)


    ## vectorize / encode

    tokens = self.tokenizer.encode_plus(text_content,
                                        add_special_tokens=True,
                                        return_tensors='pt',
                                        truncation=True,
                                        max_length=self.max_token_len,
                                        padding='max_length',
                                        return_attention_mask=True)

    return {'input_ids': tokens.input_ids.flatten(),
            'attention_mask': tokens.attention_mask.flatten(),
            'label': label}


In [None]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
features = labeled_articles[['content_original']]
labels = 'bias'

train_ds = ArticleDataSet(train_df, labels, tokenizer)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
train_ds.__getitem__(0)

{'input_ids': tensor([    0,   347, 35953,  9565,  6691,    15,  1854,   900,    25,  3345,
            18,  5184, 26165,    81,  1444,     4,    96,    10,  5835,  4627,
             6,  1489,   692,   871,  5628, 13968,   685,    10,  7179,   900,
           296,   363,    11,  4402,     9,  5690,  1854, 16959, 19048,     4,
            85,    21,    10,  3159,  3002,    11,  1444,     6,   147,  2654,
          5118,    32,   341,     7,  1298,  7179,   323,    13,    49,   538,
          1093,   714,  5287,     4, 50118, 50118,  1708,    24,    21,    67,
            41,  7152,  4627,     7,     5,   382,     6,    61,    34,    13,
          1724, 11590,    15,  1444,     7,  1807,    30,    63,   831,   526,
             6,    31,  4035,     8,  3345,     7,     6,    55,   682,     6,
          7662,     4, 50118, 50118, 10980,     4,  5628,  3903,     5,  3002,
             6,    30, 31023,     7, 36745,  2834,     6,     9,    10,  2450,
            14,    74,    33, 21100,   

# Data Module

The purpose of a PyTorch data module is to separate data-related code from model-related code

In [None]:
class Article_Data_Module(pl.LightningDataModule):

  def __init__(self, train_ds, val_ds, test_ds, labels, batch_size: int = 16, max_token_len=512, model_name = 'roberta-base'):
    '''
    train_ds, val_ds, labels should all be data frames
    '''
    super().__init__()

    self.train_ds = train_ds
    self.val_ds = val_ds
    self.test_ds = test_ds
    self.labels = labels
    self.batch_size = batch_size
    self.max_token_len = max_token_len
    self.model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)



  def setup(self, stage=None):
    if stage in (None, "fit"):
      self.train_dataset = ArticleDataSet(self.train_ds, self.labels, self.tokenizer)
      self.val_dataset = ArticleDataSet(self.val_ds, self.labels, self.tokenizer)
    if stage in (None, "predict", "test"):
      self.test_dataset = ArticleDataSet(self.test_ds, self.labels, self.tokenizer)



  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size = self.batch_size, num_workers=4, shuffle=True)


  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)


  def predict_dataloader(self):
    return DataLoader(self.val_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)

In [None]:
article_dm = Article_Data_Module(train_df, val_df, test_df, labels)
# note - this is not the actual data module we use

In [None]:
article_dm.setup()

In [None]:
article_dl = article_dm.train_dataloader()



In [None]:
len(article_dl)

2125

# Model

In [None]:
class Article_Classifier(pl.LightningModule):
  def __init__(self, config: dict):
    super().__init__()
    self.config = config
    self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict = True)

    ## add in hidden layer and final layer

    self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
    torch.nn.init.xavier_uniform_(self.hidden.weight)
    self.hidden.bias.data.fill_(0)

    self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])
    torch.nn.init.xavier_uniform_(self.classifier.weight)
    self.classifier.bias.data.fill_(0)

    self.loss_func = nn.CrossEntropyLoss()
    self.dropout = nn.Dropout()


  def forward(self, input_ids, attention_mask, label=None):
    '''
    label is None during prediction
    '''

    # roberta model
    output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
    pooled_output = torch.mean(output.last_hidden_state, 1)

    # neural network classification layers

    pooled_output = self.hidden(pooled_output)
    pooled_output = self.dropout(pooled_output)
    pooled_output = F.relu(pooled_output)
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)



    loss = 0
    if label is not None:
      loss = self.loss_func(logits, label)
    return loss, logits

  def training_step(self, batch, batch_index):
    loss, logits = self(**batch)  # unpack - will call forward pass

    self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": logits, "labels": batch['label']}


  def validation_step(self, batch, batch_index):
    loss, logits = self(**batch)

    self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
    return {"val_loss": loss, "predictions": logits, "labels": batch['label']}


  def predict_step(self, batch, batch_index):
    _, logits = self(**batch)
    return logits

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])
    total_steps = self.config['train_size'] / self.config['batch_size']
    warmup_steps = math.floor(total_steps * self.config['warmup'])
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    return [optimizer], [scheduler]

  def on_save_checkpoint(self, checkpoint):
    print("Saving checkpoint!\n")

    super().on_save_checkpoint(checkpoint)

In [None]:
# could just use regular roberta
config = {
  'model_name' : 'distilroberta-base',
  'n_labels': 3,
  'batch_size': 32,
  'lr': 1.5e-6,
  'warmup': 0.2,
  'train_size': len(article_dl),
  'w_decay': 0.001,
  'n_epochs': 15
}

model = Article_Classifier(config)

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

In [None]:
## test single input

idx = 0
input_ids = train_ds.__getitem__(idx)['input_ids']
am = train_ds.__getitem__(idx)['attention_mask']
lbl_value = train_ds.__getitem__(idx)['label'].item()
lbl_tensor = torch.tensor([lbl_value], dtype=torch.long)

loss, output = model(input_ids.unsqueeze(dim=0), am.unsqueeze(dim=0), lbl_tensor)
print(loss)

tensor(0.7829, grad_fn=<NllLossBackward0>)


# Training

In [None]:
# data module
from pytorch_lightning.callbacks import ModelCheckpoint

## checkpointing due to connectivity concerns
checkpoint_callback = ModelCheckpoint(
    dirpath='/content/drive/MyDrive',
    filename='{epoch}-{val_loss:.2f}',
    monitor='val_loss',
    save_top_k=1,
    every_n_epochs=1,
    mode='min'
)

article_dm = Article_Data_Module(train_df, val_df, test_df, labels, batch_size = config['batch_size'])

article_dm.setup()


# model

model = Article_Classifier(config)


# train

trainer = pl.Trainer(max_epochs=config['n_epochs'], accelerator="auto", num_sanity_val_steps=4, callbacks=[checkpoint_callback])

trainer.fit(model, article_dm)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/drive/MyDrive exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name             | Type             | Params
------------------------------------------------------
0 | pretrained_model | RobertaModel     | 82.1 M
1 | hidden           | Linear           | 590 K 
2 | classifier       | Linear           | 2.3 K 
3 | loss_func        | CrossEntropyLoss | 0     
4 | dropout          | Dropout          | 0     
----------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Saving checkpoint!



INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=15` reached.


In [None]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs/

<IPython.core.display.Javascript object>

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/model_final.ckpt")