In [1]:
import os
os.environ["WANDB_MODE"] = "dryrun"

In [2]:
from config import CONFIG
from training.module import ClassificationModule
from pytorch_lightning.loggers import WandbLogger
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import pytorch_lightning as pl
from transformers import AutoTokenizer
import pandas as pd

In [3]:
# TODO: generalize to [String] -> [Class] system 
# TODO: generalize forward pass
# TODO: set parameter that enables cls token utilization or arbitrary hidden layer utilization
# TODO: (MAYBE) generalize models to extend BASE, otherwise add 
from config import CONFIG

from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch import nn

class BaseClassificationModel(nn.Module):
    
    def __init__(self, dropout = 0.05, n_classes = 2, injection = False):
        super(BaseClassificationModel, self).__init__()
        
        
        # model body
        self.model = AutoModel.from_pretrained(CONFIG.pretrained_model_name)
        
        self.hidden_size = self.model.config.hidden_size #768
        
        # (standard) model classification head
        self.head = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(self.hidden_size, n_classes),
            nn.Softmax(dim=1)
        )
        
        # initialize weights in linear layers
        self.init_weights(self.head)
        
        
    def init_weights(self, module):
        for layer in module:
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean = 0.0, std = 0.02)
                if layer.bias is not None:
                    layer.bias.data.zero_()
                    
    
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        
        output = self.model(input_ids = input_ids, 
                       token_type_ids = token_type_ids,
                       attention_mask = attention_mask,
                       output_hidden_states = True)
        
        # last hidden state of all tokens
        last_hidden_state = output.last_hidden_state
        
        return last_hidden_state[:, 0, :]  # Returns the hidden state of the first token in the sequence

    def predict(self, input_ids, attention_mask=None, token_type_ids=None):
        
        cls_hidden_state = self.forward(input_ids, attention_mask, token_type_ids)
        
        return self.head(cls_hidden_state)  # Returns the predicted classes

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
# Program
torch.manual_seed(0)

class RelationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, index):
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )


        if 'label' in self.data.columns:
            
            label = torch.tensor(0 if self.data["label"].iloc[index] == "Attack" else 1, dtype=torch.int64)
            
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
         }
            
        else:
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
               

    def __len__(self):
        return len(self.data)

In [5]:
def create_data_loader(mode: str, tokenizer, shuffle=False):
    
    df = pd.read_pickle("../data/microtext_references.pickle")
    split = df[df['mode'] == mode]

    if mode == "test":
        split = split[['premise','claim']]
        #split = split[(split['mode'] == 'test') & (split['label'] != 'Rephrase')] #This is for kialo data set  
        
    return DataLoader(
        RelationDataset(split, tokenizer),
        batch_size = CONFIG.batch_size if mode == "train" else CONFIG.batch_size // 4,
        shuffle=shuffle, num_workers = CONFIG.num_workers
    )
    
    
#Create the datasets
tokenizer = AutoTokenizer.from_pretrained(CONFIG.pretrained_model_name)
#tokenizer = BertTokenizerFast.from_pretrained('nghuyong/ernie-2.0-en')
train_dataloader = create_data_loader("train", tokenizer, True)
val_dataloader = create_data_loader("validate", tokenizer, False)

In [6]:
from config import CONFIG

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from typing import List
import pandas

import pytorch_lightning as pl
import torchmetrics 
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup


class ClassificationModule(pl.LightningModule):
    
    def __init__(self, model):
        super().__init__()

        self.model = model
        
        self.loss = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task="binary")

    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    
    def step(self, batch, batch_idx, mode):
        input_ids = batch["input_ids"]
        token_type_ids = batch["token_type_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        #x, y = batch
        #logits = self.forward(input_ids, attention_mask, token_type_ids )
        logits = self.model.predict(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        predictions = logits.argmax(dim = 1)
        
        loss = self.loss(logits, labels)
        accuracy = self.accuracy(predictions, labels)

        self.log(f'{mode}_loss', loss, on_epoch=True, prog_bar=True)
        self.log(f'{mode}_accuracy', accuracy, on_epoch=True, prog_bar=True)

        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, 'train')
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, 'val')

    def test_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, 'test')

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        
        input_ids = batch["input_ids"]
        token_type_ids = batch["token_type_ids"]
        attention_mask = batch["attention_mask"]
        #logits = self(input_ids, attention_mask, token_type_ids)
        logits = self.model.predict(input_ids, attention_mask, token_type_ids)
        predictions = logits.argmax(dim=-1)
        
        return predictions

        
    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=CONFIG.learning_rate)
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=CONFIG.warmup_steps,
            num_training_steps=len(self.train_dataloader().dataset) // CONFIG.batch_size * CONFIG.epochs,
        )
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]

    def create_data_loader(self, mode: str, shuffle=False):       
        df = pd.read_pickle("../data/microtext_references.pickle")
        split = df[df['mode'] == mode]
        
        tokenizer = AutoTokenizer.from_pretrained(CONFIG.pretrained_model_name)
            
        return DataLoader(
            RelationDataset(split, tokenizer),
            batch_size = CONFIG.batch_size if mode == "train" else CONFIG.batch_size // 4,
            shuffle=shuffle, num_workers = CONFIG.num_workers
        )
    
    def train_dataloader(self):
        return self.create_data_loader(mode = "train", shuffle=True)

    def val_dataloader(self):
        return self.create_data_loader(mode = "validate")

    def test_dataloader(self):
        return self.create_data_loader(mode = "test")
        


In [11]:
model = BaseClassificationModel()

module = ClassificationModule(model)

trainer = pl.Trainer(accelerator = "gpu", 
                    devices = CONFIG.device_number, 
                    max_epochs=CONFIG.epochs) 

trainer.fit(module)
#trainer.fit(model, train_dataloader, val_dataloader)

trainer.test(module)
# Save the model
#trainer.save_checkpoint("./trained_model/base_model.ckpt")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES:

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.5124342441558838, 'test_accuracy': 0.795918345451355}]

In [12]:
from sklearn.metrics import classification_report

mapping = {'Attack':0, 'Support':1}
df = pd.read_pickle("../data/microtext_references.pickle")
split = df[df['mode'] == 'test']
true_labels = split['label'].map(mapping)
#split = df[(df['mode'] == 'test') & (df['label'] != 'Rephrase')] #Kialo data set
#true_labels = split['label'].map(mapping) #kialo data set

test_dataloader = create_data_loader("test", tokenizer, False) #RelationDataset(split[['premise','claim']], tokenizer)

# Make predictions as before
raw_pred = trainer.predict(module, test_dataloader)

# Concatenate the batched outputs into a single list
preds = torch.cat([torch.tensor(x) for x in raw_pred])

# Move tensor to CPU and convert to numpy
preds = preds.cpu().numpy()

# Print classification report
report = classification_report(true_labels, preds)
print(report)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

              precision    recall  f1-score   support

           0       0.78      0.47      0.58        15
           1       0.80      0.94      0.86        34

    accuracy                           0.80        49
   macro avg       0.79      0.70      0.72        49
weighted avg       0.79      0.80      0.78        49



  preds = torch.cat([torch.tensor(x) for x in raw_pred])


In [16]:
base_model_checkpoint = "./trained_model/base_model.ckpt"



In [32]:
from sklearn.metrics import classification_report

mapping = {'Attack':0, 'Support':1}

# Load the test dataset
df = pd.read_pickle("../data/kialo_references.pickle")
split = df[(df['mode'] == 'test') & (df['label'] != 'Rephrase')]
true_labels = split['label'].map(mapping)
test_dataset = RelationDataset(split[['premise', 'claim']], tokenizer)

test_dataloader = create_data_loader("test", tokenizer, False) #RelationDataset(split[['premise','claim']], tokenizer)

# Make predictions as before
raw_pred = trainer.predict(module, test_dataloader)

# Concatenate the batched outputs into a single list
preds = torch.cat([torch.tensor(x) for x in raw_pred])

# Move tensor to CPU and convert to numpy
preds = preds.cpu().numpy()

# Print classification report
report = classification_report(true_labels, preds)
print(report)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

  preds = torch.cat([torch.tensor(x) for x in raw_pred])


In [17]:
# Save the model
trainer.save_checkpoint("./trained_model/base_model.ckpt")

In [9]:
# TODO: generalize to [String] -> [Class] system 
# TODO: generalize forward pass
# TODO: set parameter that enables cls token utilization or arbitrary hidden layer utilization
# TODO: (MAYBE) generalize models to extend BASE, otherwise add 
from config import CONFIG

from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch import nn

class BaseClassificationModel(nn.Module):
    
    def __init__(self, dropout = 0.05, n_classes = 2, injection = False):
        super(BaseClassificationModel, self).__init__()
        
        
        # model body
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        
        self.hidden_size = self.model.config.hidden_size #768
        
        # (standard) model classification head
        self.head = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(self.hidden_size, n_classes),
            nn.Softmax(dim=1)
        )
        
        # initialize weights in linear layers
        self.init_weights(self.head)
        
        
    def init_weights(self, module):
        for layer in module:
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean = 0.0, std = 0.02)
                if layer.bias is not None:
                    layer.bias.data.zero_()
                    
    
    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        
        output = self.model(input_ids = input_ids, 
                       token_type_ids = token_type_ids,
                       attention_mask = attention_mask,
                       output_hidden_states = True)
        
        # last hidden state of all tokens
        last_hidden_state = output.last_hidden_state
        
        return last_hidden_state[:, 0, :]  # Returns the hidden state of the first token in the sequence

    def predict(self, input_ids, attention_mask=None, token_type_ids=None):
        
        cls_hidden_state = self.forward(input_ids, attention_mask, token_type_ids)
        
        return self.head(cls_hidden_state)  # Returns the predicted classes
        
import torch
from torch.utils.data import Dataset, DataLoader
# Program
torch.manual_seed(0)

class RelationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, index):
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )


        if 'label' in self.data.columns:
            
            label = torch.tensor(0 if self.data["label"].iloc[index] == "Attack" else 1, dtype=torch.int64)
            
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
         }
            
        else:
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
               

    def __len__(self):
        return len(self.data)
        
def create_data_loader(mode: str, tokenizer, shuffle=False):
    
    df = pd.read_pickle("../data/kialo_references.pickle")
    split = df[df['mode'] == mode]

    if mode == "test":
        #split = split[['premise','claim']]
        split = split[(split['mode'] == 'test') & (split['label'] != 'Rephrase')] #This is for kialo data set  
        
    return DataLoader(
        RelationDataset(split, tokenizer),
        batch_size = CONFIG.batch_size if mode == "train" else CONFIG.batch_size // 4,
        shuffle=shuffle, num_workers = CONFIG.num_workers
    )
    
    
#Create the datasets
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataloader = create_data_loader("train", tokenizer, True)
val_dataloader = create_data_loader("validate", tokenizer, False)

from config import CONFIG

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from typing import List
import pandas

import pytorch_lightning as pl
import torchmetrics 
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup


class ClassificationModule(pl.LightningModule):
    
    def __init__(self, model):
        super().__init__()

        self.model = model
        
        self.loss = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task="binary")

    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    
    def step(self, batch, batch_idx, mode):
        input_ids = batch["input_ids"]
        token_type_ids = batch["token_type_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        #x, y = batch
        logits = self.forward(input_ids, attention_mask, token_type_ids )

        predictions = logits.argmax(dim = 1)
        
        loss = self.loss(logits, labels)
        accuracy = self.accuracy(predictions, labels)

        self.log(f'{mode}_loss', loss, on_epoch=True, prog_bar=True)
        self.log(f'{mode}_accuracy', accuracy, on_epoch=True, prog_bar=True)

        return loss

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, 'train')
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, 'val')

    def test_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, 'test')

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        
        input_ids = batch["input_ids"]
        token_type_ids = batch["token_type_ids"]
        attention_mask = batch["attention_mask"]
        logits = self(input_ids, attention_mask, token_type_ids)
        predictions = logits.argmax(dim=-1)
        
        return predictions

        
    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=CONFIG.learning_rate)
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=CONFIG.warmup_steps,
            num_training_steps=len(self.train_dataloader().dataset) // CONFIG.batch_size * CONFIG.epochs,
        )
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]

    def create_data_loader(self, mode: str, shuffle=False):       
        df = pd.read_pickle("../data/kialo_references.pickle")
        split = df[df['mode'] == mode]
        
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            
        return DataLoader(
            RelationDataset(split, tokenizer),
            batch_size = CONFIG.batch_size if mode == "train" else CONFIG.batch_size // 4,
            shuffle=shuffle, num_workers = CONFIG.num_workers
        )
    
    def train_dataloader(self):
        return self.create_data_loader(mode = "train", shuffle=True)

    def val_dataloader(self):
        return self.create_data_loader(mode = "validate")

    def test_dataloader(self):
        return self.create_data_loader(mode = "test")
        
model = BaseClassificationModel()

module = ClassificationModule(model)

trainer = pl.Trainer(accelerator = "gpu", 
                    devices = 1, 
                    max_epochs=5) 

trainer.fit(module)
#trainer.fit(model, train_dataloader, val_dataloader)

trainer.test(module)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES:

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
