In [None]:

!pip install transformers
!pip install datasets
!pip install bertviz

from IPython.display import clear_output
import time

from bertviz import head_view, model_view
clear_output()

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

# Optional one-off step (left disabled): copy the AdapterBERT directory from Drive into the
# working directory -- presumably so that the `AdapterBERT` imports in the next cell resolve.
# !cp -r /content/drive/MyDrive/ZahraD/BAMT/AdapterBERT .

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
from  AdapterBERT import trainer
from transformers import BertTokenizer, BertConfig, BertModel
from AdapterBERT.adapter import AdapterBertModel, AdapterBertForSequenceClassification, ParallelAdapterBertForSequenceClassification
import helper
import torch
import numpy as np
import logging
import torch
import os
from AdapterBERT.config import RunConfig, ParallelAdapterBertConfig, BottleneckAdapterBertConfig
import torch
import sys
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_metric
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
from seaborn import heatmap
from scipy.stats import spearmanr, pearsonr
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers.optimization import AdamW
import datasets
# Logger named after the current module; used below for the notebook's own progress messages.
logger = logging.getLogger(__name__)


# Setup logging: INFO level and above, each record prefixed with timestamp, level and logger name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

In [None]:
# Maps a metric's display name to the callable that computes it.
# Every callable is invoked as metric(all_labels, all_predictions) by _evaluate.
# The two correlation functions return more than one value; _evaluate keeps element [0]
# of their result for the 'stsb' task.
METRIC_NAME_TO_FUNCTION = {
    "MCC": matthews_corrcoef,
    "Accuracy": accuracy_score,
    "F1": f1_score,
    "Spearman": spearmanr,
    "Pearson": pearsonr,
}


# Maps a task name (as passed to datasets.load_dataset('glue', task_name)) to the names of
# the metrics reported for it. Names must be keys of METRIC_NAME_TO_FUNCTION.
# 'stsb' is the only task treated as regression by the training/evaluation code in this cell.
TASK_TO_METRICS = {
    "cola": ["MCC"],
    "mnli": ["Accuracy"],
    "mrpc": ["Accuracy", "F1"],
    "qnli": ["Accuracy"],
    "qqp": ["Accuracy", "F1"],
    "rte": ["Accuracy"],
    "sst2": ["Accuracy"],
    "stsb": ["Spearman", "Pearson"],
    "wnli": ["Accuracy"],
}




def train_and_evaluate(model, optimizer, data_loaders, num_epochs, task_name , output_path=None, evaluation_frequency=1):
    """Run the full training loop on GPU and collect evaluation metrics along the way.

    Args:
        model: the model to train; it is moved to the GPU with ``.cuda()`` first.
        optimizer: optimizer handed to ``_train`` for the parameter updates.
        data_loaders (dict): maps a split name to its data loader. ``'train'`` is used for
            training; every split whose name does not contain ``'test'`` is evaluated.
        num_epochs (int): number of epochs to perform.
        task_name (str): key into ``TASK_TO_METRICS`` selecting the metrics to record.
        output_path (str): accepted for interface compatibility; this function never reads it.
        evaluation_frequency (int): evaluation runs on every epoch whose index is a
            multiple of this value (epoch 0 included).

    Returns:
        dict: ``{split_name: {metric_name: [result, ...]}}`` with one entry per evaluation.
        Splits that are never evaluated (the test ones) keep empty lists.
    """
    model.cuda()

    # One empty history list per (split, metric) pair.
    evaluations = {}
    for split_name in data_loaders.keys():
        evaluations[split_name] = {metric_name: [] for metric_name in TASK_TO_METRICS[task_name]}

    for epoch in range(num_epochs):
        # training for a single epoch
        _train(model, optimizer, data_loaders['train'], epoch, task_name)

        # evaluation, only on the scheduled epochs
        if epoch % evaluation_frequency == 0:
            for split_name, loader in data_loaders.items():
                if 'test' in split_name:
                    continue
                scores = _evaluate(model, loader, split_name.upper(), task_name)
                for metric_name, score in scores.items():
                    evaluations[split_name][metric_name].append(score)
        print('')

    return evaluations
        




def _evaluate(model,  dataloader, dataloader_type, task_name):
    """Evaluates the model on the dataloader and prints a one-line summary.

    Batches are moved to the device the model's parameters live on, so the function works
    for a model on GPU (as set up by train_and_evaluate) as well as on CPU.

    Args:
        model: model whose forward pass accepts ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` and returns an object exposing ``.logits``.
        dataloader (torch.utils.data.DataLoader): yields
            ``(input_ids, attention_mask, token_type_ids, labels)`` batches.
        dataloader_type (str): label used in the printed summary (e.g. TRAIN/VALIDATION).
        task_name (str): key into ``TASK_TO_METRICS``; ``'stsb'`` is handled as regression,
            everything else as classification (argmax over the logits).

    Returns:
        (Dict[str, float]): dictionary that maps between metric_name and the metric result

    Raises:
        ValueError: if the dataloader yields no samples (nothing to score).
    """
    is_regression = task_name == 'stsb'

    # move to eval mode
    model.eval()
    device = next(model.parameters()).device

    all_predictions, all_labels = [], []
    for batch in dataloader:
        input_ids, attention_mask, token_type_ids, labels = (obj.to(device) for obj in batch)

        # forward pass
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask,
                           token_type_ids=token_type_ids).logits

        # flatten and move to cpu for aggregation
        labels = labels.view(-1).cpu().numpy()
        if is_regression:
            predictions = logits.view(-1).cpu().numpy()
        else:
            predictions = np.argmax(logits.cpu().numpy(), axis=1)

        # aggregate predictions and labels
        all_predictions.extend(list(predictions))
        all_labels.extend(list(labels))

    if not all_labels:
        raise ValueError(f'{dataloader_type} dataloader is empty; nothing to evaluate')

    # calculate the required metrics
    results = {}
    for metric_name in TASK_TO_METRICS[task_name]:
        metric = METRIC_NAME_TO_FUNCTION[metric_name]
        result = metric(all_labels, all_predictions)
        # the correlation functions return several values; the first one is the coefficient
        results[metric_name] = result[0] if is_regression else result

    if is_regression:
        # accuracy is undefined for regression, so report the task metrics instead
        for metric_name, result in results.items():
            print(f'{dataloader_type} {metric_name.upper()}: {round(float(result), 5)}')
    else:
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f'{dataloader_type} ACC: {round(accuracy, 5)}')
    print('')

    return results



def _train(model,optimizer, train_dataloader, epoch, task_name, max_grad_norm=1.0):
    """Trains the model for a single epoch

    Batches are moved to the device the model's parameters live on, so the function works
    for a model on GPU (as set up by train_and_evaluate) as well as on CPU.

    Args:
        model: model whose forward pass accepts ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` and returns an object exposing ``.logits``.
        optimizer: optimizer whose ``step()`` applies the update after every batch.
        train_dataloader (torch.utils.data.DataLoader): the train data loader, yielding
            ``(input_ids, attention_mask, token_type_ids, labels)`` batches.
        epoch (int): the epoch number (for logging)
        task_name (str): ``'stsb'`` trains with MSE loss on flattened logits, any other
            task with cross-entropy.
        max_grad_norm (float): the maximum gradient norm we allow. The norm is computed over all gradients together,
        as if they were concatenated into a single vector.
    """
    is_regression = task_name == 'stsb'

    # move to train mode
    model.train()
    device = next(model.parameters()).device

    # loss initialization
    criteria = torch.nn.MSELoss() if is_regression else torch.nn.CrossEntropyLoss()

    # drop gradients a previous (possibly interrupted) run may have left behind
    model.zero_grad()

    n = len(train_dataloader.dataset)
    trained_samples = 0
    loss_sum = 0.0
    for step, batch in enumerate(train_dataloader):
        # move batch data to the model's device
        input_ids, attention_mask, token_type_ids, labels = (obj.to(device) for obj in batch)

        # forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask,
                       token_type_ids=token_type_ids).logits

        # loss calculation
        labels = labels.view(-1)
        if is_regression:
            logits = logits.view(-1)
            # MSELoss requires targets of the same floating dtype as the predictions
            labels = labels.to(logits.dtype)
        loss = criteria(logits, labels)

        # backward pass (gradients calculation)
        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        # update parameters
        optimizer.step()
        model.zero_grad()

        # track train loss
        loss_sum += loss.item()
        trained_samples += len(labels)

        # printing training progress (running mean of the batch losses)
        print(f'EPOCH: {epoch}   TRAIN: {trained_samples}/{n}   LOSS: {round(loss_sum / (step + 1), 3)}\r', end='')
    print('')






def preprocess_dataset(datasets, batch_size, validation_batch_size=128):
    """Turn the train/validation/test splits of a tokenized dataset into data loaders.

    Args:
        datasets: mapping that provides the ``'train'``, ``'validation'`` and ``'test'``
            splits (a ``KeyError`` is raised if one is missing).
        batch_size (int): batch size for the train and test loaders.
        validation_batch_size (int): batch size for the validation loader.

    Returns:
        dict: ``{'train': ..., 'validation': ..., 'test': ...}`` mapping each split name to
        the loader built by ``_convert_dataset_to_data_loader``. Only the train loader
        samples randomly, and only the test loader is built without labels.
    """
    split_names = ('train', 'validation', 'test')
    # Look every split up first so a missing one fails before any conversion work is done.
    splits = {split_name: datasets[split_name] for split_name in split_names}

    data_loaders = dict()
    for split_name, split in splits.items():
        # Compare by value: `is` on a string literal only works by accident of interning.
        is_validation = split_name == 'validation'
        data_loaders[split_name] = _convert_dataset_to_data_loader(
            dataset=split,
            batch_size=validation_batch_size if is_validation else batch_size,
            random_sampler=split_name == 'train',
            test='test' in split_name,
        )

    return data_loaders





def _convert_dataset_to_data_loader(dataset, batch_size, random_sampler, test=False):
    """Build a torch DataLoader from tokenized samples.

    Args:
        dataset: iterable of samples, each indexable by field name (e.g. a tokenized
            datasets.arrow_dataset.Dataset). Every sample must provide ``input_ids``,
            ``attention_mask`` and ``token_type_ids``, plus ``label`` unless ``test`` is set.
        batch_size (int): number of samples per batch.
        random_sampler (bool): if True, DataLoader will sample randomly else sequentially.
        test (bool): if True, the ``label`` field is left out of the batches.

    Returns:
        (torch.utils.data.DataLoader): loader whose batches hold one tensor per field, in
        the order input_ids, attention_mask, token_type_ids[, label].
    """
    keys = ['input_ids', 'attention_mask', 'token_type_ids']
    if not test:
        keys.append('label')

    # Single pass over the dataset, keeping only the fields that go into the batches.
    rows = [[sample[key] for key in keys] for sample in dataset]

    # One tensor per field, stacked over all samples.
    tensors = [torch.tensor([row[position] for row in rows]) for position in range(len(keys))]

    tensor_dataset = TensorDataset(*tensors)
    if random_sampler:
        data_sampler = RandomSampler(tensor_dataset)
    else:
        data_sampler = SequentialSampler(tensor_dataset)
    return DataLoader(tensor_dataset, sampler=data_sampler, batch_size=batch_size)


In [None]:
def unfreez_bias(model, layer_to_unfreez):
    """Freeze the model except for adapters, the classifier head and selected biases.

    After the call a parameter has ``requires_grad=True`` only if its name
      * contains ``'pooler.dense.bias'``, or
      * contains ``'classifier'`` (a notice is printed for each such parameter), or
      * contains ``'adapter'``, or
      * contains ``'bias'`` and one of the selectors in ``layer_to_unfreez``.
    Every other parameter is frozen.

    Selectors are matched against whole dot-separated components of the parameter name,
    so ``'1'`` selects ``...layer.1....`` but neither ``...layer.10....`` nor
    ``...layer.11....``. Multi-component selectors such as ``'layer.7'`` work as well.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a torch.nn.Module).
        layer_to_unfreez: iterable of selectors (strings, or anything ``str()`` can render,
            such as integer layer indices).
    """
    # Wrap each selector in dots so it can only match complete name components.
    selectors = [f".{str(layer).strip('.')}." for layer in layer_to_unfreez]

    for name, param in model.named_parameters():
        if 'pooler.dense.bias' in name:
            param.requires_grad = True
        elif 'classifier' in name:
            print("in class layer")
            param.requires_grad = True
        else:
            delimited_name = f'.{name}.'
            selected_bias = 'bias' in name and any(selector in delimited_name for selector in selectors)
            param.requires_grad = 'adapter' in name or selected_bias
def total_trainable_parameters(model):
    """Print every trainable parameter with its shape, then the total element count.

    A parameter counts as trainable when ``requires_grad`` is True. The total is the sum of
    ``numel()`` over those parameters, so tensors of any rank are counted correctly.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a torch.nn.Module).

    Returns:
        int: the number of trainable scalar parameters (also printed).
    """
    total_trainable_params = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f'{name:<80}   --->  {param.shape}')
            total_trainable_params += param.numel()
    print(
        f'\n----------------------------------------\nNumber of Trainable Parameters: {total_trainable_params:>65}\n')
    return total_trainable_params


In [None]:


task_name = 'mrpc'

dataset = datasets.load_dataset('glue', task_name)
# Drop examples labelled -1 (presumably the placeholder for hidden test labels -- verify
# against the dataset card). Compare by value: `is not` on an int literal tests object
# identity and only appears to work because of small-integer caching.
dataset = dataset.filter(lambda example: example['label'] != -1)
clear_output()
# Regression ('stsb') has a single output; otherwise count the distinct validation labels.
num_labels = 1 if task_name=='stsb' else len(set(dataset['validation']['label']))
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:

def tokenizing(example):
  """Tokenize the ``sentence1``/``sentence2`` pair(s) held in ``example``.

  Uses the notebook-level ``tokenizer`` and ``max_sequence_length``: every encoding is
  padded to ``max_sequence_length`` and truncated with the "longest_first" strategy.
  Returns whatever the tokenizer call returns.
  """
  encoding_options = dict(
      padding='max_length',
      max_length=max_sequence_length,
      truncation="longest_first",
  )
  return tokenizer(example["sentence1"], example["sentence2"], **encoding_options)




c = RunConfig()


max_sequence_length = 128
# NOTE(review): `max_sequence_length` is forwarded to the tokenizer as an extra keyword;
# the actual padding/truncation length is enforced inside `tokenizing`.
tokenizer = BertTokenizer.from_pretrained(c.pretrained_transformer, max_sequence_length = max_sequence_length)
tokenized_dataset = dataset.map(tokenizing, batched=True, batch_size=64)


# Seed torch and numpy directly: `set_seed` is only defined in a later cell, so calling it
# here fails with NameError when the notebook is run top to bottom on a fresh kernel.
torch.manual_seed(0)
np.random.seed(0)
data_loaders = preprocess_dataset(tokenized_dataset, batch_size = 16)

clear_output()


In [None]:

# Encoder layers that receive adapters and whose bias terms are left trainable.
selected_layers = [7, 8]
# unfreez_bias matches layers against parameter names, so it needs them as strings.
selected_layers_string = list(map(str, selected_layers))

config = BottleneckAdapterBertConfig.from_pretrained(
    c.pretrained_transformer,
    layers_to_adapt=selected_layers,
    num_labels=num_labels,
    attention_outputs=True,
)
model = AdapterBertForSequenceClassification.from_pretrained(c.pretrained_transformer, config=config)
model.to(c.device)

# Freeze everything except adapters, classifier head, pooler bias and the selected layers' biases.
unfreez_bias(model, selected_layers_string)
logger.info("Training/evaluation starts...")


Some weights of the model checkpoint at bert-base-cased were not used when initializing AdapterBertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing AdapterBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AdapterBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AdapterBertForSequenceClassification were not initialized fr

in class layer
in class layer


In [None]:
# List every parameter of `model` that still has requires_grad=True, followed by the total count.
total_trainable_parameters(model)
def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Covers Python's ``random`` module (used for the RSA token sampling), NumPy's global
    generator and PyTorch (CPU and, when present, all CUDA devices).

    Args:
        seed (int): the seed value applied to all generators.
    """
    # Local import: `random` is not among the imports visible at the top of the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; makes GPU-side sampling reproducible as well.
    torch.cuda.manual_seed_all(seed)

bert.encoder.layer.7.attention.self.query.bias                                     --->  torch.Size([768])
bert.encoder.layer.7.attention.self.key.bias                                       --->  torch.Size([768])
bert.encoder.layer.7.attention.self.value.bias                                     --->  torch.Size([768])
bert.encoder.layer.7.attention.output.dense.bias                                   --->  torch.Size([768])
bert.encoder.layer.7.attention.output.LayerNorm.bias                               --->  torch.Size([768])
bert.encoder.layer.7.intermediate.dense.bias                                       --->  torch.Size([3072])
bert.encoder.layer.7.intermediate.intermediate_adapter.down_proj.weight            --->  torch.Size([64, 768])
bert.encoder.layer.7.intermediate.intermediate_adapter.down_proj.bias              --->  torch.Size([64])
bert.encoder.layer.7.intermediate.intermediate_adapter.up_proj.weight              --->  torch.Size([768, 64])
bert.encoder.layer.7.intermed

In [None]:


# The optimizer receives all model parameters, trainable or not.
opt = AdamW(params=model.parameters(), lr=5e-4, correct_bias=True)

# Re-seed right before training, then display the collected metric history as the cell output.
set_seed(0)
train_and_evaluate(
    model=model,
    optimizer=opt,
    data_loaders=data_loaders,
    num_epochs=8,
    task_name=task_name,
)



EPOCH: 0   TRAIN: 3668/3668   LOSS: 0.602
TRAIN ACC: 0.75845

VALIDATION ACC: 0.7598


EPOCH: 1   TRAIN: 3668/3668   LOSS: 0.474
TRAIN ACC: 0.82906

VALIDATION ACC: 0.80392


EPOCH: 2   TRAIN: 3668/3668   LOSS: 0.351
TRAIN ACC: 0.88659

VALIDATION ACC: 0.80882


EPOCH: 3   TRAIN: 3668/3668   LOSS: 0.298
TRAIN ACC: 0.92884

VALIDATION ACC: 0.83578


EPOCH: 4   TRAIN: 3668/3668   LOSS: 0.215
TRAIN ACC: 0.9602

VALIDATION ACC: 0.84559


EPOCH: 5   TRAIN: 3668/3668   LOSS: 0.176
TRAIN ACC: 0.9861

VALIDATION ACC: 0.84069


EPOCH: 6   TRAIN: 3668/3668   LOSS: 0.125
TRAIN ACC: 0.98719

VALIDATION ACC: 0.84314


EPOCH: 7   TRAIN: 3668/3668   LOSS: 0.105
TRAIN ACC: 0.99237

VALIDATION ACC: 0.83088




{'test': {'Accuracy': [], 'F1': []},
 'train': {'Accuracy': [0.7584514721919302,
   0.829062159214831,
   0.8865866957470011,
   0.9288440567066522,
   0.9601962922573609,
   0.9860959651035986,
   0.987186477644493,
   0.9923664122137404],
  'F1': [0.8435181914517837,
   0.8603252394742704,
   0.9110350727117194,
   0.9490333919156415,
   0.970361347949655,
   0.9897280966767371,
   0.9904954499494439,
   0.9943480016148567]},
 'validation': {'Accuracy': [0.7598039215686274,
   0.803921568627451,
   0.8088235294117647,
   0.8357843137254902,
   0.8455882352941176,
   0.8406862745098039,
   0.8431372549019608,
   0.8308823529411765],
  'F1': [0.8478260869565217,
   0.8412698412698413,
   0.8500000000000001,
   0.8896210873146623,
   0.8880994671403197,
   0.8869565217391304,
   0.8885017421602788,
   0.8812392426850257]}}

In [None]:
#loading models for RSA
# model2: adapter classifier restored from a saved checkpoint directory (local files only).
# NOTE(review): the path is an absolute Google Drive location and will not exist elsewhere.

model2_path = '/content/drive/MyDrive/MSc/NLP_Project/saved_model/COLA_57'
model2 = AdapterBertForSequenceClassification.from_pretrained(model2_path, local_files_only=True)

# model1: the pretrained 'bert-base-cased' checkpoint loaded as a plain BertModel.
model1 = BertModel.from_pretrained('bert-base-cased')

In [None]:
def get_models_rep(model1, model2,dataloader,  layer):
    """Collect both models' hidden states of one layer for every non-padding token.

    Both models are put in eval mode and moved to the GPU when one is available (CPU
    otherwise). Each batch is fed to both models together with its attention mask and
    token type ids, so padding positions do not influence the representations.

    Args:
        model1: model whose output holds the tuple of hidden states at index 2.
        model2: model whose output holds the tuple of hidden states at index 1.
            (Presumably a bare BertModel and a sequence-classification model called
            without labels, respectively -- verify when passing other model types.)
        dataloader: iterable of batches whose first three tensors are
            ``input_ids``, ``attention_mask`` and ``token_type_ids``; any further
            elements (e.g. labels) are ignored.
        layer (int): index of the layer to read; entry ``layer + 1`` of the hidden-state
            tuple is used (assumes entry 0 is the embedding output -- TODO confirm for
            the adapter model).

    Returns:
        tuple(torch.Tensor, torch.Tensor): two tensors with one row per token whose
        attention mask equals 1, for model1 and model2 respectively, in dataloader order.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model1.to(device).eval()
    model2.to(device).eval()

    representation_1, representation_2 = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, token_type_ids = (obj.to(device) for obj in batch[:3])
            model_inputs = dict(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                output_hidden_states=True,
            )
            real_tokens = attention_mask == 1

            outputs1 = model1(**model_inputs)
            representation_1.append(outputs1[2][layer + 1][real_tokens])

            outputs2 = model2(**model_inputs)
            representation_2.append(outputs2[1][layer + 1][real_tokens])

    # One concatenation per model instead of stacking thousands of single-token tensors.
    return torch.cat(representation_1), torch.cat(representation_2)


def calculate_RSA(rep1, rep2, random_index):
  """Representational similarity between two sets of vectors over a sample of rows.

  For each representation the rows listed in ``random_index`` are selected, their pairwise
  cosine-similarity matrix is built, and its strict upper triangle is flattened. The score
  is the Pearson correlation between the two flattened triangles, computed as the cosine
  similarity of the mean-centred vectors.

  Args:
      rep1 (torch.Tensor): 2-D tensor, one row per item.
      rep2 (torch.Tensor): 2-D tensor with rows aligned to ``rep1``.
      random_index: row indices to use (the same ones for both tensors). At least two
          are needed for the result to be meaningful.

  Returns:
      torch.Tensor: 0-dimensional tensor holding the correlation.
  """

  def _flat_pairwise_cosine(samples):
    # The sample count comes from the data itself; indices live on the same device.
    count = samples.shape[0]
    norms = torch.norm(samples, dim=1).view(-1, 1)
    similarity = samples @ samples.T / (norms @ norms.T)
    rows, cols = torch.triu_indices(count, count, offset=1, device=similarity.device)
    return similarity[rows, cols]

  sim1_flat = _flat_pairwise_cosine(rep1[random_index])
  sim2_flat = _flat_pairwise_cosine(rep2[random_index])

  cos = torch.nn.CosineSimilarity(dim = 0)
  pearson = cos(sim1_flat - sim1_flat.mean(dim=0, keepdim=True), sim2_flat - sim2_flat.mean(dim=0, keepdim=True))
  return pearson


In [None]:
def RSA_two_model(model1, model2, layers=range(7, 12), num_samples=5000, dataloader=None):
  """Compute the layer-wise RSA score between two models.

  For every requested layer the token representations of both models are extracted with
  ``get_models_rep`` and compared with ``calculate_RSA``. The token sample is drawn once,
  on the first layer, and reused for all layers so the scores are comparable.

  Args:
      model1: first model, passed through to ``get_models_rep``.
      model2: second model, passed through to ``get_models_rep``.
      layers: iterable of layer indices to score; each must be a valid row of the
          12-row result (default: layers 7 to 11).
      num_samples (int): number of token representations to sample; capped at the
          number of available tokens.
      dataloader: batches to extract representations from; defaults to the notebook's
          ``data_loaders['validation']``.

  Returns:
      torch.Tensor: tensor of shape (12, 1). Rows of the requested layers hold their RSA
      score; all other rows keep the initial value 1.
  """
  # Local import: `random` is not among the imports visible at the top of the notebook.
  import random

  if dataloader is None:
    dataloader = data_loaders['validation']

  RSAs = torch.ones((12,1))
  random_index = None

  for layer in layers:
    rep1, rep2 = get_models_rep(model1, model2, dataloader, layer=layer)
    if random_index is None:
      # Draw the sample once; never ask for more tokens than exist.
      n = len(rep1)
      random_index = random.sample(range(n), min(num_samples, n))
    RSAs[layer] = calculate_RSA(rep1, rep2, random_index)

  return RSAs


# Layer-wise RSA between model1 and model2: a (12, 1) tensor in which rows 7-11 are computed
# and the remaining rows keep their initial value of 1.
RSA = RSA_two_model(model1, model2)