# Initialize

## General

In [None]:
import os
import sys
import logging
import shutil

In [None]:
# Module-level logger for this notebook; its threshold is set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

## Environment Checking

In [None]:
def is_colab():
    """Tell whether the notebook runs inside a Google Colab instance.

    Returns:
        bool: True if the `google.colab` module is already loaded
    """
    loaded_modules = sys.modules
    return 'google.colab' in loaded_modules

In [None]:
def is_kaggle():
    """Tell whether the notebook runs inside a Kaggle Notebook.

    Returns:
        bool: True if both PWD and KAGGLE_URL_BASE carry the Kaggle values
    """
    env = os.environ
    in_working_dir = env.get('PWD') == '/kaggle/working'
    return in_working_dir and env.get('KAGGLE_URL_BASE') == 'https://www.kaggle.com'

In [None]:
# Show the current working directory and the CPU count reported by the OS
print(os.getcwd(), "----", os.cpu_count())

## Config

Pre-Set the Dataset Folder

Automatic Dataset Download and Drive mounting for Colab

Kaggle needs a Dataset created with the folder structure shown below, and it must be attached to the Notebook

Local needs a manually downloaded Dataset

Kaggle: 
```
/semeval2024-task2/semeval_task2_training_data
│   dev.json
│   train.json   
└───CT json
    └───...
```


Colab:
```
MyDrive
│   dev.json
│   train.json   
└───CT json
    └───...
```


Local:
```
./dataset/training_data
│   dev.json
│   train.json   
└───CT json
    └───...
```

### Logging Folder and Dataset Folder

In [None]:
# Dataset root per environment; Kaggle takes precedence over Colab, local is the fallback
if is_kaggle():
    FOLDER = "/kaggle/input/semeval2024-task2/semeval_task2_training_data" #Kaggle
elif is_colab():
    FOLDER = "/content/drive/MyDrive" #Colab
else:
    FOLDER = "./dataset/training_data" #Local

In [None]:
FOLDER

In [None]:
# Log output root per environment; Kaggle takes precedence over Colab, local is the fallback
if is_kaggle():
    LOG_FOLDER = "/kaggle/working" #Kaggle
elif is_colab():
    LOG_FOLDER = "/content/drive/MyDrive" #Colab
else:
    LOG_FOLDER = "./" #Local

In [None]:
LOG_FOLDER

### Mount

In [None]:
# Colab only: mount Google Drive at /content/drive
if is_colab():
  from google.colab import drive
  drive.mount('/content/drive')

In [None]:
# Kaggle only: set TOKENIZERS_PARALLELISM to "false"
# NOTE(review): presumably silences the Hugging Face tokenizers fork/parallelism warning - confirm
if is_kaggle():
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Install Dataset for Colab

In [None]:
if is_colab():
  !ls
  %cd /content/drive/MyDrive

  PROJECT_DIR = '/content/drive/MyDrive/semeval-2024'
  PROJECT_GITHUB_URL = 'https://github.com/ai-systems/Task-2-SemEval-2024.git'

  if not os.path.isdir(PROJECT_DIR):
    !git clone {PROJECT_GITHUB_URL}
  else:
    %cd {PROJECT_DIR}
    !git pull {PROJECT_GITHUB_URL}

  !unzip -n /content/drive/MyDrive/Task-2-SemEval-2024/training_data.zip

  %cd /content
  !ls

In [None]:
# Local run only: extract the manually downloaded archive into ./dataset.
# The former `!cd dataset` / `!cd ..` lines ran in throw-away subshells and never
# changed the notebook's working directory, so they were dropped; both paths
# below are relative to the notebook's working directory.
if not is_colab() and not is_kaggle():
    shutil.unpack_archive("dataset/training_data.zip", "dataset")

# Imports

Installing and importing all the necessary Python libraries

In [None]:
# Colab only: install the third-party packages imported further down
if is_colab():
  !pip install datasets
  !pip install pytorch_lightning
  !pip install torchmetrics
  !pip install transformers
  #!pip install torchinfo
  !pip install sentencepiece
  !pip install mlxtend

In [None]:
if not is_colab() and not is_kaggle():
  !pip install json
  !pip install numpy
  !pip install matplotlib
  #!pip install seaborn
  #!pip install tqdm
  #!pip install pandas
  !pip install torch
  !pip install pytorch_lightning
  !pip install torchmetrics
  !pip install transformers
  !pip install sentencepiece
  !pip install datasets
  !pip install mlxtend
  !pip install scikit-learn

In [None]:
from typing import List, Tuple, Any
import os
import sys
#import timeit
import random
from collections import Counter
import json
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sn
#from tqdm import tqdm
#import pandas as pd

import torch
#import torch.nn.functional as F
import pytorch_lightning as pl
#from pytorch_lightning.utilities import CombinedLoader
import torchmetrics
import transformers
from datasets import load_dataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#from sklearn.model_selection import train_test_split

from functools import lru_cache
from mlxtend.plotting import plot_confusion_matrix


#from pytorch_lightning.utilities.model_summary import summarize


# Modules

## Helper Function

In [None]:
#@lru_cache(maxsize=1024)
def get_file(pth:str) -> dict:
    """Load a JSON file.

    Args:
        pth (str): Path to the file

    Returns:
        dict: Parsed content of the JSON file

    Raises:
        OSError: The file cannot be opened
        json.JSONDecodeError: The file does not contain valid JSON
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding
    with open(pth, encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:
def extract(itm:dict, permutate:bool=False) -> Tuple[List[str], str]:
    """Collect the text fields and the label of one datapoint.

    The requested section is read from the primary trial file; for datapoints of
    type 'Comparison' the same section of the secondary trial is read as well,
    otherwise the secondary part stays empty.

    Args:
        itm (dict): Datapoint/Hypothesis
        permutate (bool, optional): Shuffle the section entries of both trials. Defaults to False.

    Returns:
        Tuple[List[str], str]: [Statement, Section, Primary, Secondary] and the Label of the datapoint
    """

    def _section_of(trial_id):
        # Section entries of one clinical-trial file
        return get_file(f"{FOLDER}/CT json/{trial_id}.json")[itm['Section_id']]

    primary = _section_of(itm['Primary_id'])
    if itm['Type'] == 'Comparison':
        secondary = _section_of(itm['Secondary_id'])
    else:
        secondary = []

    if permutate:
        random.shuffle(primary)
        random.shuffle(secondary)

    primary_text = ", ".join(entry.strip() for entry in primary)
    secondary_text = ", ".join(entry.strip() for entry in secondary)
    return [itm['Statement'], itm['Section_id'], primary_text, secondary_text], itm['Label']

In [None]:
def split_v2(strng:str, sep:str, pos:int) -> List[str]:
    """Cut a string into exactly two parts at the pos-th separator.

    The string is split on every separator; the first `pos` pieces are glued
    back together with the separator, and so are the remaining pieces.

    Args:
        strng (str): Input String
        sep (str): Separator
        pos (int): Number of pieces that form the first part

    Returns:
        List[str]: [first part, second part]

    Example Input 1:
        strng='aaa,bbb,ccc,ddd'
        sep  =','
        pos  =3
        --------------> ['aaa,bbb,ccc', 'ddd']
    Example Input 2:
        strng='aaa,bbb,ccc,ddd'
        sep  =','
        pos  =2
        --------------> ['aaa,bbb','ccc,ddd']
    """
    pieces = strng.split(sep)
    head, tail = pieces[:pos], pieces[pos:]
    return [sep.join(head), sep.join(tail)]

In [None]:
def class_to_cos(inp:torch.Tensor) -> torch.Tensor:
    """Translate class ids into cosine targets on a detached copy.

    Class 0 becomes -1, class 1 stays +1; the input tensor is left untouched.

    Args:
        inp (torch.Tensor): Input Tensor

    Returns:
        torch.Tensor: Mapped Tensor
    """
    mapped = inp.detach().clone()
    mapped[mapped == 0] = -1  # entries equal to 1 already hold their target value
    return mapped

In [None]:
def cos_to_class(inp:torch.Tensor) -> torch.Tensor:
    """Translate cosine scores into class ids on a detached copy.

    Values below 0 become 0, values above 0 become 1; the input tensor is left
    untouched and the result keeps the input dtype.

    Args:
        inp (torch.Tensor): Input Tensor

    Returns:
        torch.Tensor: Mapped Tensor
    """
    mapped = inp.detach().clone()
    negative = mapped < 0
    positive = mapped > 0
    mapped[negative] = 0
    mapped[positive] = 1
    return mapped

In [None]:
def get_nonlin_func(nonlin: str):
    """Look up an activation function by name.

    Args:
        nonlin (str): One of "tanh", "relu", "gelu", "sigmoid"

    Raises:
        ValueError: The name is not one of the supported activations

    Returns:
        func: Activation Function
    """
    supported = (
        ("tanh", torch.tanh),
        ("relu", torch.relu),
        ("gelu", torch.nn.functional.gelu),
        ("sigmoid", torch.sigmoid),
    )
    for label, activation in supported:
        if nonlin == label:
            return activation
    raise ValueError("Unsupported nonlinearity!")

In [None]:
def upd(dic:dict, key:Any, val:Any) -> dict:
    """Set one key of a dictionary in place and hand the same dictionary back.

    Args:
        dic (dict): Input Dictionary (modified in place)
        key (Any): Key whose value is set
        val (Any): New value

    Returns:
        dict: The same dictionary object, after the update
    """
    dic[key] = val
    return dic

## Dataset & Dataloader & LightningDataModule

In [None]:
# Mapping from the textual class names to the numeric class ids used as labels
label_enum = {"Contradiction":0,"Entailment":1} #Text Classes to Numbers

### Datasets

In [None]:
class SemEvalDatasetModule:
    """Map-style dataset over SemEval datapoints.

    Each item is built on access: the text parts returned by `extract` are
    joined with " [SEP] " and the textual label is converted to a tensor.
    """

    def __init__(self, dataset, permutate=False):
        # dataset: indexable collection of datapoints; permutate is forwarded to extract()
        self.dataset, self.permutate = dataset, permutate

    def __len__(self) -> int:
        """Number of datapoints.

        Returns:
            int: Length of the Dataset
        """
        return len(self.dataset)

    def name(self) -> str:
        """Short identifier of this dataset.

        Returns:
            str: Always 'semeval'
        """
        return 'semeval'

    def __getitem__(self, idx:int)-> dict:
        """Build the sample at position idx.

        Args:
            idx (int): position in the set

        Returns:
            dict: 'image' holds the joined text, 'label' the class id as tensor
        """
        parts, label_name = extract(self.dataset[idx], self.permutate)
        text = " [SEP] ".join(parts)
        return {'image': text, 'label': torch.tensor(label_enum[label_name])}

In [None]:
class ScifactDatasetModule:
    def __init__(self):
        """Load the SciFact entailment data as a two-class dataset.

        Rows with verdict NEI are dropped; SUPPORT is stored as "Entailment" and
        every other remaining verdict as "Contradiction" (the SemEval class names).
        The train and validation splits are concatenated into one dataset.

        Task: decide whether a claim is entailed by a paper title and abstract.

        https://huggingface.co/datasets/allenai/scifact_entailment
        """
        self.ds = load_dataset("allenai/scifact_entailment")
        kept = []
        for split in ("train", "validation"):
            for row in self.ds[split]:
                if row['verdict'] == "NEI":
                    continue
                verdict = "Entailment" if row['verdict'] == "SUPPORT" else "Contradiction"
                kept.append(upd(row, "label", verdict))
        self.dataset = kept

    def __len__(self) -> int:
        """Number of kept rows.

        Returns:
            int: Length of the Dataset
        """
        return len(self.dataset)

    def name(self) -> str:
        """Short identifier of this dataset.

        Returns:
            str: Always 'scifact'
        """
        return 'scifact'

    def __getitem__(self, idx:int)-> dict:
        """Build the sample at position idx.

        Args:
            idx (int): position in the set

        Returns:
            dict: 'image' holds "claim [SEP] title [SEP] abstract", 'label' the class id as tensor
        """
        row = self.dataset[idx]
        abstract = " ".join(row["abstract"])
        text = f"{row['claim']} [SEP] {row['title']} [SEP] {abstract}"
        return {'image': text, 'label': torch.tensor(label_enum[row["label"]])}

In [None]:
class HealthverDatasetModule:
    def __init__(self):
        """Load the HealthVer entailment data as a two-class dataset.

        Rows with verdict NEI are dropped; SUPPORT is stored as "Entailment" and
        every other remaining verdict as "Contradiction" (the SemEval class names).
        The train and validation splits are concatenated into one dataset.

        Task: decide whether a claim is entailed by a paper title and abstract.

        https://huggingface.co/datasets/dwadden/healthver_entailment
        """
        self.ds = load_dataset("dwadden/healthver_entailment")
        kept = []
        for split in ("train", "validation"):
            for row in self.ds[split]:
                if row['verdict'] == "NEI":
                    continue
                verdict = "Entailment" if row['verdict'] == "SUPPORT" else "Contradiction"
                kept.append(upd(row, "label", verdict))
        self.dataset = kept

    def __len__(self) -> int:
        """Number of kept rows.

        Returns:
            int: Length of the Dataset
        """
        return len(self.dataset)

    def name(self) -> str:
        """Short identifier of this dataset.

        Returns:
            str: Always 'healthver'
        """
        return 'healthver'

    def __getitem__(self, idx:int)-> dict:
        """Build the sample at position idx.

        Args:
            idx (int): position in the set

        Returns:
            dict: 'image' holds "claim [SEP] title [SEP] abstract", 'label' the class id as tensor
        """
        row = self.dataset[idx]
        abstract = " ".join(row["abstract"])
        text = f"{row['claim']} [SEP] {row['title']} [SEP] {abstract}"
        return {'image': text, 'label': torch.tensor(label_enum[row["label"]])}

In [None]:
class SnliDatasetModule:
    def __init__(self, name="validation"):
        """SNLI Dataset Loader reduced to the two SemEval classes.

        Only rows labelled entailment (0) or contradiction (2) are kept. Neutral
        rows (1) are dropped, and so are rows without a gold label (-1), which
        were previously kept and silently relabelled as "Contradiction".

        Task is to check whether the premise entails the hypothesis.

        https://huggingface.co/datasets/snli

        Args:
            name (str, optional): train, validation, test of snli dataset. Defaults to "validation".
        """
        self.ds = load_dataset("snli")
        self.dataset = self._to_binary(self.ds[name])

    @staticmethod
    def _to_binary(rows) -> list:
        """Filter SNLI rows to entailment/contradiction and rename their labels.

        Args:
            rows (Iterable[dict]): SNLI rows with an integer "label" field

        Returns:
            list: Copies of the kept rows whose "label" is "Entailment" or "Contradiction"
        """
        class_names = {0: "Entailment", 2: "Contradiction"}
        return [
            {**row, "label": class_names[row["label"]]}
            for row in rows
            if row["label"] in class_names
        ]

    def __len__(self) -> int:
        """Returns the length of the Dataset

        Returns:
            int: Length of the Dataset
        """
        return len(self.dataset)

    def name(self) -> str:
        """Returns a Name

        Returns:
            str: name of the Class
        """
        return 'snli'

    def __getitem__(self, idx:int)-> dict:
        """Returns the Datapoint from the Dataset on position index

        Args:
            idx (int): position in the set

        Returns:
            dict: 'image' holds "premise [SEP] hypothesis", 'label' the class id as tensor
        """
        row = self.dataset[idx]
        text = "{} [SEP] {}".format(row["premise"], row["hypothesis"])
        return {'image': text, 'label': torch.tensor(label_enum[row["label"]])}

### Lightning Module

In [None]:
class DataModule(pl.LightningDataModule):
    """LightningDataModule that builds the SemEval train/validation datasets and,
    when a strategy is given, extends training with the optional extra datasets.

    Test and predict datasets are only placeholders (None) in this class.
    """
    def __init__(self, batch_size=4, num_workers=0, permutate=False, strategy="", ds_names=[], **kwargs):
      """Lightning Dataset Module for Loading all possible Datasets initial

      Args:
          batch_size (int, optional): Batch size for the Dataloaders. Defaults to 4.
          num_workers (int, optional): Number of Parallel Workers. Defaults to 0.
          permutate (bool, optional): Activates Permutation for Training. Defaults to False.
          strategy (str, optional): Strategy Decision oneof("CombinedLoader", "CombinedDataset", "Pre-Post:0:50"). Defaults to "".
          ds_names (list, optional): optional Datasets for the Strategy. Defaults to [].
      """
      super().__init__()
      # Stores the constructor arguments in self.hparams; everything below reads them from there
      self.save_hyperparameters()
      self.prepare_data_per_node = False
      print(self.hparams)


      #Loading Training Dataset
      train = get_file(f"{FOLDER}/train.json")
      self.train_dataset   = SemEvalDatasetModule([train[key] for key in train], self.hparams.permutate)
      print("Length of SemEval Train Dataset: ", len(self.train_dataset))

      #Expanding the Dataset
      # self.ds only exists when a strategy is set; SemEval is always its first entry
      if self.hparams.strategy:
        assert ds_names, "NO Dataset Choosen"
        print(f"Datasets for expanding Training with strategy {self.hparams.strategy}:")
        self.ds = [self.train_dataset]
        if "snli" in self.hparams.ds_names: self.append_train_ds(SnliDatasetModule())
        if "scifact" in self.hparams.ds_names: self.append_train_ds(ScifactDatasetModule())
        if "healthver" in self.hparams.ds_names: self.append_train_ds(HealthverDatasetModule())

      #Loading Validation Dataset
      valid = get_file(f"{FOLDER}/dev.json")
      self.val_dataset     = SemEvalDatasetModule([valid[key] for key in valid])
      print("Length of SemEval Valid Dataset: ", len(self.val_dataset))

      #Test Dataset placeholder (nothing is loaded here)
      self.test_dataset    =  None

      #Predict Dataset placeholder (nothing is loaded here)
      self.predict_dataset = None





    def append_train_ds(self, ds):
      """Append the Dataset to the List

      Args:
          ds (Dataset): Selected Dataset
      """
      print(f"      Length of {ds.name()} Dataset: ", len(ds))
      self.ds.append(ds)

    def symm_merger(self) -> List[dict]:
      """Merges equaly each Dataset 
      
      Example for 4 Datasets: 1, 2, 3, 4,    1, 2, 3, 4,    ...

      The merged list stops with the shortest dataset, because the shuffled
      per-dataset loaders are combined with zip().

      Returns:
          List[dict]: Newly Merged Dictionaries
      """
      # batch_size=None: each loader yields single (unbatched) samples in shuffled order
      dl = [torch.utils.data.DataLoader(i,batch_size=None,num_workers=self.hparams.num_workers,shuffle=True) for i in self.ds]
      # Interleave one sample per dataset, round after round
      return [{'image': j["image"], 'label': j["label"]} for i in zip(*dl) for j in i]



    def train_dataloader(self)-> torch.utils.data.DataLoader:
      """Returns the Training Dataloader with the choosen hyperparameters

      For the "Pre-Post" strategy a dict {start epoch: DataLoader} is returned
      instead of a single DataLoader.

      Returns:
          torch.utils.data.DataLoader: Train Dataloader
      """
      #equal number elements in batch (no shuffle because it destroys the structure)
      if "CombinedLoader" == self.hparams.strategy:
        return torch.utils.data.DataLoader(self.symm_merger(),batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=False)

      #fully randomly mixind of all Datasets
      if "CombinedDataset" == self.hparams.strategy:
        return torch.utils.data.DataLoader(torch.utils.data.ConcatDataset(self.ds), batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=True)

      #Training in several steps (3 Steps: Pre-Post:30:50 --> Epoch ds1 0-30, Epoch ds2 30-50, Epoch semeval 50-END)
      if "Pre-Post" in self.hparams.strategy:
        _split= self.hparams.strategy.split(":")
        # The leading "Pre-Post" token is replaced by start epoch 0 of the first stage
        _split[0] = "0"
        _split = list(map(int, _split))
        assert len(self.ds) == len(_split), "Number Stages does not equal Number of Datasets"
        # reversed(): the dataset appended last trains first, SemEval (first entry) trains last
        # NOTE(review): the consumer of this dict is not visible here - confirm how stages are switched
        return {i:torch.utils.data.DataLoader(j,batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=True) for i, j in zip(_split, reversed(self.ds))}

      #Failsafe
      return torch.utils.data.DataLoader(self.train_dataset,batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=True) 



    def val_dataloader(self) -> torch.utils.data.DataLoader:
      """Returns the Validation Dataloader with the choosen hyperparameters and no shuffle

      Returns:
          torch.utils.data.DataLoader: Validation Dataloader
      """
      return torch.utils.data.DataLoader(self.val_dataset,batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=False)



    def test_dataloader(self) -> torch.utils.data.DataLoader:
      """Returns the Test Dataloader with the choosen hyperparameters and no shuffle

      Returns:
          torch.utils.data.DataLoader: Test Dataloader
      """
      # self.test_dataset is None unless it was assigned after construction
      return torch.utils.data.DataLoader(self.test_dataset,batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=False)



    def predict_dataloader(self) -> torch.utils.data.DataLoader:
      """Returns the Predict Dataloader with the choosen hyperparameters and no shuffle

      Returns:
          torch.utils.data.DataLoader: Predict Dataloader
      """
      # self.predict_dataset is None unless it was assigned after construction
      return torch.utils.data.DataLoader(self.predict_dataset,batch_size=self.hparams.batch_size,num_workers=self.hparams.num_workers,shuffle=False)

## Set up the Pytorch Model and LightningModule

### AutoModelForSequenceClassification + CELoss

In [None]:
class ClassificationModel_V1(torch.nn.Module):
    """Baseline: AutoModelForSequenceClassification trained with CrossEntropyLoss.

    model.forward():
    | tokenize the sentences (padded/truncated to len_embeddings)
    | run the classifier
    | compute CrossEntropyLoss on the logits
    | return loss (with one trailing dimension) and predicted class ids
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512):
        super().__init__()
        self.len_embeddings = len_embeddings
        self.model = transformers.AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            max_position_embeddings=self.len_embeddings,
            ignore_mismatched_sizes=True,
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=True,
            sep_token='[SEP]',
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self,sentence,label,device):
        encoded = self.tokenizer(
            sentence,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.len_embeddings,
            return_tensors="pt",
        ).to(device)
        logits = self.model(**encoded, labels=label).logits
        loss = self.loss_fn(logits, label)
        return loss.unsqueeze(-1), logits.argmax(dim=1)

### 4xSiameseBert (weight Sharing) -> (u, v, x, y)Linear + CELoss

In [None]:
class ClassificationModel_V2(torch.nn.Module):
    """Sentence-BERT style model with one shared encoder applied to every " [SEP] " piece.

    #https://www.sbert.net/examples/training/nli/README.html
    #https://arxiv.org/pdf/1908.10084.pdf

    model.forward():
    | split each sentence on " [SEP] "
    | tokenize the pieces of a sentence as one batch
    | encode them and take pooler_output
    | stack per sentence, flatten, dropout, linear layer
    | compute CrossEntropyLoss
    | return loss (with one trailing dimension) and predicted class ids

    The linear layer expects 3072 input features, i.e. 4 pieces x 768 when the
    pooled embeddings are 768 wide (assumption - depends on the chosen encoder).
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512):
        super().__init__()
        self.len_embeddings = len_embeddings
        self.model = transformers.AutoModel.from_pretrained(
            model_name,
            max_position_embeddings=self.len_embeddings,
            ignore_mismatched_sizes=True,
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=True,
            sep_token='[SEP]',
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

        #Feed Forward Network
        self.dp1 = torch.nn.Dropout(p=0.1, inplace=False)
        self.fc1 = torch.nn.Linear(3072, num_labels, bias=True)

    def forward(self,sentence,label,device):
        pieces_per_sample = [text.split(" [SEP] ") for text in sentence]
        encoded = [
            self.tokenizer(
                pieces,
                padding='max_length',
                truncation=True,
                max_length=self.len_embeddings,
                return_tensors="pt",
            ).to(device)
            for pieces in pieces_per_sample
        ]
        pooled = [
            self.model(**enc, output_hidden_states=True, return_dict=True).pooler_output
            for enc in encoded
        ]

        features = torch.stack(pooled).flatten(1)
        logits = self.fc1(self.dp1(features))
        loss = self.loss_fn(logits, label)
        return loss.unsqueeze(-1), logits.argmax(dim=1)

### 2xSiameseBert (weight Sharing) -> (u, v)Linear + CELoss

In [None]:
class ClassificationModel_V3(torch.nn.Module):
    """Sentence-BERT style model with one shared encoder applied to two parts:
    statement vs. section [SEP] primary [SEP] secondary

    #https://www.sbert.net/examples/training/nli/README.html
    #https://arxiv.org/pdf/1908.10084.pdf

    model.forward():
    | split each sentence at its first " [SEP] " into 2 parts
    | tokenize both parts as one batch
    | encode them and take pooler_output
    | stack per sentence, flatten, dropout, linear layer
    | compute CrossEntropyLoss
    | return loss (with one trailing dimension) and predicted class ids

    The linear layer expects 1536 input features, i.e. 2 parts x 768 when the
    pooled embeddings are 768 wide (assumption - depends on the chosen encoder).
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512):
        super().__init__()
        self.len_embeddings = len_embeddings
        self.model = transformers.AutoModel.from_pretrained(
            model_name,
            max_position_embeddings=self.len_embeddings,
            ignore_mismatched_sizes=True,
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=True,
            sep_token='[SEP]',
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

        #Feed Forward Network
        self.dp1 = torch.nn.Dropout(p=0.1, inplace=False)
        self.fc1 = torch.nn.Linear(1536, num_labels, bias=True)

    def forward(self,sentence,label,device):
        parts_per_sample = [split_v2(text, " [SEP] ", 1) for text in sentence]
        encoded = [
            self.tokenizer(
                parts,
                padding='max_length',
                truncation=True,
                max_length=self.len_embeddings,
                return_tensors="pt",
            ).to(device)
            for parts in parts_per_sample
        ]
        pooled = [
            self.model(**enc, output_hidden_states=True, return_dict=True).pooler_output
            for enc in encoded
        ]

        features = torch.stack(pooled).flatten(1)
        logits = self.fc1(self.dp1(features))
        loss = self.loss_fn(logits, label)
        return loss.unsqueeze(-1), logits.argmax(dim=1)

### 2xSiameseBert (weight Sharing) -> (u, v, |u-v|)Linear + CELoss

In [None]:
class ClassificationModel_V4(torch.nn.Module):
    """Sentence-BERT style model with one shared encoder applied to two parts
    (statement vs. section [SEP] primary [SEP] secondary), classifying on (u, v, |u-v|).

    #https://www.sbert.net/examples/training/nli/README.html
    #https://arxiv.org/pdf/1908.10084.pdf

    model.forward():
    | split each sentence at its first " [SEP] " into 2 parts
    | tokenize both parts as one batch
    | encode them and take pooler_output
    | append the absolute difference of the two embeddings
    | stack per sentence, flatten, dropout, linear layer
    | compute CrossEntropyLoss
    | return loss (with one trailing dimension) and predicted class ids

    The linear layer expects 2304 input features, i.e. 3 vectors x 768 when the
    pooled embeddings are 768 wide (assumption - depends on the chosen encoder).
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512):
        super().__init__()
        self.len_embeddings = len_embeddings
        self.model = transformers.AutoModel.from_pretrained(
            model_name,
            max_position_embeddings=self.len_embeddings,
            ignore_mismatched_sizes=True,
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=True,
            sep_token='[SEP]',
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

        self.dp1 = torch.nn.Dropout(p=0.1, inplace=False)
        self.fc1 = torch.nn.Linear(2304, num_labels, bias=True)

    def forward(self,sentence,label,device):
        parts_per_sample = [split_v2(text, " [SEP] ", 1) for text in sentence]
        encoded = [
            self.tokenizer(
                parts,
                padding='max_length',
                truncation=True,
                max_length=self.len_embeddings,
                return_tensors="pt",
            ).to(device)
            for parts in parts_per_sample
        ]
        pooled = [
            self.model(**enc, output_hidden_states=True, return_dict=True).pooler_output
            for enc in encoded
        ]

        # (u, v) -> (u, v, |u - v|) for every sample
        with_difference = []
        for pair in pooled:
            gap = (pair[0] - pair[1]).abs().unsqueeze(0)
            with_difference.append(torch.cat([pair, gap]))

        features = torch.stack(with_difference).flatten(1)
        logits = self.fc1(self.dp1(features))
        loss = self.loss_fn(logits, label)
        return loss.unsqueeze(-1), logits.argmax(dim=1)

### 2xSiameseBert (weight Sharing) -> CosSim + CosEmbLoss

In [None]:
class ClassificationModel_V5(torch.nn.Module):
    """Siamese encoder whose two output embeddings are compared by cosine
    similarity and trained with CosineEmbeddingLoss.

    #https://www.sbert.net/examples/training/nli/README.html
    #https://arxiv.org/pdf/1908.10084.pdf

    model.forward():
    | split each sentence at its first " [SEP] " into 2 parts
    | tokenize both parts as one batch
    | encode them and take pooler_output
    | regroup into all first parts / all second parts
    | CosineEmbeddingLoss against class_to_cos(label)
    | return loss (with one trailing dimension) and cos_to_class(cosine similarity)
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512):
        super().__init__()
        self.len_embeddings = len_embeddings
        self.model = transformers.AutoModel.from_pretrained(
            model_name,
            max_position_embeddings=self.len_embeddings,
            ignore_mismatched_sizes=True,
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=True,
            sep_token='[SEP]',
        )
        self.loss_fn = torch.nn.CosineEmbeddingLoss()
        self.cos_sim = torch.nn.CosineSimilarity(dim=-1)

    def forward(self,sentence,label, device):
        parts_per_sample = [split_v2(text, " [SEP] ", 1) for text in sentence]
        encoded = [
            self.tokenizer(
                parts,
                padding='max_length',
                truncation=True,
                max_length=self.len_embeddings,
                return_tensors="pt",
            ).to(device)
            for parts in parts_per_sample
        ]
        pooled = [
            self.model(**enc, output_hidden_states=True, return_dict=True).pooler_output
            for enc in encoded
        ]

        # Swap the sample axis and the part axis: index 0 = first parts, index 1 = second parts
        grouped = torch.stack(pooled).swapaxes(0,1)
        first, second = grouped[0], grouped[1]

        loss = self.loss_fn(first, second, class_to_cos(label))
        similarity = self.cos_sim(first, second)
        return loss.unsqueeze(-1), cos_to_class(similarity)

### 2xSiameseBert (weight Sharing) -> CosSim + MSE or L1 or SoftMargin

In [None]:
class ClassificationModel_V6(torch.nn.Module):
    """Siamese encoder whose cosine similarity is regressed onto the -1/+1
    targets with a selectable loss (v=1 MSE, v=2 L1, v=3 SoftMargin).

    #https://www.sbert.net/examples/training/nli/README.html
    #https://arxiv.org/pdf/1908.10084.pdf

    model.forward():
    | split each sentence at its first " [SEP] " into 2 parts
    | tokenize both parts as one batch
    | encode them and take pooler_output
    | regroup into all first parts / all second parts
    | cosine similarity, loss against class_to_cos(label)
    | return loss (with one trailing dimension) and cos_to_class(cosine similarity)
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512, v:int=1):
        super().__init__()
        self.len_embeddings = len_embeddings
        self.model = transformers.AutoModel.from_pretrained(
            model_name,
            max_position_embeddings=self.len_embeddings,
            ignore_mismatched_sizes=True,
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=True,
            sep_token='[SEP]',
        )

        # Loss variant by number; any other v leaves loss_fn unset
        loss_variants = {
            1: torch.nn.MSELoss,
            2: torch.nn.L1Loss,
            3: torch.nn.SoftMarginLoss,
        }
        if v in loss_variants:
            self.loss_fn = loss_variants[v]()
        self.cos_sim = torch.nn.CosineSimilarity(dim=-1, eps=1e-9)

    def forward(self,sentence,label, device):
        parts_per_sample = [split_v2(text, " [SEP] ", 1) for text in sentence]
        encoded = [
            self.tokenizer(
                parts,
                padding='max_length',
                truncation=True,
                max_length=self.len_embeddings,
                return_tensors="pt",
            ).to(device)
            for parts in parts_per_sample
        ]
        pooled = [
            self.model(**enc, output_hidden_states=True, return_dict=True).pooler_output
            for enc in encoded
        ]

        # Swap the sample axis and the part axis: index 0 = first parts, index 1 = second parts
        grouped = torch.stack(pooled).swapaxes(0,1)
        similarity = self.cos_sim(grouped[0], grouped[1])
        target = class_to_cos(label).float()
        loss = self.loss_fn(similarity, target)
        return loss.unsqueeze(-1), cos_to_class(similarity)

### 2xSiameseBert (weight Sharing) -> (u, v)Linear + CELoss + SupConLoss (abandoned)

In [None]:
#https://github.com/princeton-nlp/SimCSE/blob/main/simcse/models.py
#https://bhuvana-kundumani.medium.com/implementation-of-simcse-for-unsupervised-approach-in-pytorch-a3f8da756839
#https://arxiv.org/pdf/2104.08821.pdf
#https://arxiv.org/pdf/2109.04321.pdf
#https://aclanthology.org/2023.semeval-1.91.pdf

class SupConLoss(torch.nn.Module):
    """In-batch contrastive loss on temperature-scaled cosine similarities.

    Every row of the first view is compared with every row of the second view;
    the matching row index is the positive, all other rows act as negatives.

    Args:
        temperature (float, optional): Divisor applied to the similarities. Defaults to 0.07.
    """
    def __init__(self, temperature=0.07):
        super(SupConLoss, self).__init__()
        self.temperature = temperature
        self.cos_sim = torch.nn.CosineSimilarity(dim=-1, eps=1e-9)
        self.loss_fct = torch.nn.CrossEntropyLoss()

    def forward(self, features):
      """Compute the loss for two views.

      Args:
          features: Indexable with features[0] and features[1], each a 2-D tensor
              of embeddings (one row per sample, same row count in both views)

      Returns:
          torch.Tensor: Scalar loss
      """
      # Broadcast (N, 1, D) against (1, N, D) to get the full N x N similarity matrix
      cos = self.cos_sim(features[0].unsqueeze(1), features[1].unsqueeze(0)) / self.temperature
      # Targets must live on the same device as the logits (was always created on the CPU)
      labels = torch.arange(cos.size(0), device=cos.device, dtype=torch.long)
      return self.loss_fct(cos, labels)

class ClassificationModel_V8(torch.nn.Module):
    """Siamese Networks Structure combining a linear classifier (CrossEntropyLoss)
    with an in-batch contrastive loss on the two part embeddings.

    #https://github.com/princeton-nlp/SimCSE/blob/main/simcse/models.py
    #https://bhuvana-kundumani.medium.com/implementation-of-simcse-for-unsupervised-approach-in-pytorch-a3f8da756839
    #https://arxiv.org/pdf/2104.08821.pdf
    #https://arxiv.org/pdf/2109.04321.pdf
    #https://aclanthology.org/2023.semeval-1.91.pdf

    model.forward():
    | Split sentence in 2 pieces at the first [SEP] token
    | 2xTokenize Sentence
    | 2x Prediction
    | Stack Items
    | Classifier
    | Calculate Loss (CE + alpha * contrastive)
    | Return Loss and Prediction

    Args:
        model_name (str, optional): Hugging Face model name. Defaults to "distilbert-base-uncased".
        num_labels (int, optional): Number of classifier outputs. Defaults to 2.
        len_embeddings (int, optional): Passed as max_position_embeddings. Defaults to 512.
        v (int, optional): Temperature variant (1: 0.5, 2: 0.25, 3: 0.125, 4: 0.05). Defaults to 1.
        alpha (float, optional): Weight of the contrastive term. Defaults to 1.0.
            NOTE(review): the original code read self.alpha without ever defining it;
            1.0 (plain sum of both losses) is a placeholder default - confirm the intended value.

    Raises:
        ValueError: v is not one of the supported variants
    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512, v:int=1, alpha:float=1.0):
        super(ClassificationModel_V8,self).__init__()
        self.len_embeddings = len_embeddings
        self.alpha = alpha
        self.model = transformers.AutoModel.from_pretrained(model_name,max_position_embeddings=self.len_embeddings,ignore_mismatched_sizes=True)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name,do_lower_case=True,sep_token='[SEP]')

        #Feed Forward Network
        # 1536 input features = 2 parts x 768 (assumes 768-wide pooled embeddings)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(1536, num_labels)
        self.loss_ce = torch.nn.CrossEntropyLoss()

        temperatures = {1: 0.5, 2: 0.25, 3: 0.125, 4: 0.05}
        if v not in temperatures:
            # Fail at construction instead of with a missing attribute inside forward()
            raise ValueError(f"Unsupported contrastive variant v={v!r}; expected one of {sorted(temperatures)}")
        self.loss_scl = SupConLoss(temperature=temperatures[v])

    def forward(self,sentence,label, device):
        prepare_s  = [split_v2(itm, " [SEP] ", 1) for itm in sentence]
        tokens     = [self.tokenizer(itm,padding=True,truncation=True,return_tensors="pt").to(device) for itm in prepare_s]
        embeddings = [self.model(**itm, output_hidden_states=True, return_dict=True).pooler_output for itm in tokens]

        # One entry per sample, each holding the embeddings of its two parts
        emb = torch.stack(embeddings)
        logits = self.classifier(self.dropout(emb.flatten(1)))

        # SupConLoss indexes the two views with [0] and [1], so put the part axis first
        # (all first parts vs. all second parts); previously sample 0 was compared with sample 1.
        views = emb.swapaxes(0, 1)
        loss = self.loss_ce(logits, label) + self.alpha * self.loss_scl(views)
        #loss = (1 - self.alpha) * self.loss_ce(logits, label) + self.alpha * self.loss_scl(emb)
        return  torch.unsqueeze(loss, -1), torch.argmax(logits, dim=1)

### Adapter BertModelForSequenceClassification + CELoss

In [None]:
from transformers.models.bert.modeling_bert import BertIntermediate, BertOutput, BertLayer, BertEncoder, BertAttention, BertSelfOutput
from transformers import BertConfig, BertModel, BertForSequenceClassification

class BottleneckAdapterBertConfig(BertConfig):
    """BertConfig extended by the settings of the bottleneck adapters.
    """
    def __init__(self,adapter_residual:bool=True,
                      add_attention_adapter:bool=True,
                      add_intermediate_adapter:bool=True,
                      add_output_adapter:bool=True,
                      layers_to_adapt:list=None,
                      adapter_non_linearity:str="gelu",
                      adapter_latent_size:int=512,
                      last_layer_dropout:float=0.2,
                      hidden_size:int=768,
                      **kwargs):
        """Config Definition of the Adapter Model

        Args:
            adapter_residual (bool, optional): Adds the Residual Adder term in the Bottleneck FFN. Defaults to True.
            add_attention_adapter (bool, optional): Adds the Attention Adapter. Defaults to True.
            add_intermediate_adapter (bool, optional): Adds the Intermediate Adapter. Defaults to True.
            add_output_adapter (bool, optional): Adds the Output Adapter. Defaults to True.
            layers_to_adapt (list, optional): Indices of the layers which get adapters. Defaults to None which means list(range(12)).
            adapter_non_linearity (str, optional): Which non-linearity function is used. Defaults to "gelu".
            adapter_latent_size (int, optional): Size of the downsizing and upsizing Bottleneck. Defaults to 512.
            last_layer_dropout (float, optional): Dropout Percentage. Defaults to 0.2.
            hidden_size (int, optional): Width of the hidden states, forwarded to BertConfig. Defaults to 768.
        """
        # hidden_size is a named argument here, so it never ends up in kwargs;
        # hand it to BertConfig explicitly (the former default of 0.2 was not a usable layer width)
        super().__init__(hidden_size=hidden_size, **kwargs)
        self.adapter_residual = adapter_residual
        self.add_attention_adapter = add_attention_adapter
        self.add_intermediate_adapter = add_intermediate_adapter
        self.add_output_adapter = add_output_adapter
        self.last_layer_dropout = last_layer_dropout
        # build a fresh list per instance instead of sharing one mutable default between all configs
        self.layers_to_adapt = list(range(12)) if layers_to_adapt is None else list(layers_to_adapt)
        self.adapter_latent_size = adapter_latent_size
        self.adapter_non_linearity = adapter_non_linearity







class BottleneckAdapterLayer(torch.nn.Module):
    """Bottleneck adapter: down projection, non-linearity, up projection and an optional skip connection
    """
    def __init__(self, config, feature_size=None):
        """Create both projections and initialize them

        Args:
            config: Config providing hidden_size, adapter_latent_size, adapter_residual and adapter_non_linearity
            feature_size (optional): Width of the adapted tensor, falls back to config.hidden_size when not set
        """
        super().__init__()
        width = feature_size if feature_size else config.hidden_size
        latent = config.adapter_latent_size

        self.adapter_input_size = width
        self.adapter_latent_size = latent
        self.residual = config.adapter_residual

        self.down_proj = torch.nn.Linear(width, latent) # width -> latent
        self.non_linearity = get_nonlin_func(config.adapter_non_linearity) # activation between the projections
        self.up_proj = torch.nn.Linear(latent, width) # latent -> width

        self.init_weights()

    def init_weights(self):
        """Draw the weights from a normal distribution (mean 0.0, std 0.02) and zero the biases
        """
        for projection in (self.down_proj, self.up_proj):
            torch.nn.init.normal_(projection.weight, mean=0.0, std=0.02)
            torch.nn.init.zeros_(projection.bias)

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        """Run the input through the bottleneck

        Args:
            x (torch.Tensor): input Tensor

        Returns:
            torch.Tensor: bottleneck output, with the input added when the residual option is set
        """
        squeezed = self.down_proj(x)
        activated = self.non_linearity(squeezed)
        expanded = self.up_proj(activated)
        if self.residual:
            return x + expanded
        return expanded
    
    
    
    
    
    
    
    

class AdapterBertOutput(BertOutput):
    """BertOutput whose projected hidden states can pass through a BottleneckAdapterLayer
    before the residual connection and the LayerNorm

    """
    def __init__(self, config, layer_index):
        """Create the output adapter when this layer is selected and output adapters are enabled
        """
        super().__init__(config)
        self.add_adapter = layer_index in config.layers_to_adapt  and config.add_output_adapter
        if self.add_adapter:
            self.output_adapter = BottleneckAdapterLayer(config)

    def forward(self, hidden_states, input_tensor):
        """dense -> dropout -> (adapter) -> LayerNorm(hidden_states + input_tensor)
        """
        projected = self.dense(hidden_states)
        projected = self.dropout(projected)
        if self.add_adapter:
            projected = self.output_adapter(projected) # adapter extension
        return self.LayerNorm(projected + input_tensor)

class AdapterBertIntermediate(BertIntermediate):
    """Overrides BertIntermediate with the Adapter Version
    and adds a BottleneckAdapterLayer in front of the dense projection

    """
    def __init__(self, config, layer_index):
        """Initializing the new BottleneckAdapterLayer

        Args:
            config: Config with layers_to_adapt and add_intermediate_adapter
            layer_index: Index of the encoder layer this module belongs to
        """
        super().__init__(config)
        self.add_adapter = layer_index in config.layers_to_adapt and config.add_intermediate_adapter
        # forward() applies the adapter BEFORE self.dense, so it has to match the width of the
        # dense input (in_features); sizing it with out_features fails with a shape mismatch
        if self.add_adapter: self.intermediate_adapter = BottleneckAdapterLayer(config, feature_size=self.dense.in_features)

    def forward(self, hidden_states):
        """(adapter) -> dense -> activation function
        """
        if self.add_adapter: hidden_states = self.intermediate_adapter(hidden_states) # adapter extension
        return self.intermediate_act_fn(self.dense(hidden_states))

class AdapterBertSelfOutput(BertSelfOutput):
    """BertSelfOutput whose projected attention output can pass through a BottleneckAdapterLayer
    before the residual connection and the LayerNorm

    """
    def __init__(self, config, layer_index):
        """Create the attention adapter when this layer is selected and attention adapters are enabled
        """
        super().__init__(config)
        self.add_adapter = layer_index in config.layers_to_adapt  and config.add_attention_adapter
        if self.add_adapter:
            self.attention_adapter = BottleneckAdapterLayer(config)

    def forward(self, hidden_states, input_tensor):
        """dense -> dropout -> (adapter) -> LayerNorm(hidden_states + input_tensor)
        """
        projected = self.dense(hidden_states)
        projected = self.dropout(projected)
        if self.add_adapter:
            projected = self.attention_adapter(projected) # adapter extension
        return self.LayerNorm(projected + input_tensor)








class AdapterBertAttention(BertAttention):
    """BertAttention that uses AdapterBertSelfOutput as its output module
    """
    def __init__(self, config, layer_index):
        """Build the regular attention block, then replace its output module
        """
        super().__init__(config)
        adapter_output = AdapterBertSelfOutput(config, layer_index)
        self.output = adapter_output

class AdapterBertLayer(BertLayer):
    """BertLayer whose attention, intermediate and output modules are the Adapter Versions
    """
    def __init__(self, config, layer_index):
        """Build the regular layer, then replace its three sub modules (same order as before)
        """
        super().__init__(config)
        replacements = (("attention", AdapterBertAttention),
                        ("intermediate", AdapterBertIntermediate),
                        ("output", AdapterBertOutput))
        for attr_name, module_cls in replacements:
            setattr(self, attr_name, module_cls(config, layer_index))

class AdapterBertEncoder(BertEncoder):
    """BertEncoder built from AdapterBertLayer modules
    """
    def __init__(self, config):
        """Build the regular encoder, then replace its layer stack
        """
        super().__init__(config)
        adapter_layers = []
        for layer_index in range(config.num_hidden_layers):
            adapter_layers.append(AdapterBertLayer(config, layer_index))
        self.layer = torch.nn.ModuleList(adapter_layers)








class AdapterBertModel(BertModel):
    """BertModel with an AdapterBertEncoder in which only the adapter parameters require gradients
    """
    def __init__(self, config):
        """Replace the encoder, disable the gradient of all parameters and re-enable it for the adapters
        """
        super().__init__(config)
        self.encoder = AdapterBertEncoder(config)
        self.freeze_unfreeze_params_all(False)
        self.freeze_unfreeze_params_adapter(config, True)

    def freeze_unfreeze_params_adapter(self, config, requires_grad:bool):
        """Set requires_grad on the parameters of every enabled adapter

        Args:
            config: Bert Config with layers_to_adapt and the add_*_adapter switches
            requires_grad (bool): enable disable Gradient
        """
        for layer_index in range(config.num_hidden_layers):
            if layer_index not in config.layers_to_adapt:
                continue
            bert_layer = self.encoder.layer[layer_index]
            if config.add_attention_adapter:
                self.freeze_unfreeze_params(bert_layer.attention.output.attention_adapter.parameters(), requires_grad)
            if config.add_intermediate_adapter:
                self.freeze_unfreeze_params(bert_layer.intermediate.intermediate_adapter.parameters(), requires_grad)
            if config.add_output_adapter:
                self.freeze_unfreeze_params(bert_layer.output.output_adapter.parameters(), requires_grad)

    def freeze_unfreeze_params(self, itm, requires_grad:bool):
        """Set requires_grad on the given parameters

        Args:
            itm: iterable of parameters
            requires_grad (bool): enable disable Gradient
        """
        for parameter in itm:
            parameter.requires_grad = requires_grad

    def freeze_unfreeze_params_all(self, requires_grad:bool):
        """Set requires_grad on every parameter of this model

        Args:
            requires_grad (bool): enable disable Gradient
        """
        self.freeze_unfreeze_params(self.parameters(), requires_grad)

class AdapterBertForSequenceClassification(BertForSequenceClassification):
    """BertForSequenceClassification whose bert model is replaced by AdapterBertModel
    """
    def __init__(self, config, **kwargs):
        """Build the regular classification model, then swap in the adapter model

        Args:
            config: Config handed to the parent class and to AdapterBertModel
            **kwargs: accepted but not used
        """
        super().__init__(config)
        self.config = config
        adapter_bert = AdapterBertModel(config)
        self.bert = adapter_bert
        #self.bert.freeze_unfreeze_params_all(True)











class ClassificationModel_V9(torch.nn.Module):
    """Adapter Model of AutoModelForSequenceClassification with CrossEntropyLoss as loss function
    Override Classes

    model.forward():
    | tokenize Sentence
    | Prediction
    | Calculate Loss
    | Return Loss and Prediction

    """
    def __init__(self,model_name:str="distilbert-base-uncased", num_labels:int=2,len_embeddings:int=512):
        """Load the adapter config, the adapter model and the tokenizer.

        Args:
            model_name (str, optional): Name of the Huggingface checkpoint. Defaults to "distilbert-base-uncased".
                NOTE(review): the model classes used here are BERT classes - confirm the default checkpoint fits.
            num_labels (int, optional): Number of Classes for Classification. Defaults to 2.
            len_embeddings (int, optional): Length the tokenizer pads/truncates to. Defaults to 512.
        """
        super(ClassificationModel_V9,self).__init__()
        self.len_embeddings = len_embeddings
        # num_labels has to be set on the config: when a config object is handed to the model's
        # from_pretrained, the remaining kwargs only reach the model __init__ (which ignores them)
        config = BottleneckAdapterBertConfig.from_pretrained(model_name,
                                                             num_labels=num_labels,
                                                             adapter_non_linearity="gelu",
                                                             add_attention_adapter=True,
                                                             add_intermediate_adapter=False,
                                                             add_output_adapter=True)
        # NOTE(review): for the same reason max_position_embeddings is not applied to the config here;
        # left untouched on purpose, confirm whether resizing the position embeddings is wanted
        self.model = AdapterBertForSequenceClassification.from_pretrained(model_name,num_labels=num_labels,max_position_embeddings=self.len_embeddings,ignore_mismatched_sizes=True,config=config)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name,do_lower_case=True,sep_token='[SEP]')
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self,sentence,label,device):
        """Tokenize the sentences, run the model and compute the cross entropy loss.

        Args:
            sentence: Text input handed to the tokenizer.
            label: Targets handed to the loss function.
            device: Device the tokenized inputs are moved to.

        Returns:
            tuple: Loss with an added trailing dimension and the argmax class index per sample.
        """
        token = self.tokenizer(sentence,add_special_tokens=True,padding='max_length',truncation=True,max_length=self.len_embeddings,return_tensors="pt").to(device)
        # the labels are not passed to the model: its internal loss was never used,
        # the loss is calculated once with self.loss_fn
        pred = self.model(**token)
        loss = self.loss_fn(pred.logits, label)
        return torch.unsqueeze(loss, -1), torch.argmax(pred.logits, dim=1)

### Lightning Module

In [None]:
class ClassificationModule(pl.LightningModule):
    """LightningModule that wraps one ClassificationModel variant and logs metrics and a confusion matrix per epoch
    """
    def __init__(self, lr=0.001, model_name="bert-base-uncased", num_labels=2, len_embeddings=512, seed_val=42, model_version=0, num_warmup_steps=20, **kwargs):
        """Store the hyperparameters, set the seeds and create the selected model

        Args:
            lr (float, optional): Learning Rate for the Optimizer. Defaults to 0.001.
            model_name (str, optional): Name of Huggingface Model. Defaults to "bert-base-uncased".
            num_labels (int, optional): Number of Classes for Classification. Defaults to 2.
            len_embeddings (int, optional): Length of the Embedding for tokenizer. Defaults to 512.
            seed_val (int, optional): Seed value. Defaults to 42.
            model_version (int, optional): Versions of the defined Classification Models (1-13). Defaults to 0.
            num_warmup_steps (int, optional): Warmup Steps (deactivated). Defaults to 20.

        Raises:
            ValueError: If model_version does not select one of the defined models.
        """
        super().__init__()
        self.save_hyperparameters()
        print(self.hparams)

        #Setting the Seed
        #random.seed(self.hparams.seed_val) <---- DO NOT TOUCH else PERMUTATION on DATASET does not work
        np.random.seed(self.hparams.seed_val)
        torch.manual_seed(self.hparams.seed_val)
        torch.cuda.manual_seed_all(self.hparams.seed_val)

        #Select Model
        assert model_version, "No model Version Selected!"
        common = dict(model_name=self.hparams.model_name, num_labels=self.hparams.num_labels, len_embeddings=self.hparams.len_embeddings)
        if   model_version == 1:  self.model = ClassificationModel_V1(**common)
        elif model_version == 2:  self.model = ClassificationModel_V2(**common)
        elif model_version == 3:  self.model = ClassificationModel_V3(**common)
        elif model_version == 4:  self.model = ClassificationModel_V4(**common)
        elif model_version == 5:  self.model = ClassificationModel_V5(**common)
        elif model_version in (6, 7, 8):       self.model = ClassificationModel_V6(**common, v=model_version - 5)  # v = 1..3
        elif model_version in (9, 10, 11, 12): self.model = ClassificationModel_V8(**common, v=model_version - 8)  # v = 1..4
        elif model_version == 13: self.model = ClassificationModel_V9(**common)
        else:
            # without this an unknown version left self.model undefined and failed in the first step
            raise ValueError(f"Unknown model_version {model_version!r}, expected a value from 1 to 13")

        #Initialize Save Registers for values
        self.train_step_outputs = {"loss": [], "pred": [], "target": []}
        self.val_step_outputs   = {"loss": [], "pred": [], "target": []}
        self.test_step_outputs  = {"loss": [], "pred": [], "target": []}

    def shared_step(self, step_outputs:dict, batch, stage:str) -> torch.Tensor:
        """Run one batch through the model and store loss, prediction and target

        Args:
            step_outputs (dict): Save Registry
            batch (dict): Either a batch with the keys "image" and "label" or a dict of such
                batches whose keys are compared with the current epoch
            stage (str): train, test, validation

        Returns:
            torch.Tensor: Loss of the batch
        """
        # Training with several DS depending on epoch or normal:
        # without an "image" key, take the last entry whose key is <= the current epoch
        if "image" not in batch.keys():
            batch = [batch[key] for key in batch.keys() if key <= self.current_epoch][-1]
        loss, pred = self.model(sentence=batch["image"], label=batch["label"], device=self.device) #Run the Prediction from a batch

        #Save Results (loss detached, the stored copy must not hold on to the autograd graph until epoch end)
        step_outputs["loss"].append(loss.detach())
        step_outputs["pred"].append(pred)
        step_outputs["target"].append(batch["label"])
        return loss

    def shared_epoch_end(self, step_outputs:dict, stage:str):
        """Metric Calculations

        Args:
            step_outputs (dict): Registry of Calculated Entries of each Batch
            stage (str): train, test, validation Stage
        """
        #Concatenating all batch results and clear Registers
        pred   = torch.cat(step_outputs["pred"]).cpu()
        step_outputs["pred"].clear()
        target = torch.cat(step_outputs["target"]).cpu()
        step_outputs["target"].clear()
        loss = torch.cat(step_outputs["loss"]).cpu()
        step_outputs["loss"].clear()


        #Metric Calculations
        metrics = {stage+'_loss':      loss.mean().item(),
                   stage+'_accurancy': accuracy_score(target, pred),
                   stage+'_precision': precision_score(target, pred),
                   stage+'_recall':    recall_score(target, pred),
                   stage+'_f1':        f1_score(target, pred)}
        self.log_dict(metrics, prog_bar=True)


        #Confusion Matrix Calculations
        #https://torchmetrics.readthedocs.io/en/stable/classification/confusion_matrix.html
        #https://matplotlib.org/stable/users/explain/colors/colormaps.html
        confusion = torchmetrics.ConfusionMatrix(num_classes=self.hparams.num_labels, task="multiclass")
        confusion(pred, target)
        fig_, ax_ = plot_confusion_matrix(conf_mat=confusion.compute().detach().cpu().numpy().astype(int),show_absolute=True,show_normed=True,colorbar=True,cmap='Blues')
        self.logger.experiment.add_figure(stage+"_confusion_matrix", fig_, self.current_epoch)


    def training_step(self, batch, batch_idx):
        return self.shared_step(self.train_step_outputs, batch, "train")

    def on_train_epoch_end(self):
        return self.shared_epoch_end(self.train_step_outputs, "train")

    def validation_step(self, batch, batch_idx):
        return self.shared_step(self.val_step_outputs, batch, "valid")

    def on_validation_epoch_end(self):
        return self.shared_epoch_end(self.val_step_outputs, "valid")

    def test_step(self, batch, batch_idx):
        return self.shared_step(self.test_step_outputs, batch, "test")

    def on_test_epoch_end(self):
        return self.shared_epoch_end(self.test_step_outputs, "test")

    def configure_optimizers(self):
        """Configures the Optimizer (the warmup scheduler is deactivated)
        #https://huggingface.co/docs/transformers/en/main_classes/optimizer_schedules

        Returns:
            torch.optim.Adam: Adam over all parameters with the learning rate from the hyperparameters
        """
        #optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        #scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=self.trainer.max_epochs)
        #return [optimizer], [scheduler]
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

# RUN

## Start Tensorboard

In [None]:
%load_ext tensorboard
#%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/lightning_logs

## Training the Model

In [None]:
# DataModule used for the training run below; exactly one assignment is active,
# the commented lines are alternative strategy / ds_names combinations
#data = DataModule(batch_size=8, num_workers=0, permutate=False, strategy="")
data = DataModule(batch_size=16, num_workers=os.cpu_count(), permutate=False, strategy="")

# one additional dataset per strategy
#data = DataModule(batch_size=2, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["snli"])
#data = DataModule(batch_size=2, num_workers=os.cpu_count(), permutate=False, strategy="CombinedLoader", ds_names=["healthver"])
#data = DataModule(batch_size=2, num_workers=os.cpu_count(), permutate=False, strategy="Pre-Post:50", ds_names=["scifact"])

# all three additional datasets per strategy
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["snli", "scifact", "healthver"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedLoader", ds_names=["snli", "scifact", "healthver"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="Pre-Post:25:50:100", ds_names=["snli", "scifact", "healthver"])

In [None]:
# Lightning module and TensorBoard logger for the run; the version string repeats the
# hyperparameters by hand, keep both in sync when changing them
#model = ClassificationModule(lr=5.000e-5, model_name="princeton-nlp/sup-simcse-bert-base-uncased", len_embeddings=512, seed_val=0, model_version=114)
model = ClassificationModule(lr=5.000e-6, model_name="bert-base-uncased", len_embeddings=512, seed_val=0, model_version=6)
# NOTE(review): this rebinds the module-level `logger` created with logging.getLogger(__name__)
# in the first cells - confirm nothing still expects the logging.Logger under this name
logger = pl.loggers.TensorBoardLogger(save_dir=LOG_FOLDER, version=f"bert-v6-512_5.000e-6_seed-0", name="lightning_logs")

In [None]:
# Train for at most 50 epochs with the logger from above; checkpointing is switched off
trainer = pl.Trainer(accelerator="auto",max_epochs=50,logger=logger, enable_checkpointing=False)
trainer.fit(model, datamodule=data)

In [None]:
#print(model)
#trainer.save_checkpoint("example.ckpt")
#new_model = ClassificationModule.load_from_checkpoint(checkpoint_path="example.ckpt")

# Dataset Analysis

## Aggregation Test

Do all combinations of strategy and dataset work?

In [None]:
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="")
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedLoader", ds_names=["healthver"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["scifact", "healthver"])

In [None]:
#for batch in data.train_dataloader(): print(batch["label"], batch["image"])
#for batch in data.val_dataloader(): print(batch["label"], batch["image"])
#for batch in data.test_dataloader(): print(batch["label"], batch["image"])

In [None]:
# Number of training batches with an empty strategy and no ds_names
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="")
len(data.train_dataloader())

In [None]:
# Number of training batches for strategy "CombinedDataset" with ds_names ["healthver"]
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["healthver"])
len(data.train_dataloader())

In [None]:
# Number of training batches for strategy "CombinedDataset" with ds_names ["scifact"]
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["scifact"])
len(data.train_dataloader())

In [None]:
# Number of training batches for strategy "CombinedDataset" with ds_names ["snli"]
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["snli"])
len(data.train_dataloader())

In [None]:
# Number of training batches for strategy "CombinedLoader" with ds_names ["healthver"]
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedLoader", ds_names=["healthver"])
len(data.train_dataloader())

In [None]:
# Number of training batches for strategy "CombinedLoader" with ds_names ["scifact"]
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedLoader", ds_names=["scifact"])
len(data.train_dataloader())

In [None]:
# Number of training batches for strategy "CombinedLoader" with ds_names ["snli"]
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedLoader", ds_names=["snli"])
len(data.train_dataloader())

## SemEval Dataset

Distribution Analysis of SemEval

In [None]:
# DataModule with an empty strategy and no ds_names, used by the distribution analysis below
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="")

In [None]:
def _labels_and_sections(loader):
    """Collect the label and the text after the first "[SEP]" of every sample in ONE pass.

    Reading both values from the same batch keeps them aligned; two separate passes over a
    loader that shuffles would pair labels with the sections of other samples.
    """
    labels, sections = [], []
    for batch in loader:
        labels.extend(batch["label"].numpy())
        sections.extend(text.split("[SEP]")[1] for text in batch["image"])
    return np.array(labels), np.array(sections)


label, section = _labels_and_sections(data.train_dataloader())
print("label distribution in Train Dataloader:", Counter(label))
print("section distribution in Train Dataloader:", Counter(section))
print("section distribution in Train Dataloader for Label 0:", Counter(section[label == 0]))
print("section distribution in Train Dataloader for Label 1:", Counter(section[label == 1]))

print()

label, section = _labels_and_sections(data.val_dataloader())
print("label distribution in Valid Dataloader:", Counter(label))
print("section distribution in Val Dataloader:", Counter(section))
print("section distribution in Val Dataloader for Label 0:", Counter(section[label == 0]))
print("section distribution in Val Dataloader for Label 1:", Counter(section[label == 1]))

## Histogram

Generates a histogram of the distribution of the sentence length (in words) and the tokenized length

In [None]:
# Tokenizer and DataModule for the histogram cell below; exactly one `data` assignment is
# active, the commented lines select other strategy / ds_names combinations
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="")
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["healthver"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["scifact"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["snli"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["scifact", "healthver"])
#data = DataModule(batch_size=8, num_workers=os.cpu_count(), permutate=False, strategy="CombinedDataset", ds_names=["scifact", "healthver", "snli"])

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


def _text_lengths(loader):
    """Return (characters, words, tokens) per text as three arrays, reading the loader only once."""
    n_chars, n_words, n_tokens = [], [], []
    for batch in loader:
        for text in batch["image"]:
            n_chars.append(len(text))
            n_words.append(len(text.split()))
            n_tokens.append(len(tokenizer.tokenize(text,add_special_tokens=True)))
    return np.array(n_chars), np.array(n_words), np.array(n_tokens)


def _print_length_stats(title, values):
    """Print min, max, mean and median of values behind the given title."""
    print(title, f"min={np.min(values).round(2)}",
                 f"max={np.max(values).round(2)}",
                 f"mean={np.mean(values).round(2)}",
                 f"median={np.median(values).round(2)}")


def _plot_length_hist(ax, num_words, tok_words, bins, title):
    """Draw the word and token histograms of one split on the given axis."""
    ax.xaxis.set_major_locator(ticker.MaxNLocator(15))
    ax.hist(num_words, bins=bins, edgecolor='black', alpha=0.75, label=f'Words (mean={int(np.mean(num_words))})')
    ax.hist(tok_words, bins=bins, edgecolor='black', alpha=0.5, label=f'Token (mean={int(np.mean(tok_words))})')
    ax.set_title(title)
    ax.set_xlabel("Occurancies in Sentence")
    ax.set_ylabel("Occurancies in Dataset")
    #ax.axvline(x=512, color='red', lw=1, ls='--', label='512 Token Mark')
    ax.legend()


# Train split
train_len_str, train_num_words, train_tok_words = _text_lengths(data.train_dataloader())

print(len(train_len_str), len(train_num_words), len(train_tok_words))
print(np.sum(train_tok_words <= 512), np.sum(train_tok_words > 512))

_print_length_stats("Lengt of String in Train Dataset:", train_len_str)
_print_length_stats("Number of Words in Train Dataset:", train_num_words)
_print_length_stats("Tokenized Words in Train Dataset:", train_tok_words)


# Validation split
valid_len_str, valid_num_words, valid_tok_words = _text_lengths(data.val_dataloader())

print(len(valid_len_str), len(valid_num_words), len(valid_tok_words))
print(np.sum(valid_tok_words <= 512), np.sum(valid_tok_words > 512))

_print_length_stats("Lengt of String in Val Dataset:", valid_len_str)
_print_length_stats("Number of Words in Val Dataset:", valid_num_words)
_print_length_stats("Tokenized Words in Val Dataset:", valid_tok_words)


# Both plots share the bin edges computed over all four arrays
fig, axs = plt.subplots(1, 2, figsize=(15, 5), sharex=True)
num_bins = 75
bins=np.histogram(np.hstack((train_num_words,train_tok_words, valid_num_words,valid_tok_words)), bins=num_bins)[1] #get the bin edges

_plot_length_hist(axs[0], train_num_words, train_tok_words, bins, f"Training Histogram")
_plot_length_hist(axs[1], valid_num_words, valid_tok_words, bins, "Valid Histogram")

name = LOG_FOLDER + "/histogram.svg"
plt.savefig(name, format="svg")

## Token

Analysis of how the tokens are built for a sentence

In [None]:
"""
for i, batch in enumerate(data.val_dataloader()):
  for j, itm in enumerate(batch["image"]):
    tok = tokenizer.tokenize(itm,add_special_tokens=True)
    print(i, j, len(tok), tok)
  print()
"""