# Pip Wheels

In [1]:
'''
!pip install pytorch_lightning
!pip install torchmetrics
!pip install tokenizers
!pip install transformers
!pip install ray[tune]
'''

'\n!pip install pytorch_lightning\n!pip install torchmetrics\n!pip install tokenizers\n!pip install transformers\n!pip install ray[tune]\n'

# Imports

In [2]:
# General Libraries
import os
import re
import random
import numpy as np
import pandas as pd
import scipy as sp
from tqdm.auto import tqdm
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Type
from copy import deepcopy

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, seed_everything, Trainer, LightningModule
from torchmetrics import Accuracy
from torchmetrics.functional import f1_score, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar 
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.loops.loop import Loop
from pytorch_lightning.loops.fit_loop import FitLoop
from pytorch_lightning.trainer.states import TrainerFn

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# Ray[Tune]
import ray
from ray import air
from ray import tune
from ray.air import session
from ray.tune.integration.pytorch_lightning import TuneReportCallback


# HuggingFace Libraries
import tokenizers
import transformers 
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, AdamW, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


# Configuration

## Configuration Class: notebook-specific settings

In [3]:
class CFG:
    # General
    seed = 42
    
    # Debug 
    debug = False
    debug_samples = 100

## Configuration Dictionary: trial-specific settings

In [4]:
# Defining a search space!
config_dict = {
    "target_size" : 1,
    "num_workers" : 16,
    
    # Training parameters
    "batch_size" : 64,
    "epochs" : 2,
    "n_fold" : 2,
    "warmup_steps" : 0,
    "min_lr" : 1e-6,
    "encoder_lr" : 2e-5,
    "decoder_lr" : 2e-5,
    "eps" : 1e-6,
    "betas" : (0.9, 0.999),
    "weight_decay" : 0.01,
    "fc_dropout" : 0.2,

    # Transformers
    # "model" : tune.choice(["microsoft/deberta-v3-large"]),
    # "model" : tune.choice(["distilbert-base-uncased"]),
    "model" : tune.grid_search(["AI-Growth-Lab/PatentSBERTa","distilbert-base-uncased","ahotrod/electra_large_discriminator_squad2_512","Yanhao/simcse-bert-for-patent"])
}

## Directories

In [5]:
INPUT_DIR = '../dataset/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

## Logger

In [6]:
logger = TensorBoardLogger("lightning_logs", name="USPPPM")

## Random seed

In [7]:
pl.seed_everything(CFG.seed)

Global seed set to 42


42

## Scoring Function

In [8]:
def get_score(y_true, y_pred):
    score = sp.stats.pearsonr(y_true, y_pred)[0]
    return score

# Data Loading

In [9]:
cpc_texts = torch.load('cpc_texts.pth')
dataframe = pd.read_csv("dataframe.csv")
display(dataframe.head())

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
0,0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]abatement of pollution[SEP]HUMAN...,2
1,1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]act of abating[SEP]HUMAN NECESSI...,3
2,2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]active catalyst[SEP]HUMAN NECESS...,1
3,3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]eliminating process[SEP]HUMAN NE...,2
4,4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]forest region[SEP]HUMAN NECESSIT...,0


## Debug Slicing

In [10]:
if CFG.debug:
    dataframe = dataframe.iloc[:CFG.debug_samples,:]

## Train-Test Split

In [11]:
# train_df, test_df = train_test_split(dataframe, test_size = 0.1, random_state = CFG.seed, stratify = dataframe.score_map)
train_df, test_df = train_test_split(dataframe, test_size = 0.1, random_state = CFG.seed)
display(train_df.head())
display(test_df.head())

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
9900,9900,0dbb44b9a145edec,distributor pipe,pipe,B01,0.5,PERFORMING OPERATIONS; TRANSPORTING. PHYSICAL ...,distributor pipe[SEP]pipe[SEP]PERFORMING OPERA...,2
1303,1303,74afca34a5439c23,ammonia recovery,recovery of water,C01,0.25,HEMISTRY; METALLURGY. INORGANIC CHEMISTRY,ammonia recovery[SEP]recovery of water[SEP]HEM...,1
16591,16591,6371befc3ee1b0f2,inner closed,cylindrical inner member,E04,0.5,FIXED CONSTRUCTIONS. BUILDING,inner closed[SEP]cylindrical inner member[SEP]...,2
25822,25822,20489196c73bd86b,produce thin layers,produce layers,G01,0.5,PHYSICS. MEASURING; TESTING,produce thin layers[SEP]produce layers[SEP]PHY...,2
23640,23640,9af994b21c892022,parallel orientation,zero angle,G06,0.25,PHYSICS. COMPUTING; CALCULATING; COUNTING,parallel orientation[SEP]zero angle[SEP]PHYSIC...,1


Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
33511,33511,ed1c4e525eb105fe,transmit alarm,display indicator,G08,0.0,PHYSICS. SIGNALLING,transmit alarm[SEP]display indicator[SEP]PHYSI...,0
18670,18670,5386316f318f5221,locking formation,retaining element,B60,0.25,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,locking formation[SEP]retaining element[SEP]PE...,1
18049,18049,1544ca6753fcbddd,lateral power,transducer,H01,0.25,ELECTRICITY. BASIC ELECTRIC ELEMENTS,lateral power[SEP]transducer[SEP]ELECTRICITY. ...,1
31660,31660,f9d8979b94cec923,spreader body,spreader,A01,0.75,HUMAN NECESSITIES. GRICULTURE; FORESTRY; ANIMA...,spreader body[SEP]spreader[SEP]HUMAN NECESSITI...,3
15573,15573,e151ca5ea5cc0f08,high gradient magnetic separators,magnetic filtration,B03,0.5,PERFORMING OPERATIONS; TRANSPORTING. SEPARATIO...,high gradient magnetic separators[SEP]magnetic...,2


# Dataset Preparation

## Tokenizer

In [12]:
def set_tokenizer(config_dict):
    tokenizer = AutoTokenizer.from_pretrained(config_dict['model'])
    tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
    config_dict['tokenizer'] = tokenizer

## Maximum length

In [13]:
def set_max_len(config_dict, cpc_texts=cpc_texts, train_df=dataframe):
    tokenizer = config_dict['tokenizer']
    lengths_dict = {}

    lengths = []
    tk0 = tqdm(cpc_texts.values(), total=len(cpc_texts))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)
    lengths_dict['context_text'] = lengths

    for text_col in ['anchor', 'target']:
        lengths = []
        tk0 = tqdm(train_df[text_col].fillna("").values, total=len(train_df))
        for text in tk0:
            length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
            lengths.append(length)
        lengths_dict[text_col] = lengths

    config_dict['max_len'] = max(lengths_dict['anchor']) + max(lengths_dict['target'])\
                    + max(lengths_dict['context_text']) + 4 # CLS + SEP + SEP + SEP

## Dataset

In [14]:
def prepare_input(config_dict, text):
    tokenizer = config_dict['tokenizer']
    inputs = tokenizer(text,
                       add_special_tokens = True,
                       max_length = config_dict['max_len'],
                       padding = "max_length",
                       return_offsets_mapping = False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

In [15]:
class USPPM_dataset(Dataset):
    def __init__(self, config_dict, train_df, train=True):
        self.config_dict = config_dict
        self.texts = train_df['text'].values
        self.train = train
        if train:
            self.labels = train_df['score'].values
            self.score_map = train_df['score_map'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.config_dict, self.texts[item])
        if self.train:
            labels = torch.tensor(self.labels[item], dtype=torch.float)
            return dict(
                  inputs = inputs,
                  labels = labels
            )
        else:
            return dict(
                  inputs = inputs
            )

# K-Fold 

## KFold DataModule definition                       

In [16]:
class BaseKFoldDataModule(LightningDataModule, ABC):
    @abstractmethod
    def setup_folds(self, num_folds: int) -> None:
        pass

    @abstractmethod
    def setup_fold_index(self, fold_index: int) -> None:
        pass

## KFoldDataModule implementation

In [17]:
@dataclass
class USPPPM_kf_datamodule(BaseKFoldDataModule):
    def __init__(self, config_dict, dataframe = dataframe):
        
        self.config_dict = config_dict
        self.prepare_data_per_node = False
        self._log_hyperparams = False
        
        train_dataset: Optional[Dataset] = None
        test_dataset: Optional[Dataset] = None
        train_fold: Optional[Dataset] = None
        val_fold: Optional[Dataset] = None
        
        self.dataframe = dataframe
            
    def setup(self, stage: Optional[str] = None) -> None:
        # train_df, test_df = train_test_split(self.dataframe, test_size = 0.1, random_state = CFG.seed, stratify = self.dataframe.score_map)
        train_df, test_df = train_test_split(self.dataframe, test_size = 0.1, random_state = CFG.seed)
        self.train_dataset = USPPM_dataset(self.config_dict, train_df)
        self.test_dataset = USPPM_dataset(self.config_dict, test_df)

    def setup_folds(self, num_folds: int) -> None:
        self.num_folds = num_folds
        Fold = StratifiedKFold(n_splits=self.num_folds, shuffle=True)
        self.splits = [split for split in Fold.split(self.train_dataset, self.train_dataset.score_map)]

    def setup_fold_index(self, fold_index: int) -> None:
        train_indices, val_indices = self.splits[fold_index]
        self.train_fold = Subset(self.train_dataset, train_indices)
        self.val_fold = Subset(self.train_dataset, val_indices)
        print("TRAIN FOLD", fold_index + 1, len(self.train_fold))
        print("VALID FOLD", fold_index + 1, len(self.val_fold))

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_fold, num_workers = self.config_dict['num_workers'], batch_size = self.config_dict['batch_size'])

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.val_fold, num_workers = self.config_dict['num_workers'], batch_size = self.config_dict['batch_size'])
    
    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset, num_workers = self.config_dict['num_workers'], batch_size = self.config_dict['batch_size'])
    
    def __post_init__(cls):
        super().__init__()

## Ensemble Model for kfold

In [18]:
class EnsembleVotingModel(LightningModule):
    def __init__(self, model_cls: Type[LightningModule], checkpoint_paths: List[str], config_dict):
        super().__init__()
        # Create `num_folds` models with their associated fold weights
        self.models = torch.nn.ModuleList([model_cls.load_from_checkpoint(p, config_dict=config_dict) for p in checkpoint_paths])
        self.get_score = get_score

    def test_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
        # Compute the averaged predictions over the `num_folds` models.
        inputs = batch["inputs"]
        labels = batch["labels"]
        
        losses = []
        scores = []
        
        for m in self.models : 
            loss, outputs = m(inputs, labels.unsqueeze(1))
            predictions = outputs.squeeze().sigmoid().cpu().detach().numpy()
            
            score = self.get_score(labels.cpu().numpy(), predictions)
            scores.append(score)
            losses.append(loss.cpu().detach().numpy())
            
        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        
        self.log("ensemble_avg_loss", avg_loss, prog_bar=True, logger=True)
        print('ensemble_avg_loss', avg_loss)
        self.log("ensemble_avg_score", avg_score, prog_bar=True, logger=True)
        print("ensemble_avg_score", avg_score)

## KFoldLoop implementation

In [19]:
class KFoldLoop(Loop):
    def __init__(self, num_folds: int, config_dict, export_path: str) -> None:
        super().__init__()
        self.num_folds = num_folds
        self.current_fold: int = 0
        self.export_path = export_path
        self.config_dict = config_dict

    @property
    def done(self) -> bool:
        return self.current_fold >= self.num_folds

    def connect(self, fit_loop: FitLoop) -> None:
        self.fit_loop = fit_loop

    def reset(self) -> None:
        """Nothing to reset in this loop."""

    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
        """Used to call `setup_folds` from the `BaseKFoldDataModule` instance and store the original weights of the model."""
        assert isinstance(self.trainer.datamodule, BaseKFoldDataModule)
        self.trainer.datamodule.setup_folds(self.num_folds)
        self.lightning_module_state_dict = deepcopy(self.trainer.lightning_module.state_dict())

    def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
        """Used to call `setup_fold_index` from the `BaseKFoldDataModule` instance."""
        print(f"STARTING FOLD {self.current_fold+1}")
        assert isinstance(self.trainer.datamodule, BaseKFoldDataModule)
        self.trainer.datamodule.setup_fold_index(self.current_fold)

    def advance(self, *args: Any, **kwargs: Any) -> None:
        """Used to the run a fitting and testing on the current hold."""
        self._reset_fitting()  # requires to reset the tracking stage.
        self.fit_loop.run()

        self._reset_testing()  # requires to reset the tracking stage.
        self.trainer.test_loop.run()
        print('TEST for FOLD', self.current_fold+1)
        
        self.current_fold += 1  # increment fold tracking number.

    def on_advance_end(self) -> None:
        """Used to save the weights of the current fold and reset the LightningModule and its optimizers."""
        self.trainer.save_checkpoint(os.path.join(self.export_path, f"model.{self.current_fold}.pt"))
        # restore the original weights + optimizers and schedulers.
        self.trainer.lightning_module.load_state_dict(self.lightning_module_state_dict)
        self.trainer.strategy.setup_optimizers(self.trainer)
        self.replace(fit_loop=FitLoop)

    def on_run_end(self) -> None:
        """Used to compute the performance of the ensemble model on the test set."""
        checkpoint_paths = [os.path.join(self.export_path, f"model.{f_idx + 1}.pt") for f_idx in range(self.num_folds)]
        voting_model = EnsembleVotingModel(type(self.trainer.lightning_module), checkpoint_paths, self.config_dict)
        voting_model.trainer = self.trainer

        # This requires to connect the new model and move it the right device.
        self.trainer.strategy.connect(voting_model)
        self.trainer.strategy.model_to_device()
        self.trainer.test_loop.run()

    def on_save_checkpoint(self) -> Dict[str, int]:
        return {"current_fold": self.current_fold}

    def on_load_checkpoint(self, state_dict: Dict) -> None:
        self.current_fold = state_dict["current_fold"]

    def _reset_fitting(self) -> None:
        self.trainer.reset_train_dataloader()
        self.trainer.reset_val_dataloader()
        self.trainer.state.fn = TrainerFn.FITTING
        self.trainer.training = True

    def _reset_testing(self) -> None:
        self.trainer.reset_test_dataloader()
        self.trainer.state.fn = TrainerFn.TESTING
        self.trainer.testing = True

    def __getattr__(self, key) -> Any:
        # requires to be overridden as attributes of the wrapped loop are being accessed.
        if key not in self.__dict__:
            return getattr(self.fit_loop, key)
        return self.__dict__[key]

# Model

In [20]:
class USPPPM_model(pl.LightningModule):
    def __init__(self, config_dict=config_dict, config_path=None, pretrained=True):
        super().__init__()
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(config_dict['model'], output_hidden_states = True)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(config_dict['model'], config = self.config)
        else:
            self.model = AutoModel.from_config(self.config)
            
        self.config_dict = config_dict
        self.n_warmup_steps = config_dict['warmup_steps']
        self.n_training_steps = config_dict['training_steps']
        self.criterion = nn.BCEWithLogitsLoss(reduction="mean")
            
        self.fc_dropout = nn.Dropout(config_dict['fc_dropout'])
        self.fc = nn.Linear(self.config.hidden_size, config_dict['target_size'])
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        
        self.batch_labels = []
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs, labels=None):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output
    
    def training_step(self, batch, batch_idx):
        inputs = batch["inputs"]
        labels = batch["labels"]
        loss, outputs = self(inputs, labels.unsqueeze(1))
        self.log("train_loss", loss, prog_bar=True, logger=True)
        # session.report({"train_loss": loss})  # Send the score to Tune.
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        inputs = batch["inputs"]
        labels = batch["labels"]
        loss, outputs = self(inputs, labels.unsqueeze(1))
        self.log("val_loss", loss, prog_bar=True, logger=True)
        # session.report({"val_loss": loss})  # Send the score to Tune.
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def test_step(self, batch, batch_idx):
        inputs = batch["inputs"]
        labels = batch["labels"]
        loss, outputs = self(inputs, labels.unsqueeze(1))
        self.log("test_loss", loss, prog_bar=True, logger=True)
        # session.report({"test_loss": loss})  # Send the score to Tune.
        return {"loss": loss, "predictions": outputs, "labels": labels}
    
    def validation_epoch_end(self, batch_results):
        outputs, labels, losses = [], [], []
        for batch in batch_results:
            outputs.append(batch['predictions'])
            labels.append(batch['labels'])
            losses.append(batch['loss'])

        labels = torch.cat(labels).cpu().numpy()
        predictions = np.concatenate(torch.cat(outputs).sigmoid().to('cpu').numpy())
        score = get_score(labels, predictions)
        self.log("val_score", score, prog_bar=True, logger=True)
        # tune.report({"val_score": score})  # Send the score to Tune.

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config_dict['encoder_lr'])
        # optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
          optimizer=optimizer,
          lr_scheduler=dict(
            scheduler=scheduler,
            interval='step'
          )
        )

'''        def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_parameters = [
                {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],'lr': encoder_lr, 'weight_decay': weight_decay},
                {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],'lr': encoder_lr, 'weight_decay': 0.0},
                {'params': [p for n, p in model.named_parameters() if "model" not in n],'lr': decoder_lr, 'weight_decay': 0.0}
            ]
            return optimizer_parameters

        optimizer_parameters = get_optimizer_params(self,
                                            encoder_lr=self.config_dict["encoder_lr"], 
                                            decoder_lr=self.config_dict["decoder_lr"],
                                            weight_decay=self.config_dict["weight_decay"])

        optimizer = AdamW(optimizer_parameters, 
                          lr=self.config_dict["encoder_lr"], 
                          eps=self.config_dict["eps"],
                          betas=self.config_dict["betas"])
        
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )'''



'        def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):\n            param_optimizer = list(model.named_parameters())\n            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]\n            optimizer_parameters = [\n                {\'params\': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],\'lr\': encoder_lr, \'weight_decay\': weight_decay},\n                {\'params\': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],\'lr\': encoder_lr, \'weight_decay\': 0.0},\n                {\'params\': [p for n, p in model.named_parameters() if "model" not in n],\'lr\': decoder_lr, \'weight_decay\': 0.0}\n            ]\n            return optimizer_parameters\n\n        optimizer_parameters = get_optimizer_params(self,\n                                            encoder_lr=self.config_dict["encoder_lr"], \n                                            decoder_lr=self.config_dict["decode

# Training

## Callbacks

In [21]:
ray.init(num_gpus=4)

2022-11-12 17:22:17,121	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.8.10
Ray version:,2.0.1


In [22]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best_checkpoint",
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

metrics = {"val_score": "val_score", "train_loss" : "train_loss", "val_loss" : "val_loss"}

In [23]:
def trainable(config_dict):  # Pass a "config" dictionary into your trainable.

    steps_per_epoch = len(train_df) // config_dict['batch_size']
    config_dict['training_steps'] = steps_per_epoch * config_dict['epochs']
    
    set_tokenizer(config_dict)
    set_max_len(config_dict)
    # train_dataset = USPPM_dataset(config_dict)
    datamodule = USPPPM_kf_datamodule(config_dict, dataframe)
    
    model = USPPPM_model(config_dict)
    
    callbacks = [TuneReportCallback(metrics, on="validation_end"), checkpoint_callback, early_stopping_callback, TQDMProgressBar(refresh_rate=2)]
    trainer = pl.Trainer(
            logger=logger,
            num_sanity_val_steps=0,
            check_val_every_n_epoch=1,
            callbacks=callbacks,
            max_epochs=config_dict['epochs'],
            #devices=[1],
            accelerator="gpu",
        
            )
    
    internal_fit_loop = trainer.fit_loop
    trainer.fit_loop = KFoldLoop(config_dict['n_fold'], config_dict, export_path="./")
    trainer.fit_loop.connect(internal_fit_loop)
    
    trainer.fit(model, datamodule)

In [24]:
resource_group = tune.PlacementGroupFactory([{"CPU": 1, "GPU": 1}])

tuner = tune.Tuner(tune.with_resources(trainable, 
                                       {"cpu":0.25,"gpu":1}),
                                       param_space = config_dict,
                                       tune_config = tune.TuneConfig(metric="val_score", mode="max",max_concurrent_trials=4),
                                       # tune_config = tune.TuneConfig(metric="val_score", mode="max"),
                                       run_config = air.RunConfig(name="tune_uspppm", verbose=2, progress_reporter=tune.JupyterNotebookReporter(overwrite=True))
                                       )
                  
                  

Trial name,status,loc,model,iter,total time (s),val_score,train_loss,val_loss
trainable_2e6d4_00000,RUNNING,131.114.50.210:2737115,AI-Growth-Lab/P_0c60,2.0,512.377,0.789584,0.525246,0.555948
trainable_2e6d4_00001,RUNNING,131.114.50.210:2737159,distilbert-base_0f30,3.0,418.287,0.287087,0.631234,0.645798
trainable_2e6d4_00002,ERROR,131.114.50.210:2737326,ahotrod/electra_9810,,,,,
trainable_2e6d4_00003,ERROR,131.114.50.210:2738869,Yanhao/simcse-b_0f80,,,,,

Trial name,# failures,error file
trainable_2e6d4_00002,1,/storagenfs/m.petix/ray_results/tune_uspppm/trainable_2e6d4_00002_2_model=ahotrod_electra_large_discriminator_squad2_512_2022-11-12_17-22-35/error.txt
trainable_2e6d4_00003,1,/storagenfs/m.petix/ray_results/tune_uspppm/trainable_2e6d4_00003_3_model=Yanhao_simcse-bert-for-patent_2022-11-12_17-22-44/error.txt


In [None]:
results = tuner.fit()

best_result = results.get_best_result()  # Get best result object
print(best_result)

In [None]:
ray.shutdown()

In [None]:
# Get a dataframe for the last reported results of all of the trials 
df = results.get_dataframe() 

In [None]:
df.to_csv('grid_search_results.csv')

Testing DataLoader 0:  56%|█████▌    | 32/57 [00:07<00:05,  4.54it/s]
[2m[36m(trainable pid=2737159)[0m 
Validation DataLoader 0:  60%|█████▉    | 154/257 [00:17<00:11,  9.00it/s][A
Epoch 1:  80%|████████  | 412/514 [01:51<00:27,  3.70it/s, loss=0.628, v_num=0, train_loss=0.621, val_loss=0.646, val_score=0.287, test_loss=0.578]
[2m[36m(trainable pid=2737159)[0m 
Validation DataLoader 0:  61%|██████    | 156/257 [00:17<00:11,  9.01it/s][A
Epoch 1:  81%|████████  | 414/514 [01:51<00:26,  3.72it/s, loss=0.628, v_num=0, train_loss=0.621, val_loss=0.646, val_score=0.287, test_loss=0.578]
Testing DataLoader 0:  60%|█████▉    | 34/57 [00:07<00:05,  4.53it/s]
[2m[36m(trainable pid=2737159)[0m 
Validation DataLoader 0:  61%|██████▏   | 158/257 [00:17<00:10,  9.00it/s][A
Epoch 1:  81%|████████  | 416/514 [01:51<00:26,  3.73it/s, loss=0.628, v_num=0, train_loss=0.621, val_loss=0.646, val_score=0.287, test_loss=0.578]
[2m[36m(trainable pid=2737159)[0m 
Validation DataLoader 0:  62%|

In [None]:
## debug transoformers
config_dict = {
    "target_size" : 1,
    "num_workers" : 16,
    
    # Training parameters
    "batch_size" : 64,
    "epochs" : 2,
    "n_fold" : 2,
    "warmup_steps" : 0,
    "min_lr" : 1e-6,
    "encoder_lr" : 2e-5,
    "decoder_lr" : 2e-5,
    "eps" : 1e-6,
    "betas" : (0.9, 0.999),
    "weight_decay" : 0.01,
    "fc_dropout" : 0.2,

    # Transformers
    # "model" : tune.choice(["microsoft/deberta-v3-large"]),
    # "model" : tune.choice(["distilbert-base-uncased"]),
    "model" : tune.grid_search(["AI-Growth-Lab/PatentSBERTa","distilbert-base-uncased","ahotrod/electra_large_discriminator_squad2_512","Yanhao/simcse-bert-for-patent"])
}