# Pip Wheels

In [1]:
# The dependency installs below are disabled by wrapping them in a string
# literal: the cell only evaluates (and echoes) that string, nothing is installed.
# NOTE(review): no versions are pinned — confirm the versions this notebook was run with.
'''
!pip install pytorch_lightning
!pip install torchmetrics
!pip install tokenizers
!pip install transformers
!pip install ray[tune]
'''

'\n!pip install pytorch_lightning\n!pip install torchmetrics\n!pip install tokenizers\n!pip install transformers\n!pip install ray[tune]\n'

# Imports

In [2]:
# General Libraries
import os
import re
import random
import numpy as np
import pandas as pd
import scipy as sp

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar 
from pytorch_lightning.loggers import TensorBoardLogger

# Scikit-learn
from sklearn.model_selection import train_test_split

# Ray[Tune]
import ray
from ray import air
from ray import tune
from ray.air import session
from ray.tune.integration.pytorch_lightning import TuneReportCallback

import torch

%env TOKENIZERS_PARALLELISM=true

# Our code (project-local modules)
from kfold_loop import KFoldLoop
from USPPM_model import USPPPM_model
from USPPM_dataset import set_tokenizer, set_max_len
from USPPM_kfold_datamodule import USPPPM_kf_datamodule

from datetime import datetime
from pytorch_lightning.utilities.memory import garbage_collection_cuda


env: TOKENIZERS_PARALLELISM=true


In [3]:
import warnings
# Silence DeprecationWarning messages for the rest of the notebook session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Configuration

## Configuration Class: notebook-specific settings

In [4]:
class CFG:
    """Notebook-level settings that stay fixed across all tuning trials."""

    # Reproducibility: seed used for the train/test split and for seeding trials
    seed: int = 42

    # Debug switch: when True, only the first `debug_samples` rows are kept
    debug: bool = False
    debug_samples: int = 100

## Configuration Dictionary: trial-specific settings

In [5]:
# Search space handed to Ray Tune. Plain values are identical for every trial;
# each tune.grid_search(...) entry is expanded exhaustively, so the sweep covers
# every (batch_size, n_fold, model) combination.
config_dict = dict(
    target_size=1,
    num_workers=16,

    # Training parameters
    batch_size=tune.grid_search([4, 8, 16, 32, 64, 128]),
    epochs=2,
    n_fold=tune.grid_search([8, 16, 32]),
    warmup_steps=0,
    min_lr=1e-6,
    encoder_lr=2e-5,
    decoder_lr=2e-5,
    eps=1e-6,
    betas=(0.9, 0.999),
    weight_decay=0.01,
    fc_dropout=0.2,
    seed=42,

    # Transformers: pretrained model identifiers to compare
    model=tune.grid_search([
        "AI-Growth-Lab/PatentSBERTa",
        "distilbert-base-uncased",
        "ahotrod/electra_large_discriminator_squad2_512",
        "Yanhao/simcse-bert-for-patent",
        "microsoft/deberta-v3-large",
        "anferico/bert-for-patents",
    ]),
)

## Directories

In [6]:
# Input dataset directory and the directory that receives this notebook's outputs.
INPUT_DIR = '../dataset/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
# exist_ok=True makes this idempotent on re-run and removes the race between
# checking for the directory and creating it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Data Loading

In [7]:
# Load the two prepared artifacts from the current working directory.
# NOTE(review): torch.load unpickles the file, so 'cpc_texts.pth' must come from a trusted source.
cpc_texts = torch.load('cpc_texts.pth')
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which look like
# row indices written by earlier to_csv calls — confirm whether they should be dropped.
dataframe = pd.read_csv("dataframe.csv")
display(dataframe.head())

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
0,0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]abatement of pollution[SEP]HUMAN...,2
1,1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]act of abating[SEP]HUMAN NECESSI...,3
2,2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]active catalyst[SEP]HUMAN NECESS...,1
3,3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]eliminating process[SEP]HUMAN NE...,2
4,4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]forest region[SEP]HUMAN NECESSIT...,0


## Debug Slicing

In [8]:
# Debug mode: keep only the leading CFG.debug_samples rows so runs finish quickly.
if CFG.debug:
    dataframe = dataframe.head(CFG.debug_samples)

## Train-Test Split

In [9]:
# Hold out 10% of the rows with a seeded, non-stratified split.
# (A stratified variant, stratify=dataframe.score_map, is intentionally left disabled.)
# NOTE(review): test_df is not referenced again in this notebook — confirm it is used elsewhere.
train_df, test_df = train_test_split(
    dataframe,
    test_size=0.1,
    random_state=CFG.seed,
)
display(train_df.head(), test_df.head())

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
9900,9900,0dbb44b9a145edec,distributor pipe,pipe,B01,0.5,PERFORMING OPERATIONS; TRANSPORTING. PHYSICAL ...,distributor pipe[SEP]pipe[SEP]PERFORMING OPERA...,2
1303,1303,74afca34a5439c23,ammonia recovery,recovery of water,C01,0.25,HEMISTRY; METALLURGY. INORGANIC CHEMISTRY,ammonia recovery[SEP]recovery of water[SEP]HEM...,1
16591,16591,6371befc3ee1b0f2,inner closed,cylindrical inner member,E04,0.5,FIXED CONSTRUCTIONS. BUILDING,inner closed[SEP]cylindrical inner member[SEP]...,2
25822,25822,20489196c73bd86b,produce thin layers,produce layers,G01,0.5,PHYSICS. MEASURING; TESTING,produce thin layers[SEP]produce layers[SEP]PHY...,2
23640,23640,9af994b21c892022,parallel orientation,zero angle,G06,0.25,PHYSICS. COMPUTING; CALCULATING; COUNTING,parallel orientation[SEP]zero angle[SEP]PHYSIC...,1


Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
33511,33511,ed1c4e525eb105fe,transmit alarm,display indicator,G08,0.0,PHYSICS. SIGNALLING,transmit alarm[SEP]display indicator[SEP]PHYSI...,0
18670,18670,5386316f318f5221,locking formation,retaining element,B60,0.25,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,locking formation[SEP]retaining element[SEP]PE...,1
18049,18049,1544ca6753fcbddd,lateral power,transducer,H01,0.25,ELECTRICITY. BASIC ELECTRIC ELEMENTS,lateral power[SEP]transducer[SEP]ELECTRICITY. ...,1
31660,31660,f9d8979b94cec923,spreader body,spreader,A01,0.75,HUMAN NECESSITIES. GRICULTURE; FORESTRY; ANIMA...,spreader body[SEP]spreader[SEP]HUMAN NECESSITI...,3
15573,15573,e151ca5ea5cc0f08,high gradient magnetic separators,magnetic filtration,B03,0.5,PERFORMING OPERATIONS; TRANSPORTING. SEPARATIO...,high gradient magnetic separators[SEP]magnetic...,2


# Training

## Callbacks

In [10]:
# Start the local Ray runtime and declare four GPUs as schedulable resources.
# ignore_reinit_error=True lets this cell be re-run in a live kernel: without it,
# a second ray.init() raises because Ray is already initialised.
ray.init(num_gpus=4, ignore_reinit_error=True)

2022-11-14 16:48:31,824	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.8.10
Ray version:,2.0.1


In [11]:
# Metrics forwarded to Ray Tune by TuneReportCallback. Each key maps to an
# identically named value, so the mapping is built from a single tuple of names.
metrics = {name: name for name in ("val_score", "train_loss", "val_loss")}

In [12]:
def trainable(config_dict):  # Pass a "config" dictionary into your trainable.
    """Run a single Ray Tune trial: k-fold training of a USPPPM model.

    Args:
        config_dict: Trial configuration resolved by Ray Tune from the search
            space. It is updated in place with a ``training_steps`` entry and
            is then handed to ``set_tokenizer``, ``set_max_len``, the
            datamodule, the model and the k-fold loop.

    Side effects:
        Creates the trial's ensemble-checkpoint and TensorBoard log
        directories (relative to the current working directory), reports the
        entries of the module-level ``metrics`` dict to Ray Tune at the end of
        validation, and keeps the single best checkpoint ranked by ``val_loss``.
    """
    garbage_collection_cuda()  # start from a clean CUDA cache in this worker

    trial_id = ray.air.session.get_trial_id()
    logging_dir = f"USPPPM_{trial_id}"
    export_path = f'./ensemble_checkpoints/{trial_id}'

    # makedirs creates the missing parents ("ensemble_checkpoints", "lightning_logs")
    # and tolerates directories that already exist, replacing the manual
    # mkdir / except FileExistsError loop.
    for directory in (export_path, f"lightning_logs/{logging_dir}"):
        os.makedirs(directory, exist_ok=True)

    logger = TensorBoardLogger("lightning_logs", name=logging_dir)
    # Prefer the trial's own "seed" entry; fall back to the notebook-wide seed
    # when the config does not carry one.
    pl.seed_everything(config_dict.get("seed", CFG.seed))

    checkpoint_callback = ModelCheckpoint(
        dirpath=f"checkpoints/{trial_id}_checkpoints",
        filename="best_checkpoint",
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
    )
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

    # NOTE(review): the step count is derived from len(train_df) while the
    # datamodule below receives the full `dataframe` — confirm which one is intended.
    steps_per_epoch = len(train_df) // config_dict['batch_size']
    config_dict['training_steps'] = steps_per_epoch * config_dict['epochs']

    # Pre-bind the names so the cleanup in `finally` is valid even if
    # construction fails part-way through.
    model = datamodule = trainer = None
    try:
        set_tokenizer(config_dict, OUTPUT_DIR)
        set_max_len(config_dict, cpc_texts, dataframe)
        datamodule = USPPPM_kf_datamodule(config_dict, dataframe)

        model = USPPPM_model(config_dict)

        callbacks = [
            TuneReportCallback(metrics, on="validation_end"),
            checkpoint_callback,
            early_stopping_callback,
        ]
        trainer = pl.Trainer(
            logger=logger,
            num_sanity_val_steps=0,
            check_val_every_n_epoch=1,
            callbacks=callbacks,
            max_epochs=config_dict['epochs'],
            accelerator="gpu",
        )

        # Wrap the trainer's regular fit loop inside the k-fold loop.
        internal_fit_loop = trainer.fit_loop
        trainer.fit_loop = KFoldLoop(config_dict['n_fold'], config_dict, export_path=export_path)
        trainer.fit_loop.connect(internal_fit_loop)

        trainer.fit(model, datamodule)
    finally:
        # Drop the local references first, then collect: collecting while the
        # names are still bound cannot release what they hold. Running this
        # in `finally` also cleans up after a failed trial.
        model = datamodule = trainer = None
        garbage_collection_cuda()


In [13]:
# Build the tuner: every trial requests one GPU and a quarter of a CPU core,
# trials are ranked by the highest "val_score", and at most four run at once.
tuner = tune.Tuner(
    tune.with_resources(trainable, {"cpu": 0.25, "gpu": 1}),
    param_space=config_dict,
    tune_config=tune.TuneConfig(
        metric="val_score",
        mode="max",
        max_concurrent_trials=4,
    ),
    run_config=air.RunConfig(
        name="tune_uspppm",
        verbose=2,
        progress_reporter=tune.JupyterNotebookReporter(overwrite=True),
    ),
)


Trial name,status,loc,batch_size,model,n_fold
trainable_cbdd2_00000,RUNNING,131.114.50.210:4156531,4,AI-Growth-Lab/P_b3f0,8
trainable_cbdd2_00001,RUNNING,131.114.50.210:4156575,8,AI-Growth-Lab/P_b3f0,8
trainable_cbdd2_00002,RUNNING,131.114.50.210:4156753,16,AI-Growth-Lab/P_b3f0,8
trainable_cbdd2_00006,RUNNING,131.114.50.210:4157739,4,distilbert-base_b710,8
trainable_cbdd2_00003,ERROR,131.114.50.210:4156925,32,AI-Growth-Lab/P_b3f0,8
trainable_cbdd2_00004,ERROR,131.114.50.210:4157292,64,AI-Growth-Lab/P_b3f0,8
trainable_cbdd2_00005,ERROR,131.114.50.210:4157517,128,AI-Growth-Lab/P_b3f0,8

Trial name,# failures,error file
trainable_cbdd2_00003,1,"/storagenfs/m.petix/ray_results/tune_uspppm/trainable_cbdd2_00003_3_batch_size=32,model=AI-Growth-Lab_PatentSBERTa,n_fold=8_2022-11-14_16-48-57/error.txt"
trainable_cbdd2_00004,1,"/storagenfs/m.petix/ray_results/tune_uspppm/trainable_cbdd2_00004_4_batch_size=64,model=AI-Growth-Lab_PatentSBERTa,n_fold=8_2022-11-14_16-49-14/error.txt"
trainable_cbdd2_00005,1,"/storagenfs/m.petix/ray_results/tune_uspppm/trainable_cbdd2_00005_5_batch_size=128,model=AI-Growth-Lab_PatentSBERTa,n_fold=8_2022-11-14_16-49-31/error.txt"


In [None]:
# Launch the sweep defined by `tuner` and collect the per-trial results.
results = tuner.fit()

# No metric/mode is passed here, so the ranking relies on the tuner's configuration.
# NOTE(review): the captured output shows errored trials — inspect them before trusting the ranking.
best_result = results.get_best_result()  # Get best result object
print(best_result)

[2m[36m(trainable pid=4156531)[0m Global seed set to 42
100%|██████████| 136/136 [00:00<00:00, 4695.28it/s]
  0%|          | 0/36473 [00:00<?, ?it/s]
  4%|▍         | 1532/36473 [00:00<00:02, 15311.69it/s]
  9%|▊         | 3145/36473 [00:00<00:02, 15791.29it/s]
 13%|█▎        | 4744/36473 [00:00<00:01, 15880.53it/s]
 17%|█▋        | 6333/36473 [00:00<00:01, 15644.18it/s]
 26%|██▌       | 9467/36473 [00:00<00:01, 15637.35it/s]
 30%|███       | 11123/36473 [00:00<00:01, 15934.97it/s]
 35%|███▍      | 12717/36473 [00:00<00:01, 15920.98it/s]
 39%|███▉      | 14310/36473 [00:00<00:01, 15842.11it/s]
 44%|████▎     | 15909/36473 [00:01<00:01, 15884.33it/s]
 48%|████▊     | 17506/36473 [00:01<00:01, 15907.55it/s]
 52%|█████▏    | 19134/36473 [00:01<00:01, 16017.37it/s]
 57%|█████▋    | 20736/36473 [00:01<00:00, 15836.05it/s]
 61%|██████    | 22321/36473 [00:01<00:00, 15610.77it/s]
 66%|██████▌   | 23923/36473 [00:01<00:00, 15731.96it/s]
 70%|██████▉   | 25498/36473 [00:01<00:00, 15545.68it/

The trial trainable_cbdd2_00003 errored with parameters={'target_size': 1, 'num_workers': 16, 'batch_size': 32, 'epochs': 2, 'n_fold': 8, 'warmup_steps': 0, 'min_lr': 1e-06, 'encoder_lr': 2e-05, 'decoder_lr': 2e-05, 'eps': 1e-06, 'betas': (0.9, 0.999), 'weight_decay': 0.01, 'fc_dropout': 0.2, 'seed': 42, 'model': 'AI-Growth-Lab/PatentSBERTa'}. Error file: /storagenfs/m.petix/ray_results/tune_uspppm/trainable_cbdd2_00003_3_batch_size=32,model=AI-Growth-Lab_PatentSBERTa,n_fold=8_2022-11-14_16-48-57/error.txt


[2m[36m(trainable pid=4157292)[0m Global seed set to 42
  0%|          | 0/136 [00:00<?, ?it/s]
100%|██████████| 136/136 [00:00<00:00, 4183.14it/s]
  0%|          | 0/36473 [00:00<?, ?it/s]
  4%|▍         | 1537/36473 [00:00<00:02, 15348.39it/s]
  9%|▊         | 3181/36473 [00:00<00:02, 15985.11it/s]
 13%|█▎        | 4876/36473 [00:00<00:01, 16418.83it/s]
 18%|█▊        | 6518/36473 [00:00<00:01, 16362.63it/s]
 22%|██▏       | 8182/36473 [00:00<00:01, 16461.76it/s]
 27%|██▋       | 9829/36473 [00:00<00:01, 16295.39it/s]
 31%|███▏      | 11459/36473 [00:00<00:01, 16077.56it/s]
 36%|███▌      | 13068/36473 [00:00<00:01, 15712.67it/s]
 40%|████      | 14702/36473 [00:00<00:01, 15903.33it/s]
 49%|████▉     | 17953/36473 [00:01<00:01, 16079.21it/s]
 54%|█████▎    | 19586/36473 [00:01<00:01, 16152.97it/s]
 58%|█████▊    | 21203/36473 [00:01<00:00, 16150.19it/s]
 63%|██████▎   | 22881/36473 [00:01<00:00, 16337.23it/s]
 67%|██████▋   | 24516/36473 [00:01<00:00, 16247.68it/s]
 72%|███████▏  

The trial trainable_cbdd2_00004 errored with parameters={'target_size': 1, 'num_workers': 16, 'batch_size': 64, 'epochs': 2, 'n_fold': 8, 'warmup_steps': 0, 'min_lr': 1e-06, 'encoder_lr': 2e-05, 'decoder_lr': 2e-05, 'eps': 1e-06, 'betas': (0.9, 0.999), 'weight_decay': 0.01, 'fc_dropout': 0.2, 'seed': 42, 'model': 'AI-Growth-Lab/PatentSBERTa'}. Error file: /storagenfs/m.petix/ray_results/tune_uspppm/trainable_cbdd2_00004_4_batch_size=64,model=AI-Growth-Lab_PatentSBERTa,n_fold=8_2022-11-14_16-49-14/error.txt


[2m[36m(trainable pid=4157517)[0m Global seed set to 42
100%|██████████| 136/136 [00:00<00:00, 4766.02it/s]
  0%|          | 0/36473 [00:00<?, ?it/s]
  4%|▍         | 1461/36473 [00:00<00:02, 14604.20it/s]
  8%|▊         | 3035/36473 [00:00<00:02, 15270.50it/s]
 13%|█▎        | 4621/36473 [00:00<00:02, 15539.22it/s]
 21%|██        | 7747/36473 [00:00<00:01, 15525.78it/s]
 25%|██▌       | 9300/36473 [00:00<00:01, 15425.77it/s]
 30%|██▉       | 10843/36473 [00:00<00:01, 15369.20it/s]
 34%|███▍      | 12381/36473 [00:00<00:01, 15018.69it/s]
 38%|███▊      | 13885/36473 [00:00<00:01, 14961.89it/s]
 42%|████▏     | 15383/36473 [00:01<00:01, 14929.59it/s]
 46%|████▋     | 16936/36473 [00:01<00:01, 15109.85it/s]
 51%|█████     | 18482/36473 [00:01<00:01, 15211.82it/s]
 55%|█████▍    | 20004/36473 [00:01<00:01, 15072.33it/s]
 59%|█████▉    | 21514/36473 [00:01<00:00, 15078.80it/s]
 63%|██████▎   | 23081/36473 [00:01<00:00, 15254.04it/s]
 68%|██████▊   | 24645/36473 [00:01<00:00, 15369.08it/

The trial trainable_cbdd2_00005 errored with parameters={'target_size': 1, 'num_workers': 16, 'batch_size': 128, 'epochs': 2, 'n_fold': 8, 'warmup_steps': 0, 'min_lr': 1e-06, 'encoder_lr': 2e-05, 'decoder_lr': 2e-05, 'eps': 1e-06, 'betas': (0.9, 0.999), 'weight_decay': 0.01, 'fc_dropout': 0.2, 'seed': 42, 'model': 'AI-Growth-Lab/PatentSBERTa'}. Error file: /storagenfs/m.petix/ray_results/tune_uspppm/trainable_cbdd2_00005_5_batch_size=128,model=AI-Growth-Lab_PatentSBERTa,n_fold=8_2022-11-14_16-49-31/error.txt


[2m[36m(trainable pid=4157739)[0m Global seed set to 42
100%|██████████| 136/136 [00:00<00:00, 4353.26it/s]
  0%|          | 0/36473 [00:00<?, ?it/s]
  4%|▍         | 1570/36473 [00:00<00:02, 15695.56it/s]
  9%|▉         | 3229/36473 [00:00<00:02, 16218.81it/s]
 13%|█▎        | 4851/36473 [00:00<00:01, 16172.99it/s]
 18%|█▊        | 6477/36473 [00:00<00:01, 16205.20it/s]
 22%|██▏       | 8098/36473 [00:00<00:01, 15905.43it/s]
 27%|██▋       | 9690/36473 [00:00<00:01, 15328.98it/s]
 31%|███       | 11227/36473 [00:00<00:01, 15254.45it/s]
 35%|███▍      | 12755/36473 [00:00<00:01, 15187.46it/s]
 39%|███▉      | 14339/36473 [00:00<00:01, 15384.35it/s]
 48%|████▊     | 17585/36473 [00:01<00:01, 15818.94it/s]
 53%|█████▎    | 19231/36473 [00:01<00:01, 16009.88it/s]
 57%|█████▋    | 20833/36473 [00:01<00:00, 15868.64it/s]
 62%|██████▏   | 22436/36473 [00:01<00:00, 15916.64it/s]
 66%|██████▌   | 24029/36473 [00:01<00:00, 15650.90it/s]
 74%|███████▍  | 27141/36473 [00:01<00:00, 15383.47it/s

In [None]:
# Stop the Ray runtime started earlier in this notebook.
ray.shutdown()

In [None]:
# Get a dataframe for the last reported results of all of the trials
# (one table covering every trial in `results`).
df = results.get_dataframe() 

In [None]:
# Persist the trial summary table in the current working directory.
# NOTE(review): to_csv also writes the row index by default, and the path bypasses OUTPUT_DIR —
# confirm both are intended.
df.to_csv('grid_search_results.csv')