# Pip Wheels

In [1]:
# Dependency install commands, kept inside a bare string literal so that they
# are not executed; being the cell's only expression, the string itself is
# echoed as the cell output.
'''
!pip install pytorch_lightning
!pip install torchmetrics
!pip install tokenizers
!pip install transformers
!pip install ray[tune]
'''

'\n!pip install pytorch_lightning\n!pip install torchmetrics\n!pip install tokenizers\n!pip install transformers\n!pip install ray[tune]\n'

# Imports

In [2]:
# General Libraries
import os
import re
import random
import numpy as np
import pandas as pd
import scipy as sp



# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import seed_everything, Trainer, LightningModule
from torchmetrics import Accuracy
from torchmetrics.functional import f1_score, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar 
from pytorch_lightning.loggers import TensorBoardLogger



# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


# Ray[Tune]
import ray
from ray import air
from ray import tune
from ray.air import session
from ray.tune.integration.pytorch_lightning import TuneReportCallback

import torch
# HuggingFace Libraries
import tokenizers
import transformers 

%env TOKENIZERS_PARALLELISM=true

# Our code (project-local modules)
from kfold_loop import KFoldLoop
from USPPM_model import USPPPM_model
from USPPM_dataset import set_tokenizer, set_max_len
from USPPM_kfold_datamodule import USPPPM_kf_datamodule

env: TOKENIZERS_PARALLELISM=true


# Configuration

## Configuration Class: notebook-specific settings

In [3]:
class CFG:
    """Notebook-level settings that are not part of the Ray Tune search space."""

    # Reproducibility: used for pl.seed_everything and as the random_state of
    # the train/test split.
    seed: int = 42

    # Debug mode: when enabled, the dataframe is cut down to its first
    # `debug_samples` rows.
    debug: bool = False
    debug_samples: int = 100

## Configuration Dictionary: trial-specific settings

In [4]:
# Search space handed to Ray Tune as `param_space`.
# Plain values are the same for every trial; `tune.*` entries are resolved per trial.
config_dict = {
    "target_size": 1,
    "num_workers": 16,

    # Training parameters
    "batch_size": 32,
    "epochs": 2,
    "n_fold": 2,
    "warmup_steps": 0,
    "min_lr": 1e-6,
    "encoder_lr": 2e-5,
    "decoder_lr": 2e-5,
    "eps": 1e-6,
    "betas": (0.9, 0.999),
    "weight_decay": 0.01,
    "fc_dropout": 0.2,
    "seed": 42,  # same value as CFG.seed

    # Transformers
    # Alternative backbones, currently disabled:
    # "model": tune.choice(["microsoft/deberta-v3-large"]),
    # "model": tune.grid_search(["AI-Growth-Lab/PatentSBERTa","distilbert-base-uncased","ahotrod/electra_large_discriminator_squad2_512","Yanhao/simcse-bert-for-patent","microsoft/deberta-v3-large"])
    "model": tune.choice(["distilbert-base-uncased"]),
}

## Directories

In [5]:
# Dataset location and output directory (OUTPUT_DIR is later passed to set_tokenizer).
INPUT_DIR = '../dataset/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'

# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Logger

In [6]:
# TensorBoard logger for the Lightning Trainer: save directory "lightning_logs",
# experiment name "USPPPM".
logger = TensorBoardLogger(save_dir="lightning_logs", name="USPPPM")

## Random seed

In [7]:
# Seed the notebook process via Lightning's seed_everything; the call returns
# the seed, which is shown as the cell output.
seed_everything(CFG.seed)

Global seed set to 42


42

# Data Loading

In [8]:
# NOTE(review): torch.load unpickles the file -- only load artefacts from a trusted source.
cpc_texts = torch.load(f='cpc_texts.pth')

# Pre-processed table read from CSV; a preview of the first rows is displayed.
dataframe = pd.read_csv(filepath_or_buffer="dataframe.csv")
display(dataframe.head(n=5))

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
0,0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]abatement of pollution[SEP]HUMAN...,2
1,1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]act of abating[SEP]HUMAN NECESSI...,3
2,2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]active catalyst[SEP]HUMAN NECESS...,1
3,3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]eliminating process[SEP]HUMAN NE...,2
4,4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]forest region[SEP]HUMAN NECESSIT...,0


## Debug Slicing

In [9]:
# Debug mode: keep only the first CFG.debug_samples rows (all columns).
if CFG.debug:
    dataframe = dataframe.iloc[:CFG.debug_samples]

## Train-Test Split

In [10]:
# Hold out 10% of the rows as a test set; the shuffle is seeded with CFG.seed.
# Stratified variant, currently disabled:
# train_df, test_df = train_test_split(dataframe, test_size = 0.1, random_state = CFG.seed, stratify = dataframe.score_map)
train_df, test_df = train_test_split(
    dataframe,
    test_size=0.1,
    random_state=CFG.seed,
)

# Preview both partitions.
display(train_df.head(), test_df.head())

Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
9900,9900,0dbb44b9a145edec,distributor pipe,pipe,B01,0.5,PERFORMING OPERATIONS; TRANSPORTING. PHYSICAL ...,distributor pipe[SEP]pipe[SEP]PERFORMING OPERA...,2
1303,1303,74afca34a5439c23,ammonia recovery,recovery of water,C01,0.25,HEMISTRY; METALLURGY. INORGANIC CHEMISTRY,ammonia recovery[SEP]recovery of water[SEP]HEM...,1
16591,16591,6371befc3ee1b0f2,inner closed,cylindrical inner member,E04,0.5,FIXED CONSTRUCTIONS. BUILDING,inner closed[SEP]cylindrical inner member[SEP]...,2
25822,25822,20489196c73bd86b,produce thin layers,produce layers,G01,0.5,PHYSICS. MEASURING; TESTING,produce thin layers[SEP]produce layers[SEP]PHY...,2
23640,23640,9af994b21c892022,parallel orientation,zero angle,G06,0.25,PHYSICS. COMPUTING; CALCULATING; COUNTING,parallel orientation[SEP]zero angle[SEP]PHYSIC...,1


Unnamed: 0.1,Unnamed: 0,id,anchor,target,context,score,context_text,text,score_map
33511,33511,ed1c4e525eb105fe,transmit alarm,display indicator,G08,0.0,PHYSICS. SIGNALLING,transmit alarm[SEP]display indicator[SEP]PHYSI...,0
18670,18670,5386316f318f5221,locking formation,retaining element,B60,0.25,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,locking formation[SEP]retaining element[SEP]PE...,1
18049,18049,1544ca6753fcbddd,lateral power,transducer,H01,0.25,ELECTRICITY. BASIC ELECTRIC ELEMENTS,lateral power[SEP]transducer[SEP]ELECTRICITY. ...,1
31660,31660,f9d8979b94cec923,spreader body,spreader,A01,0.75,HUMAN NECESSITIES. GRICULTURE; FORESTRY; ANIMA...,spreader body[SEP]spreader[SEP]HUMAN NECESSITI...,3
15573,15573,e151ca5ea5cc0f08,high gradient magnetic separators,magnetic filtration,B03,0.5,PERFORMING OPERATIONS; TRANSPORTING. SEPARATIO...,high gradient magnetic separators[SEP]magnetic...,2


# Training

## Callbacks

In [11]:
# Start a local Ray instance, declaring 4 GPUs as available for scheduling.
ray.init(num_gpus=4)

2022-11-13 23:50:39,886	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.8.10
Ray version:,2.0.1


In [12]:
# Keep only the single best checkpoint, judged by the lowest val_loss.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath="checkpoints",
    filename="best_checkpoint",
    verbose=True,
)

# Stop training when val_loss has not improved for 2 consecutive validation checks.
early_stopping_callback = EarlyStopping(patience=2, monitor='val_loss')

# Metrics forwarded to Ray Tune; each is reported under the name it is logged with.
metrics = {name: name for name in ("val_score", "train_loss", "val_loss")}

In [13]:
def trainable(config_dict):
    """Run one Ray Tune trial: k-fold training of a USPPPM model.

    Args:
        config_dict: Trial configuration supplied by Ray Tune. It is updated in
            place with ``training_steps`` and is also passed to
            ``set_tokenizer`` and ``set_max_len``.

    The metrics listed in the module-level ``metrics`` mapping are reported to
    Tune by ``TuneReportCallback`` at the end of every validation run.
    """
    # Total number of training steps = batches per epoch * epochs.
    # NOTE(review): the batch count is derived from len(train_df), whereas the
    # datamodule below is built from `dataframe` -- confirm this matches the
    # number of batches actually seen per fold.
    batches_per_epoch = len(train_df) // config_dict['batch_size']
    config_dict['training_steps'] = config_dict['epochs'] * batches_per_epoch

    # Tokenizer and maximum length are set up before the datamodule and model
    # are built (presumably both read them from config_dict -- verify).
    set_tokenizer(config_dict, OUTPUT_DIR)
    set_max_len(config_dict, cpc_texts, dataframe)

    # train_dataset = USPPM_dataset(config_dict)
    datamodule = USPPPM_kf_datamodule(config_dict, dataframe)
    model = USPPPM_model(config_dict)

    trainer = pl.Trainer(
        accelerator="gpu",
        # devices=[1],
        max_epochs=config_dict['epochs'],
        check_val_every_n_epoch=1,
        num_sanity_val_steps=0,
        logger=logger,
        callbacks=[
            TuneReportCallback(metrics, on="validation_end"),
            checkpoint_callback,
            early_stopping_callback,
            TQDMProgressBar(refresh_rate=2),
        ],
    )

    # Replace the trainer's default fit loop with the k-fold loop and hand the
    # original loop over to it through connect().
    default_fit_loop = trainer.fit_loop
    trainer.fit_loop = KFoldLoop(config_dict['n_fold'], config_dict, export_path="./")
    trainer.fit_loop.connect(default_fit_loop)

    trainer.fit(model, datamodule)

In [14]:
# NOTE(review): `resource_group` is not referenced by the Tuner below, which
# takes its per-trial resources from the dict given to tune.with_resources.
resource_group = tune.PlacementGroupFactory([{"CPU": 1, "GPU": 1}])

# Each trial requests 0.25 CPU and 1 GPU; at most 4 trials run concurrently and
# trials are ranked by the highest val_score.
tuner = tune.Tuner(
    tune.with_resources(trainable, {"cpu":0.25,"gpu":1}),
    param_space=config_dict,
    # tune_config=tune.TuneConfig(metric="val_score", mode="max"),
    tune_config=tune.TuneConfig(
        metric="val_score",
        mode="max",
        max_concurrent_trials=4,
    ),
    run_config=air.RunConfig(
        name="tune_uspppm",
        verbose=2,
        progress_reporter=tune.JupyterNotebookReporter(overwrite=True),
    ),
)


Trial name,status,loc,model,iter,total time (s),val_score,train_loss,val_loss
trainable_9a289_00000,TERMINATED,131.114.50.210:3350669,distilbert-base_3300,4,567.619,0.613471,0.576764,0.600198


In [15]:
# Launch the trials defined by the Tuner and collect their results.
results = tuner.fit()

# Called without arguments, so the metric/mode configured on the Tuner's
# TuneConfig ("val_score", "max") decide which trial is best.
best_result = results.get_best_result()  # Get best result object
print(best_result)

100%|██████████| 136/136 [00:00<00:00, 5165.35it/s]
  0%|          | 0/36473 [00:00<?, ?it/s]
  4%|▍         | 1582/36473 [00:00<00:02, 15813.00it/s]
  9%|▉         | 3193/36473 [00:00<00:02, 15987.06it/s]
 13%|█▎        | 4792/36473 [00:00<00:02, 15499.01it/s]
 17%|█▋        | 6344/36473 [00:00<00:01, 15362.43it/s]
 22%|██▏       | 7882/36473 [00:00<00:01, 15017.26it/s]
 26%|██▌       | 9386/36473 [00:00<00:01, 14982.74it/s]
 30%|██▉       | 10934/36473 [00:00<00:01, 15138.06it/s]
 34%|███▍      | 12471/36473 [00:00<00:01, 15210.70it/s]
 39%|███▊      | 14104/36473 [00:00<00:01, 15554.69it/s]
 43%|████▎     | 15730/36473 [00:01<00:01, 15768.86it/s]
 47%|████▋     | 17308/36473 [00:01<00:01, 15693.19it/s]
 52%|█████▏    | 18898/36473 [00:01<00:01, 15752.27it/s]
 60%|██████    | 22029/36473 [00:01<00:00, 15471.08it/s]
 65%|██████▍   | 23577/36473 [00:01<00:00, 15454.13it/s]
 69%|██████▉   | 25123/36473 [00:01<00:00, 15318.05it/s]
 73%|███████▎  | 26656/36473 [00:01<00:00, 15316.26it/s]


[2m[36m(trainable pid=3350669)[0m STARTING FOLD 1
[2m[36m(trainable pid=3350669)[0m TRAIN FOLD 1 16412
[2m[36m(trainable pid=3350669)[0m VALID FOLD 1 16413
Epoch 0:   0%|          | 0/1026 [00:00<?, ?it/s] 


[2m[36m(trainable pid=3350669)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(trainable pid=3350669)[0m 
[2m[36m(trainable pid=3350669)[0m   | Name       | Type              | Params
[2m[36m(trainable pid=3350669)[0m -------------------------------------------------
[2m[36m(trainable pid=3350669)[0m 0 | model      | DistilBertModel   | 66.4 M
[2m[36m(trainable pid=3350669)[0m 1 | criterion  | BCEWithLogitsLoss | 0     
[2m[36m(trainable pid=3350669)[0m 2 | fc_dropout | Dropout           | 0     
[2m[36m(trainable pid=3350669)[0m 3 | fc         | Linear            | 769   
[2m[36m(trainable pid=3350669)[0m 4 | attention  | Sequential        | 394 K 
[2m[36m(trainable pid=3350669)[0m -------------------------------------------------
[2m[36m(trainable pid=3350669)[0m 66.8 M    Trainable params
[2m[36m(trainable pid=3350669)[0m 0         Non-trainable params
[2m[36m(trainable pid=3350669)[0m 66.8 M    Total params
[2m[36m(trainable pid=3350669

Epoch 0:   0%|          | 2/1026 [00:01<12:08,  1.40it/s, loss=0.686, v_num=0, train_loss=0.695]
Epoch 0:   0%|          | 4/1026 [00:01<07:44,  2.20it/s, loss=0.671, v_num=0, train_loss=0.669]
Epoch 0:   1%|          | 6/1026 [00:02<06:14,  2.73it/s, loss=0.663, v_num=0, train_loss=0.633]
Epoch 0:   1%|          | 8/1026 [00:02<05:28,  3.10it/s, loss=0.665, v_num=0, train_loss=0.631]
Epoch 0:   1%|          | 10/1026 [00:02<05:02,  3.36it/s, loss=0.66, v_num=0, train_loss=0.632] 
Epoch 0:   1%|          | 12/1026 [00:03<04:43,  3.58it/s, loss=0.662, v_num=0, train_loss=0.691]
Epoch 0:   1%|▏         | 14/1026 [00:03<04:30,  3.74it/s, loss=0.659, v_num=0, train_loss=0.648]
Epoch 0:   2%|▏         | 16/1026 [00:04<04:19,  3.89it/s, loss=0.659, v_num=0, train_loss=0.657]
Epoch 0:   2%|▏         | 18/1026 [00:04<04:10,  4.02it/s, loss=0.662, v_num=0, train_loss=0.679]
Epoch 0:   2%|▏         | 20/1026 [00:04<04:04,  4.11it/s, loss=0.665, v_num=0, train_loss=0.673]
Epoch 0:   2%|▏         

[2m[36m(trainable pid=3350669)[0m Epoch 0, global step 513: 'val_loss' reached 0.58302 (best 0.58302), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint-v24.ckpt' as top 1


Epoch 1:   0%|          | 0/1026 [00:00<?, ?it/s, loss=0.603, v_num=0, train_loss=0.623, val_loss=0.583, val_score=0.699]           
Epoch 1:   0%|          | 2/1026 [00:01<11:26,  1.49it/s, loss=0.603, v_num=0, train_loss=0.623, val_loss=0.583, val_score=0.699]
Epoch 1:   0%|          | 4/1026 [00:01<07:20,  2.32it/s, loss=0.599, v_num=0, train_loss=0.586, val_loss=0.583, val_score=0.699]
Epoch 1:   1%|          | 6/1026 [00:02<05:57,  2.85it/s, loss=0.594, v_num=0, train_loss=0.610, val_loss=0.583, val_score=0.699]
Epoch 1:   1%|          | 8/1026 [00:02<05:16,  3.22it/s, loss=0.596, v_num=0, train_loss=0.571, val_loss=0.583, val_score=0.699]
Epoch 1:   1%|          | 10/1026 [00:02<04:51,  3.48it/s, loss=0.596, v_num=0, train_loss=0.541, val_loss=0.583, val_score=0.699]
Epoch 1:   1%|          | 12/1026 [00:03<04:35,  3.68it/s, loss=0.598, v_num=0, train_loss=0.632, val_loss=0.583, val_score=0.699]
Epoch 1:   1%|▏         | 14/1026 [00:03<04:24,  3.83it/s, loss=0.598, v_num=0, train

[2m[36m(trainable pid=3350669)[0m Epoch 1, global step 1026: 'val_loss' reached 0.56846 (best 0.56846), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint-v24.ckpt' as top 1
[2m[36m(trainable pid=3350669)[0m `Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 1026/1026 [02:20<00:00,  7.31it/s, loss=0.57, v_num=0, train_loss=0.572, val_loss=0.568, val_score=0.735]
Testing: 0it [00:00, ?it/s]0669)[0m 
Testing DataLoader 0:   2%|▏         | 2/114 [00:00<00:04, 27.39it/s]
Testing DataLoader 0:   4%|▎         | 4/114 [00:00<00:05, 19.86it/s]
Testing DataLoader 0:   5%|▌         | 6/114 [00:00<00:05, 18.59it/s]
Testing DataLoader 0:   7%|▋         | 8/114 [00:00<00:05, 18.17it/s]
Testing DataLoader 0:   9%|▉         | 10/114 [00:00<00:05, 17.91it/s]
Testing DataLoader 0:  11%|█         | 12/114 [00:00<00:05, 17.77it/s]
Testing DataLoader 0:  12%|█▏        | 14/114 [00:00<00:05, 17.68it/s]
Testing DataLoader 0:  14%|█▍        | 16/114 [00:00<00:05, 17.60it/s]
Testing DataLoader 0:  16%|█▌        | 18/114 [00:01<00:05, 17.54it/s]
Testing DataLoader 0:  18%|█▊        | 20/114 [00:01<00:05, 17.50it/s]
Testing DataLoader 0:  19%|█▉        | 22/114 [00:01<00:05, 17.45it/s]
Testing DataLoader 0:  21%|██        | 24/114 [00:01<

[2m[36m(trainable pid=3350669)[0m Epoch 0, global step 1539: 'val_loss' was not in top 1


Epoch 1:   0%|          | 2/1026 [00:01<11:39,  1.46it/s, loss=0.625, v_num=0, train_loss=0.620, val_loss=0.620, val_score=0.507, test_loss=0.570]
Epoch 1:   0%|          | 4/1026 [00:01<07:27,  2.29it/s, loss=0.619, v_num=0, train_loss=0.604, val_loss=0.620, val_score=0.507, test_loss=0.570]
Epoch 1:   1%|          | 6/1026 [00:02<06:02,  2.81it/s, loss=0.62, v_num=0, train_loss=0.602, val_loss=0.620, val_score=0.507, test_loss=0.570] 
Epoch 1:   1%|          | 8/1026 [00:02<05:20,  3.18it/s, loss=0.622, v_num=0, train_loss=0.628, val_loss=0.620, val_score=0.507, test_loss=0.570]
Epoch 1:   1%|          | 10/1026 [00:02<04:55,  3.44it/s, loss=0.618, v_num=0, train_loss=0.572, val_loss=0.620, val_score=0.507, test_loss=0.570]
Epoch 1:   1%|          | 12/1026 [00:03<04:37,  3.65it/s, loss=0.612, v_num=0, train_loss=0.605, val_loss=0.620, val_score=0.507, test_loss=0.570]
Epoch 1:   1%|▏         | 14/1026 [00:03<04:25,  3.82it/s, loss=0.61, v_num=0, train_loss=0.637, val_loss=0.620, val

[2m[36m(trainable pid=3350669)[0m Epoch 1, global step 2052: 'val_loss' was not in top 1
[2m[36m(trainable pid=3350669)[0m `Trainer.fit` stopped: `max_epochs=2` reached.


Testing: 0it [00:00, ?it/s]0669)[0m 
Testing DataLoader 0:   2%|▏         | 2/114 [00:00<00:03, 29.50it/s]
Testing DataLoader 0:   4%|▎         | 4/114 [00:00<00:05, 21.62it/s]
Testing DataLoader 0:   5%|▌         | 6/114 [00:00<00:05, 19.85it/s]
Testing DataLoader 0:   7%|▋         | 8/114 [00:00<00:05, 19.07it/s]
Testing DataLoader 0:   9%|▉         | 10/114 [00:00<00:05, 18.64it/s]
Testing DataLoader 0:  11%|█         | 12/114 [00:00<00:05, 18.34it/s]
Testing DataLoader 0:  12%|█▏        | 14/114 [00:00<00:05, 18.15it/s]
Testing DataLoader 0:  14%|█▍        | 16/114 [00:00<00:05, 17.99it/s]
Testing DataLoader 0:  16%|█▌        | 18/114 [00:01<00:05, 17.89it/s]
Testing DataLoader 0:  18%|█▊        | 20/114 [00:01<00:05, 17.81it/s]
Testing DataLoader 0:  19%|█▉        | 22/114 [00:01<00:05, 17.74it/s]
Testing DataLoader 0:  21%|██        | 24/114 [00:01<00:05, 17.68it/s]
Testing DataLoader 0:  23%|██▎       | 26/114 [00:01<00:04, 17.63it/s]
Testing DataLoader 0:  25%|██▍       | 28/1

[2m[36m(trainable pid=3350669)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
[2m[36m(trainable pid=3350669)[0m - This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(trainable pid=3350669)[0m - This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(trainable pid=3350669)[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_

Testing: 0it [00:00, ?it/s]0669)[0m 
Testing DataLoader 0:   0%|          | 0/114 [00:00<?, ?it/s]
[2m[36m(trainable pid=3350669)[0m ensemble_avg_loss 0.56372154
[2m[36m(trainable pid=3350669)[0m ensemble_avg_score 0.7995018533256664
[2m[36m(trainable pid=3350669)[0m ensemble_avg_loss 0.60622925
[2m[36m(trainable pid=3350669)[0m ensemble_avg_score 0.6143701551356964
Testing DataLoader 0:   2%|▏         | 2/114 [00:00<00:14,  7.52it/s]
[2m[36m(trainable pid=3350669)[0m ensemble_avg_loss 0.5947206
[2m[36m(trainable pid=3350669)[0m ensemble_avg_score 0.6905487889089901
[2m[36m(trainable pid=3350669)[0m ensemble_avg_loss 0.62885976
[2m[36m(trainable pid=3350669)[0m ensemble_avg_score 0.6951448590934086
Testing DataLoader 0:   4%|▎         | 4/114 [00:00<00:14,  7.59it/s]
[2m[36m(trainable pid=3350669)[0m ensemble_avg_loss 0.57747805
[2m[36m(trainable pid=3350669)[0m ensemble_avg_score 0.7470142109821758
[2m[36m(trainable pid=3350669)[0m ensemble_avg_loss 0

2022-11-14 00:01:10,495	INFO tune.py:758 -- Total run time: 628.11 seconds (627.63 seconds for the tuning loop).


Result(metrics={'val_score': 0.6134706616697575, 'train_loss': 0.5767638087272644, 'val_loss': 0.6001981496810913, 'done': True, 'trial_id': '9a289_00000', 'experiment_tag': '0_model=distilbert-base-uncased'}, error=None, log_dir=PosixPath('/storagenfs/m.petix/ray_results/tune_uspppm/trainable_9a289_00000_0_model=distilbert-base-uncased_2022-11-13_23-50-43'))


In [16]:
# Tear down the Ray instance started earlier with ray.init().
ray.shutdown()

In [17]:
# Collect the last reported results of all trials into a DataFrame.
df = results.get_dataframe() 

In [18]:
# Persist the per-trial results; with default arguments the DataFrame index is
# written as the first CSV column.
df.to_csv('grid_search_results.csv')