In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import os
import re
from torchmetrics import MeanSquaredError
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning import Callback
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from transformers import DebertaV2Config, DebertaV2Tokenizer, DebertaV2ForSequenceClassification, DebertaV2Model
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers.models.deberta.modeling_deberta import StableDropout

In [2]:
# Competition inputs: labelled training pairs, unlabelled test pairs and the
# submission template, all read from the same Kaggle input folder.
train_data = pd.read_csv(
    filepath_or_buffer="../input/us-patent-phrase-to-phrase-matching/train.csv"
)
test_data = pd.read_csv(
    filepath_or_buffer="../input/us-patent-phrase-to-phrase-matching/test.csv"
)
sample_submission_data = pd.read_csv(
    filepath_or_buffer="../input/us-patent-phrase-to-phrase-matching/sample_submission.csv"
)

In [3]:
# Prefix for files written by this notebook. It is joined to file names by plain
# string concatenation (e.g. OUTPUT_DIR + "cpc_texts.pth"), so it must end with a
# path separator.
OUTPUT_DIR = './'

In [4]:
# ====================================================
# CPC Data
# ====================================================
def get_cpc_texts(scheme_dir='../input/cpc-data/CPCSchemeXML202105',
                  title_dir='../input/cpc-data/CPCTitleList202202'):
    """Map every CPC class code (e.g. ``'A47'``) to ``"<section title>. <class title>"``.

    The set of class codes is taken from the file names found in ``scheme_dir``
    (every ``<capital letter><digits>`` occurrence in a name counts).  Titles
    are looked up in the per-section listings
    ``<title_dir>/cpc-section-<S>_20220201.txt``, whose lines have the form
    ``<code><TAB><TAB><title>``.

    Args:
        scheme_dir: Directory whose file names carry the CPC class codes.
        title_dir: Directory holding one title listing per CPC section.

    Returns:
        dict: class code -> section title + ". " + class title.

    Raises:
        IndexError: if a section or class code has no line in its listing.
        OSError: if a directory or section listing cannot be read.
    """
    code_pattern = re.compile(r'[A-Z]\d+')
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    def title_of(code, listing):
        # Capture only the text after "<code>\t\t".  The previous
        # ``match.lstrip(pattern)`` treated the pattern as a *set of characters*
        # and therefore also ate leading title letters that happened to be in
        # that set ("CHEMISTRY" -> "HEMISTRY", "AGRICULTURE" -> "GRICULTURE").
        match = re.search(f'{code}\t\t(.+)', listing)
        if match is None:
            raise IndexError(f'no title found for CPC code {code!r}')
        return match.group(1)

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(f'{title_dir}/cpc-section-{cpc}_20220201.txt') as f:
            s = f.read()
        section_title = title_of(cpc, s)
        for context in contexts:
            if context[0] == cpc:
                results[context] = section_title + ". " + title_of(context, s)
    return results


# Resolve the CPC titles once and keep a copy next to the other outputs.
cpc_texts = get_cpc_texts()
torch.save(cpc_texts, OUTPUT_DIR + "cpc_texts.pth")

# Attach the resolved title to both frames as `context_text`.
for _frame in (train_data, test_data):
    _frame['context_text'] = _frame['context'].map(cpc_texts)

display(train_data.head(), test_data.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


Unnamed: 0,id,anchor,target,context,context_text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE


In [5]:
# Central run configuration, used as a plain namespace of class attributes.
# Documented with comments rather than a docstring because CFG.__dict__ is handed
# to the logger as the hyperparameter record, and a docstring would show up there.
class CFG:
    # Fraction of the unique anchors held out for validation.
    val_size = 0.20
    # Tokenizer max_length; sequences are padded / truncated to this length.
    max_len = 128
    # Path given to AutoConfig / AutoModel / AutoTokenizer.from_pretrained; its last
    # path component also names the CSV log directory.
    model_name = '../input/deberta/base'
    # Not referenced in the visible cells.
    embedding = 128
    # Output width of the model's `head` linear layer.
    out_features = 1
    # Batch size shared by the train / validation / test DataLoaders.
    batch_size = 12
    # Trainer max_epochs.
    epochs = 6
    # Learning rate of the Adam optimizer.
    lr = 2e-5
    # Not referenced in the visible cells.
    # NOTE(review): max_lr, pct_start, div_factor and final_div_factor look like
    # one-cycle LR scheduler arguments, but no scheduler is configured -- confirm.
    max_lr = 1e-3
    # Filled in later with len(train_dataloader).
    steps_per_epoch = None
    pct_start = 0.3
    div_factor = 1e+2
    final_div_factor = 1e+4
    # Trainer accumulate_grad_batches.
    accumulate = 4
    # EarlyStopping patience.
    patience = 3
    # Metric name watched by both ModelCheckpoint and EarlyStopping.
    monitor = 'val_loss'
    # Seeds numpy's global RNG before the anchors are shuffled for the split.
    seed = 42
    # Not referenced in the visible cells.
    debug = False

In [6]:
class PhraseSimilarityDataset(Dataset):
    """Labelled phrase pairs read row by row from a DataFrame.

    An item is ``(anchor_part, target_part, score)``.  Each ``*_part`` is a
    3-tuple of numpy arrays (``input_ids``, ``attention_mask``,
    ``token_type_ids``) obtained by tokenizing the lower-cased phrase together
    with the lower-cased ``context_text``; ``score`` is a float32 tensor.
    """

    # Tokenizer outputs forwarded to the model, in the order they are returned.
    _FIELDS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        # Every encoding is padded / truncated to the same fixed length.
        self.tokenizer_params = dict(
            max_length=CFG.max_len,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        return len(self.df)

    def _encode(self, phrase, context):
        """Tokenize ``phrase`` paired with ``context`` and return the three arrays."""
        encoded = self.tokenizer(phrase, context, **self.tokenizer_params)
        return tuple(np.array(encoded[field]) for field in self._FIELDS)

    def __getitem__(self, index):
        frame = self.df
        anchor_text = frame['anchor'].iloc[index].lower()
        target_text = frame['target'].iloc[index].lower()
        context_text = frame['context_text'].iloc[index].lower()

        anchor_part = self._encode(anchor_text, context_text)
        target_part = self._encode(target_text, context_text)
        label = torch.tensor(frame['score'].iloc[index], dtype=torch.float32)

        return anchor_part, target_part, label
    
class PhraseSimilarityTestset(Dataset):
    """Unlabelled phrase pairs read row by row from a DataFrame.

    An item is ``(anchor_part, target_part)``.  Each part is a 3-tuple of numpy
    arrays (``input_ids``, ``attention_mask``, ``token_type_ids``) obtained by
    tokenizing the lower-cased phrase together with the lower-cased
    ``context_text``.
    """

    # Tokenizer outputs forwarded to the model, in the order they are returned.
    _FIELDS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        # Every encoding is padded / truncated to the same fixed length.
        self.tokenizer_params = dict(
            max_length=CFG.max_len,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        return len(self.df)

    def _encode(self, phrase, context):
        """Tokenize ``phrase`` paired with ``context`` and return the three arrays."""
        encoded = self.tokenizer(phrase, context, **self.tokenizer_params)
        return tuple(np.array(encoded[field]) for field in self._FIELDS)

    def __getitem__(self, index):
        frame = self.df
        anchor_text = frame['anchor'].iloc[index].lower()
        target_text = frame['target'].iloc[index].lower()
        context_text = frame['context_text'].iloc[index].lower()

        anchor_part = self._encode(anchor_text, context_text)
        target_part = self._encode(target_text, context_text)

        return anchor_part, target_part

In [7]:
def compute_kernel_bias(vecs, n_components):
    """Compute the whitening kernel and bias.

    The result is meant to be applied as ``y = (x + bias).dot(kernel)``.

    Args:
        vecs: sequence of 2-D arrays with the same number of columns; they are
            stacked along axis 0 before the statistics are estimated.
        n_components: number of leading whitening directions (kernel columns)
            to keep.

    Returns:
        tuple: ``(kernel, bias)`` where ``kernel`` has shape
        ``(dim, min(n_components, dim))`` and ``bias`` is the negated mean with
        shape ``(1, dim)``.
    """
    vecs = np.concatenate(vecs, axis=0)
    mu = vecs.mean(axis=0, keepdims=True)
    # atleast_2d keeps the 1-column case (where np.cov returns a scalar) working.
    cov = np.atleast_2d(np.cov(vecs.T))
    u, s, _ = np.linalg.svd(cov)

    # Whitening matrix u @ diag(1 / sqrt(s)).  This is what the former
    # inv((u @ diag(sqrt(s))).T) evaluated to, but without inverting a matrix
    # that is singular whenever the covariance is rank-deficient (fewer
    # samples than dimensions).  Directions whose variance is numerically zero
    # are dropped (scaled by 0) instead of being amplified without bound.
    tol = s.max() * max(cov.shape) * np.finfo(s.dtype).eps
    inv_sqrt = np.zeros_like(s)
    keep = s > tol
    inv_sqrt[keep] = 1.0 / np.sqrt(s[keep])
    W = u * inv_sqrt

    return W[:, :n_components], -mu

In [8]:
def normalize(vecs):
    """Scale every row of ``vecs`` to unit Euclidean length."""
    squares = vecs * vecs
    row_norms = squares.sum(axis=1, keepdims=True) ** 0.5
    return vecs / row_norms

In [9]:
def transform_and_normalize(vecs, kernel, bias):
    """Apply the transform ``(vecs + bias).dot(kernel)``, then normalize.

    The transform is skipped when either ``kernel`` or ``bias`` is ``None``;
    the rows are normalized in both cases.
    """
    have_transform = kernel is not None and bias is not None
    projected = (vecs + bias).dot(kernel) if have_transform else vecs
    return normalize(projected)

In [10]:
class PhraseSimilarityModel(pl.LightningModule):
    """Siamese phrase-similarity regressor on top of a shared transformer encoder.

    Both inputs of a batch go through the same encoder and pooling head; the
    prediction is the cosine similarity of the two resulting vectors.
    Training optimizes MSE between the raw cosine similarity and the label.
    Validation and prediction first whiten the two sets of vectors with
    ``compute_kernel_bias`` / ``transform_and_normalize`` and then take the
    cosine similarity.

    A batch is ``(anchor_inputs, target_inputs[, label])`` where each
    ``*_inputs`` is indexable as ``input_ids, attention_mask, token_type_ids``.
    """

    # Number of whitened dimensions kept when scoring validation / prediction batches.
    whitening_components = 96

    def __init__(self):
        super(PhraseSimilarityModel, self).__init__()

        class PhraseSimilarityModelImpl(nn.Module):
            """Encoder plus pooling head, applied identically to both inputs."""

            def __init__(self, model_name):
                super().__init__()
                self.config = AutoConfig.from_pretrained(model_name)
                self.bert = AutoModel.from_pretrained(model_name, config=self.config)
                hidden_size = self.config.hidden_size
                self.dense = nn.Linear(hidden_size, hidden_size, bias=True)
                # Not used by forward(); kept so the module's parameter set
                # (and therefore its state_dict layout) is unchanged.
                self.head = nn.Linear(hidden_size, CFG.out_features, bias=True)
                self.stabledropout = StableDropout(self.config.hidden_dropout_prob)
                self.stabledropout_pooler = StableDropout(self.config.pooler_dropout)
                self.act2fn = nn.GELU()

            def _first_token(self, inputs):
                """Return the encoder's last hidden state at sequence position 0."""
                hidden = self.bert(
                    input_ids=inputs[0],
                    attention_mask=inputs[1],
                    token_type_ids=inputs[2],
                )['last_hidden_state']
                return hidden[:, 0]

            def forward(self, input1, input2):
                """Return one pooled vector per example for each of the two inputs."""
                feats1 = self._first_token(input1)
                feats2 = self._first_token(input2)

                # pooler dropout -> dense -> GELU -> dropout, same weights for both sides
                pooled_output1 = self.dense(self.stabledropout_pooler(feats1))
                pooled_output2 = self.dense(self.stabledropout_pooler(feats2))
                pooled_output1 = self.stabledropout(self.act2fn(pooled_output1))
                pooled_output2 = self.stabledropout(self.act2fn(pooled_output2))
                return pooled_output1, pooled_output2

        self.model = PhraseSimilarityModelImpl(CFG.model_name)
        self.criterion = nn.MSELoss()
        self.metric = MeanSquaredError()

    def forward(self, input1, input2):
        return self.model(input1, input2)

    def configure_optimizers(self):
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=CFG.lr)
        return self.optimizer

    def _whitened_cosine(self, output1, output2):
        """Whiten both sets of vectors with shared statistics and return their cosine similarity.

        The whitening statistics are estimated from the vectors passed in,
        i.e. from the current batch only.  The result is a CPU tensor (it is
        built from numpy arrays).

        NOTE(review): a batch supplies 2 * batch_size vectors, which with the
        notebook's batch size is fewer than ``whitening_components``; consider
        estimating kernel / bias once over a larger sample instead.
        """
        vec1 = output1.detach().cpu().numpy()
        vec2 = output2.detach().cpu().numpy()
        kernel, bias = compute_kernel_bias([vec1, vec2], self.whitening_components)
        anc_output = torch.from_numpy(transform_and_normalize(vec1, kernel, bias))
        tar_output = torch.from_numpy(transform_and_normalize(vec2, kernel, bias))
        return torch.cosine_similarity(anc_output, tar_output, dim=1)

    def training_step(self, batch, batch_idx):
        anc_in, tar_in, label = batch[0], batch[1], batch[2]
        output1, output2 = self.model(anc_in, tar_in)
        preds = torch.cosine_similarity(output1, output2, dim=1)
        loss = self.criterion(preds, label)
        mse = self.metric(preds, label)
        logs = {'train_loss': loss, 'train_error': mse, 'lr': self.optimizer.param_groups[0]['lr']}
        self.log_dict(logs, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        anc_in, tar_in, label = batch[0], batch[1], batch[2]
        output1, output2 = self.model(anc_in, tar_in)
        # The whitened similarity comes back as a CPU tensor in numpy's dtype;
        # bring it to the label's device and dtype before loss / metric so the
        # step also works when the batch lives on a GPU.
        preds = self._whitened_cosine(output1, output2).to(device=label.device, dtype=label.dtype)
        loss = self.criterion(preds, label)
        mse = self.metric(preds, label)
        logs = {'val_loss': loss, 'val_error': mse}
        self.log_dict(logs, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def predict_step(self, batch, batch_idx):
        anc_in, tar_in = batch[0], batch[1]
        output1, output2 = self.model(anc_in, tar_in)
        return self._whitened_cosine(output1, output2)

In [11]:
# Anchor-grouped hold-out split: a fraction CFG.val_size of the unique anchors is
# drawn at random and every row of those anchors goes to validation, so no
# anchor appears on both sides.
np.random.seed(CFG.seed)
anchors = train_data.anchor.unique()
display(len(anchors))
np.random.shuffle(anchors)
val_sz = int(len(anchors)*CFG.val_size)
val_anchors = anchors[:val_sz]
display(len(val_anchors))
is_val = np.isin(train_data.anchor, val_anchors)
idxs = np.arange(len(train_data))
display(len(idxs))
val_idxs = idxs[is_val]
trn_idxs = idxs[~is_val]
print(len(val_idxs),len(trn_idxs))
# idxs are row positions, so select with .iloc; .loc would look them up as index
# labels and only works while train_data has the default RangeIndex.
val = train_data.iloc[val_idxs]
train = train_data.iloc[trn_idxs]
print(len(val),len(train))

733

146

36473

7516 28957
7516 28957


In [12]:
# One tokenizer, loaded from the same path as the model, feeds all three datasets.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

train_dataset = PhraseSimilarityDataset(df=train, tokenizer=tokenizer)
val_dataset = PhraseSimilarityDataset(df=val, tokenizer=tokenizer)
test_dataset = PhraseSimilarityTestset(df=test_data, tokenizer=tokenizer)

# Only the training loader shuffles; validation and test keep the row order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [13]:
# Record the number of training batches per epoch on the shared config
# (before gradient accumulation is taken into account), then display it.
CFG.steps_per_epoch = len(train_dataloader)
CFG.steps_per_epoch

2414

In [14]:
# CSV logs go to ./<last component of CFG.model_name>_log.
logger = CSVLogger(save_dir='./', name=CFG.model_name.split('/')[-1]+'_log')
# vars(CFG) also holds class machinery (__module__, __dict__, __weakref__, __doc__);
# record only the actual settings as hyperparameters.
logger.log_hyperparams({key: value for key, value in vars(CFG).items() if not key.startswith('__')})
# Keep the best checkpoint by CFG.monitor plus the last one.  The filename
# template must use the metric names that validation_step logs ('val_loss',
# 'val_error'); the former 'valid_loss' / 'valid_acc' are never logged.
checkpoint_callback = ModelCheckpoint(monitor=CFG.monitor,
                                      save_top_k=1,
                                      save_last=True,
                                      save_weights_only=True,
                                      filename='{epoch:02d}-{val_loss:.4f}-{val_error:.4f}',
                                      verbose=False,
                                      mode='min')
# Stop when CFG.monitor has not improved for CFG.patience validation checks.
early_stop_callback = EarlyStopping(monitor=CFG.monitor,
                                    patience=CFG.patience,
                                    verbose=False,
                                    mode="min")
# NOTE(review): `gpus=` and `weights_summary=` are Trainer arguments of the
# Lightning release this notebook was written against; verify them against the
# installed version before upgrading.
trainer = Trainer(
    max_epochs=CFG.epochs,
    gpus=[0],
    accumulate_grad_batches=CFG.accumulate,
    callbacks=[checkpoint_callback, early_stop_callback],
    log_every_n_steps=1,
    logger=logger,
    weights_summary='top',
)

In [None]:
# Instantiate the LightningModule (this loads the pretrained encoder from CFG.model_name).
driver = PhraseSimilarityModel()
# Train on the training loader and validate on the held-out anchors; checkpointing
# and early stopping are handled by the callbacks attached to `trainer`.
trainer.fit(driver, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)