# U.S. Patent Phrase to Phrase Matching
## Kaggle Competition - Help Identify Similar Phrases in U.S. Patents

Over the past two centuries, the USPTO has amassed nearly 11 million patents, and this massive volume of data has made patent examination and search difficult. How can a patent examiner determine whether a newly filed patent has already been described? How can a patent searcher find the subject they are looking for in this vast ocean of data?

We can address the aforementioned issues by training models on a novel semantic similarity dataset to extract relevant information by matching key phrases in patent documents. Specifically, given a pair of phrases, our model can predict the similarity score (0/0.25/0.5/0.75/1) between the two phrases.

The Cooperative Patent Classification (CPC) code is included as a technical-domain context: an additional feature that helps disambiguate phrase pairs whose meaning depends on the field. For example, if one invention claims a "strong material" and another uses "steel," the two may be equivalent if the domain is steel, but not if the domain is ripstop fabric (you don't want steel for your parachute).

### EDA

In [1]:
# IPython shell escape: run `nvidia-smi` and print its report on the GPU visible to this session.
! nvidia-smi

Wed Jun 22 22:01:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W | Function Not Found   |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import gc
import random
import requests
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
from PIL import Image
from tqdm import tqdm
from scipy import stats
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoConfig, AutoModel, get_linear_schedule_with_warmup

import warnings
warnings.simplefilter('ignore')

#### load data

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = map(
    pd.read_csv,
    (
        '../input/us-patent-phrase-to-phrase-matching/train.csv',
        '../input/us-patent-phrase-to-phrase-matching/test.csv',
    ),
)

In [None]:
# Bare expression: the notebook renders the training DataFrame as this cell's output.
train

#### Score

In [None]:
# Count of training rows for each similarity score value.
score_fig, score_ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=train, x='score', palette='flare', ax=score_ax)

#### Anchor

In [None]:
# Occurrence counts of the 50 most frequent anchor phrases, keyed by phrase.
top_anchor_counts = train['anchor'].value_counts().head(50)
count_anchors = dict(top_anchor_counts)

anchor_fig, anchor_ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    x=list(count_anchors),
    y=list(count_anchors.values()),
    palette='flare',
    ax=anchor_ax,
)
# Phrases are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
anchor_ax.set_title("Top 50 First Phrases (Anchor)", fontsize=20)

In [None]:
# Number of whitespace-separated words in each anchor phrase.
anchor_words = train['anchor'].str.split()
train['anchor_len'] = anchor_words.str.len()

anchor_len_fig, anchor_len_ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=train, x='anchor_len', palette='flare', ax=anchor_len_ax)
anchor_len_ax.set_title("Word Count Distribution", size=20)

In [None]:
# Rendering options shared by the train and test clouds.
cloud_kwargs = dict(width=800, height=800, background_color='white', min_font_size=10)

# Non-null anchor phrases of each split.
anchor_desc_tr = train['anchor'].dropna().values
anchor_desc_te = test['anchor'].dropna().values

stopwords_tr = set(STOPWORDS)
stopwords_te = set(STOPWORDS)

# Each cloud is generated from all phrases of the split joined into one string.
wordcloud_tr = WordCloud(stopwords=stopwords_tr, **cloud_kwargs).generate(' '.join(anchor_desc_tr))
wordcloud_te = WordCloud(stopwords=stopwords_te, **cloud_kwargs).generate(' '.join(anchor_desc_te))

# Train cloud on the left, test cloud on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 16))
for axis, cloud, caption in ((ax1, wordcloud_tr, 'train data'),
                             (ax2, wordcloud_te, 'test data')):
    axis.imshow(cloud)
    axis.axis("off")
    axis.set_title(caption)
plt.show()

#### Target

In [None]:
# Occurrence counts of the 50 most frequent target phrases, keyed by phrase.
# Kept under its own name so the anchor counts computed earlier are not overwritten.
count_targets = dict(train.target.value_counts().head(50))

plt.figure(figsize=(20, 6))
sns.barplot(x=list(count_targets.keys()), y=list(count_targets.values()), palette='flare')
# Phrases are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
# The target is the second phrase of each pair; the anchor is the first.
plt.title("Top 50 Second Phrases (Target)", fontsize=20)

In [None]:
# Number of whitespace-separated words in each target phrase.
target_words = train['target'].str.split()
train['target_len'] = target_words.str.len()

target_len_fig, target_len_ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=train, x='target_len', palette='flare', ax=target_len_ax)
target_len_ax.set_title("Word Count Distribution", size=20)

In [None]:
# Rendering options shared by the train and test clouds.
cloud_kwargs = dict(width=800, height=800, background_color='white', min_font_size=10)

# Non-null target phrases of each split.
target_desc_tr = train['target'].dropna().values
target_desc_te = test['target'].dropna().values

stopwords_tr = set(STOPWORDS)
stopwords_te = set(STOPWORDS)

# Each cloud is generated from all phrases of the split joined into one string.
wordcloud_tr = WordCloud(stopwords=stopwords_tr, **cloud_kwargs).generate(' '.join(target_desc_tr))
wordcloud_te = WordCloud(stopwords=stopwords_te, **cloud_kwargs).generate(' '.join(target_desc_te))

# Train cloud on the left, test cloud on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 16))
for axis, cloud, caption in ((ax1, wordcloud_tr, 'train data'),
                             (ax2, wordcloud_te, 'test data')):
    axis.imshow(cloud)
    axis.axis("off")
    axis.set_title(caption)
plt.show()

#### Context

In [None]:
# Bars are ordered from the most to the least frequent context code.
context_order = train['context'].value_counts().index

context_fig, context_ax = plt.subplots(figsize=(20, 6))
sns.countplot(
    data=train,
    x='context',
    order=context_order,
    palette='flare',
    ax=context_ax,
)
plt.xticks(rotation=90)
context_ax.set_title("Distribution of Context", fontsize=20)

plt.show()

**Contexts have the following meaning:**

A: Human Necessities

B: Operations and Transport

C: Chemistry and Metallurgy

D: Textiles

E: Fixed Constructions

F: Mechanical Engineering

G: Physics

H: Electricity

Y: Emerging Cross-Sectional Technologies

In [None]:
# Display label for each one-letter section code.
section_titles = {
    "A": "A - Human Necessities",
    "B": "B - Operations and Transport",
    "C": "C - Chemistry and Metallurgy",
    "D": "D - Textiles",
    "E": "E - Fixed Constructions",
    "F": "F - Mechanical Engineering",
    "G": "G - Physics",
    "H": "H - Electricity",
    "Y": "Y - Emerging Cross-Sectional Technologies",
}

# The section is the first character of the context code; codes found in the
# table above are swapped for their label, anything else is left as is.
train['section'] = train['context'].astype(str).str.get(0)
train = train.replace({"section": section_titles})

section_fig, section_ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=train, x='section', palette='flare', ax=section_ax)
plt.xticks(rotation=90)

### Data Engineering

In [None]:
class CFG:
    """Run switches and hyper-parameters for this notebook, held as class attributes."""

    # --- run control ---
    debug = False  # when True, the override below shortens the run
    train = True
    apex = True  # presumably toggles mixed-precision training; consumer not shown here
    print_freq = 100
    num_workers = 4
    seed = 42

    # --- backbone ---
    # Alternative checkpoints (disabled):
    #   "AI-Growth-Lab/PatentSBERTa_Kaggle_V3"
    #   "google/bigbird-pegasus-large-bigpatent"
    #   "google/pegasus-big_patent"
    #   "Yanhao/simcse-bert-for-patent"
    model = "prithivida/bert-for-patents-64d"
    # NOTE(review): BERT-style encoders commonly cap at 512 positions; confirm
    # this checkpoint accepts 1024 tokens where max_len is consumed.
    max_len = 1024
    fc_dropout = 0.2
    target_size = 1

    # --- optimisation ---
    epochs = 10
    batch_size = 8
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- learning-rate schedule ---
    scheduler = 'cosine'  # one of ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0.1  # NOTE(review): fractional value; looks like a ratio, confirm at the call site

    # --- cross-validation ---
    n_fold = 4
    trn_fold = [3]


# Debug runs are cut down to two epochs on fold 0 only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

In [None]:
# Lookup used to translate each `context` code into text.
# NOTE(review): torch.load unpickles the file, so it should only be pointed at trusted data.
cpc_texts = torch.load("../input/foldsdump/cpc_texts.pth")

# Attach the lower-cased looked-up text to both splits.
for frame in (train, test):
    frame['context_text'] = frame['context'].map(cpc_texts).str.lower()

In [None]:
# Model input: anchor, target and context text joined by the literal '[SEP]' marker.
for frame in (train, test):
    frame['text'] = (
        frame['anchor'] + '[SEP]' + frame['target'] + '[SEP]' + frame['context_text']
    )

# Preview the first rows of each split once both have the new column.
for frame in (train, test):
    display(frame.head())

In [None]:
# Turn the five discrete similarity scores into integer class labels so they
# can serve as the stratification target.
train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

# split() yields *positional* row indices, so the fold ids are written into a
# positional array rather than through label-based `.loc`. This stays correct
# even if `train` no longer has a default 0..n-1 index, and keeps the column
# integer-typed from the start (no NaN-filled float column to cast back).
fold_ids = np.full(len(train), -1, dtype=int)
for n, (_train_index, val_index) in enumerate(Fold.split(train, train['score_map'])):
    fold_ids[val_index] = n
train['fold'] = fold_ids

# Number of rows assigned to each validation fold.
display(train.groupby('fold').size())