# U.S. Patent Phrase to Phrase Matching

Patent Phrase Matcher

Tool: Compares similarity between patent phrases (0-1 score)

Examples:
"mobile phone" vs "cellphone" → high match
"solar panel" vs "car engine" → low match

Tech: DistilBERT + PyTorch + Gradio

Value: Quick patent search & comparison with no technical expertise needed

Dataset: https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/similarity-predictor

In [3]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Read the competition CSVs; training is restricted to a reproducible
# 5000-row random subset to keep the run short.
train_df = pd.read_csv(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv'
)
train_df = train_df.sample(n=5000, random_state=42)
test_df = pd.read_csv(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv'
)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained encoder and its matching tokenizer.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Regression head: one linear unit over a 768-wide input, squashed by a
# sigmoid so the output lies in (0, 1).
head = torch.nn.Sequential(
    torch.nn.Linear(in_features=768, out_features=1),
    torch.nn.Sigmoid(),
)
head = head.to(device)

# A single optimizer updates the encoder and the head together.
optimizer = torch.optim.AdamW(
    [*model.parameters(), *head.parameters()],
    lr=3e-5,
)

# Fine-tune the encoder and the regression head jointly for 10 epochs,
# walking over train_df in fixed-order mini-batches.
batch_size = 32
num_epochs = 10

# The loss module holds no state, so build it once instead of on every step.
loss_fn = torch.nn.MSELoss()

for epoch in range(num_epochs):
    # Put both trainable modules into training mode.
    model.train()
    head.train()
    total_loss = 0
    progress_bar = tqdm(range(0, len(train_df), batch_size),
                        desc=f'Epoch {epoch+1}/{num_epochs}')

    for i in progress_bar:
        batch = train_df.iloc[i:i+batch_size]
        # Anchor and target are tokenized together as a sentence pair.
        encoded = tokenizer(batch['anchor'].tolist(), batch['target'].tolist(),
                            padding=True, truncation=True, max_length=64,
                            return_tensors='pt').to(device)
        scores = torch.tensor(batch['score'].values, dtype=torch.float).to(device)

        optimizer.zero_grad()
        # [0] is the first element of the encoder output and [:, 0, :] takes
        # the first token's vector of each sequence as the pair embedding.
        # squeeze(-1) removes only the head's trailing size-1 dimension; a
        # bare squeeze() would also drop the batch dimension for a one-row
        # batch and leave a 0-d tensor that no longer matches `scores`.
        output = head(model(**encoded)[0][:, 0, :]).squeeze(-1)
        loss = loss_fn(output, scores)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # i // batch_size + 1 is the number of batches seen so far this
        # epoch, so the bar shows the running mean loss.
        progress_bar.set_postfix({'loss': total_loss/(i//batch_size + 1)})

# Score every test row in mini-batches and collect the results as a flat
# list of Python floats, in the same order as test_df.
predictions = []
model.eval()
head.eval()
with torch.no_grad():
    for i in tqdm(range(0, len(test_df), batch_size), desc='Predicting'):
        batch = test_df.iloc[i:i+batch_size]
        encoded = tokenizer(batch['anchor'].tolist(), batch['target'].tolist(),
                            padding=True, truncation=True, max_length=64,
                            return_tensors='pt').to(device)
        # squeeze(-1) removes only the head's trailing size-1 dimension, so
        # the result stays 1-D even when the final batch holds a single row.
        # With a bare squeeze() that case produced a 0-d value whose
        # tolist() is a plain float, and predictions.extend() then raised
        # TypeError because a float is not iterable.
        pred = head(model(**encoded)[0][:, 0, :]).squeeze(-1)
        predictions.extend(pred.cpu().tolist())

# Save submission: one row per test id with its predicted score.
pd.DataFrame({'id': test_df['id'], 'score': predictions}).to_csv('submission.csv', index=False)

KeyboardInterrupt: 

In [2]:
# Persist everything needed to reload the predictor: the encoder and the
# tokenizer go through their own save_pretrained(), and the regression
# head's weights are written next to them as head.pt.
output_dir = "patent_similarity_model"
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)
head_weights = head.state_dict()
torch.save(head_weights, output_dir + "/head.pt")