# Libraries

In [None]:
# Script to install required packages via pip
import os

packages = [
    'numpy',
    'ipywidgets',
    'torch',
    'matplotlib',
    'scikit-learn',
    'seaborn',
    'transformers',
    'datasets',
    'evaluate',
    'sentence-transformers'
]

# Install each package
for package in packages:
    os.system(f'pip install {package}')

In [170]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.2.5


In [None]:
pip install -U sentence-transformers



## import functions

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display, clear_output, HTML
import ipywidgets as widgets
import torch
from torch import nn
import math
import matplotlib.pyplot as plt
from collections import Counter
import re
import csv
import xml.etree.ElementTree as ET
import ast
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from torch.utils.data import Dataset, DataLoader
import warnings
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel
from datasets import Dataset
import evaluate
import seaborn as sns
import joblib
from sentence_transformers import util
import torch

In [181]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

#Set up computing device

In [None]:
# Check if CUDA (GPU) is available
if torch.cuda.is_available():
    setup_device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    setup_device = torch.device('cpu')
    print("Using CPU")

Using GPU: Tesla T4


# Load Model

In [None]:
# Load the trained model and encoders
loaded_classifier = joblib.load('/content/models/ipc_section_classifier.pkl')
loaded_label_encoder = joblib.load('/content/models/ipc_section_label_encoder.pkl')
# Initialize the SentenceTransformer model (PatentSBERTa)
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa', device=setup_device)



#Load database

In [None]:
eval_file_path = '/content/data/eval_dataset.csv'
eval_df = pd.read_csv(eval_file_path)
eval_df.drop_duplicates(subset=['publication_number'], keep='first', inplace=True)
print("Done")

Done


##Load the embeddings of database

In [None]:
# To load document embeddings
document_embeddings = np.load('/content/data/eval_dataset_v3.npy')

In [None]:
copy_eval_df = eval_df.copy()

In [None]:
copy_eval_df.head()

Unnamed: 0,publication_number,ipc,claims,ipc_section
0,1447564,F04C5/00,rotor for cooling pumps comprising a core to b...,F
1,122080,"C12N15/00, C12P21/00, C12N9/52, A61K39/395",a process for the production of a soluble nati...,C
2,1134776,"H01J61/12, H01J61/82",a high pressure mercury vapor discharge lamp h...,H
3,1382403,"B21D53/08, F28F1/04, F02K9/64, F02K9/97",a method for forming an article 10 having a wa...,B
4,3152547,G01N21/63,what is claimed is a method of aligning a lig...,G


In [None]:
copy_eval_df['claims'] = document_embeddings.tolist()

In [None]:
copy_eval_df.head()

Unnamed: 0,publication_number,ipc,claims,ipc_section
0,1447564,F04C5/00,"[0.3121407926082611, -1.1214916706085205, -0.0...",F
1,122080,"C12N15/00, C12P21/00, C12N9/52, A61K39/395","[0.19636858999729156, -0.6120919585227966, -0....",C
2,1134776,"H01J61/12, H01J61/82","[0.04835645854473114, -0.3993348479270935, -0....",H
3,1382403,"B21D53/08, F28F1/04, F02K9/64, F02K9/97","[-0.11545654386281967, -0.37583962082862854, -...",B
4,3152547,G01N21/63,"[0.26978930830955505, -0.40174582600593567, -0...",G


#Define program

In [186]:
# Function to take user input and predict the IPC section
def predict_ipc_section():
    input_records = []
    pred_records = []

    while True:
        # Take user input (claims)
        user_input = input("Enter the claims (or type 'exit' to quit): ")

        if user_input.lower() == 'exit':
            print("Exiting the prediction prompt.")
            #-------internal processing
            #-------Create a dataframe to store the result, and export it as pdf
            result_df = pd.DataFrame()
            #-------
            for idx, target in enumerate(pred_records):

                #similar_patents = eval_df[eval_df['ipc_section']  == target]
                similar_patents = copy_eval_df[copy_eval_df['ipc_section']  == target]

                similar_app_nums = similar_patents['publication_number'].tolist()
                similar_claims = similar_patents['claims'].tolist()

                # add user input at the begining of the list
                # similar_claims.insert(0, input_records[idx])

                # calculate the relevance/similarity
                # Step 1: Generate BERT embeddings for each document
                # embeddings = model.encode(similar_claims)
                embedding_input = model.encode(input_records[idx])
                similar_claims.insert(0, embedding_input)
                embeddings = np.array(similar_claims)

                # Step 2: Compute cosine similarity between embeddings

                embeddings = torch.from_numpy(embeddings)
                embeddings = embeddings.to(setup_device)

                similarity_matrix = model.similarity(embeddings, embeddings)
                ### NOTICE:
                #   with GPU of 16 GB, maximum 39000 claims can be computed.
                #   If try to compute more than 39000 claims, the system is likely to crash
                ### -----

                similarity_matrix = similarity_matrix.cpu().numpy()

                # Step 3: Create a pandas DataFrame for reporting
                names = []
                for i in range(len(similar_claims)):
                    if i == 0:
                        name = 'doc_' + str(idx)
                        names.append(name)
                    else:

                        names.append(similar_app_nums[i-1])
                #print('OK')
                df_sim = pd.DataFrame(similarity_matrix, columns=[names], index=[names])
                df_sim.to_csv('df_sim_saved.csv')
                #print('OK')
                #df_sim = df_sim.set_index(names)
                #print(df_sim)
                #Report of the input document
                print(f"document {idx}")
                print(f"content: {input_records[idx]}")
                print(f"IPC: {target}")
                print(f"Prior arts:")
                doc_column = df_sim[names[0]]
                #print(type(doc_column))
                print(doc_column)

                print(f"the top 10 most similar prior arts:")
                trial = doc_column.values.tolist()
                #print(type(trial))
                #print(len(trial), trial)

                values = np.array(trial).reshape(-1)
                #print(values.shape)
                pub_nums = []
                for idx_1 in df_sim.index:
                    pub_nums.append(idx_1)
                #print(pub_nums)

                index_largest = np.argpartition(values, -10)[-10:]
                #print(index_largest)
                pub_res = []
                pub_score = []
                for idx_2, elem in enumerate(index_largest):

                    print(f"publication number: {pub_nums[elem]}, similar score: {values[elem]}")
                    pub_res.append(pub_nums[elem])
                    pub_score.append(round(values[elem],3))

                result = {'Document input number': idx,'Input claim ': input_records[idx], 'ipc section': target, 'relevant publication numbers': pub_res, 'corresponding similarity score': pub_score}
                result_df = result_df._append(result, ignore_index=True)

                #largest_10_values = doc_column[names[0]].nlargest(10)
                #print(largest_10_values)

                #print(pub_nums)
            # save result
            result_df.to_csv('result_df.csv', index=False)
            return #input_records, pred_records
            #-------internal processing

            #break

        input_records.append(user_input)
        # Encode the input claims into embeddings
        claims_embedding = model.encode([user_input])

        # Predict using the loaded classifier
        predicted_encoded_section = loaded_classifier.predict(claims_embedding)

        # Decode the predicted section back to the IPC format
        predicted_section = loaded_label_encoder.inverse_transform(predicted_encoded_section)
        pred_records.append(predicted_section[0])

        # Show the prediction
        print(f"Predicted IPC Section: {predicted_section[0]}")

# Run program

In [190]:
# Start the interactive prompt
predict_ipc_section()

Enter the claims (or type 'exit' to quit): rotor for cooling pumps comprising a core to be assembled on a shaft connected with engine means and a body, fitted in the said core, provided with a plurality of radial tabs of flexible material, characterized in that said core and said body with said tabs are both of a material like the rubber, but with different hardness.rotor for cooling pumps according to claim 1, characterized in that said core is made of a mixture of neoprene, nitrile, pvc and aramidic fiber.rotor according to claim 2, characterized in that said aramidic fiber is kevlar.rotor according to claim 2, characterized in that said core is made of a mixture comprising polychloroprene30 to 50acrylonitrile  pvc50 to 80aramidic fiber30 to 50silica30 to 50resin30 to 50zinc oxide30 to 50sulphur30 to 50 the said percentages being expressed in weight.rotor according to claim 4, characterized in that said core is made of a mixture comprising polychloroprene25acrylonitrile  pvc25aramidi

#Check

In [None]:
eval_df.head()

Unnamed: 0,publication_number,ipc,claims,ipc_section
0,1447564,F04C5/00,rotor for cooling pumps comprising a core to b...,F
1,122080,"C12N15/00, C12P21/00, C12N9/52, A61K39/395",a process for the production of a soluble nati...,C
2,1134776,"H01J61/12, H01J61/82",a high pressure mercury vapor discharge lamp h...,H
3,1382403,"B21D53/08, F28F1/04, F02K9/64, F02K9/97",a method for forming an article 10 having a wa...,B
4,3152547,G01N21/63,what is claimed is a method of aligning a lig...,G


In [None]:
eval_df[eval_df['publication_number']  == 1844572]['claims'].tolist()[0]

'a method for generating and authenticating a cryptographic key k for reciprocal authentication of a first station 1 and at least a second station 2 between which a communication is to be established, comprising generating a first carrier x1 and at least a second carrier x2 at said first station 1 and said second station 2, respectively performing a data exchange, comprising said carriers x1, x2, between said first and second stations 1, 2 through a communication channel h extracting, from the data received by said first and second stations 1, 2, information corresponding to at least one feature of said communication channel h and generating said cryptographic key k at said first and second stations 1, 2, based on said extracted informationsaid method being characterized in that said first carrier x1 is known exclusively to said first station 1 and that said second carrier x2 is known exclusively to said second station 2.a method according to any one of the preceding claims, wherein sa

#Check saved csv

In [191]:
res_file_path = '/content/result_df.csv'
res_df = pd.read_csv(res_file_path)

In [192]:
res_df

Unnamed: 0,Document input number,Input claim,ipc section,relevant publication numbers,corresponding similarity score
0,0,rotor for cooling pumps comprising a core to b...,F,"[(2419608,), (1163443,), (799974,), (2904212,)...","[0.653, 0.692, 0.676, 0.687, 0.66, 0.654, 0.66..."
1,1,A method for securing communications over a ne...,H,"[(79706,), (2840819,), (1908238,), (1125419,),...","[0.464, 0.464, 0.468, 0.47, 0.474, 0.478, 0.48..."
2,2,"The method of claim 1, further comprising the ...",B,"[(2419189,), (2935016,), (2951083,), (1735098,...","[0.65, 0.653, 0.653, 0.661, 0.662, 0.716, 0.66..."
3,3,"The method of claim 5, further comprising the ...",A,"[(2699280,), (3106092,), (1919371,), (3130252,...","[0.657, 0.658, 0.667, 0.673, 0.673, 0.664, 0.6..."
