Virtual env: (base)

# Imports and Downloads

In [None]:
import regex as re
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from textblob import TextBlob
import spacy
import ast

In [None]:
# Fetch the NLTK resources requested by this notebook, in a fixed order.
for resource_name in (
    'brown',
    'punkt',
    'wordnet',
    'omw-1.4',
    'stopwords',
    'averaged_perceptron_tagger',
):
    nltk.download(resource_name)

# Rule-based methods:

In [3]:
def find_claim_numbers(claim_text):
    """Locate the start of every numbered claim in a claims document.

    Claims are searched for sequentially: first the marker of claim 1
    ("1."), then, after it has been found, the marker of claim 2, and so on.
    A marker glued to another digit on either side (the "2." inside "12.5"
    or "2.5") is part of a number in the claim text, not a claim heading,
    and is skipped.

    :param claim_text: Text of the claims; non-string values are converted
        with ``str()`` first.
    :return: A list of integers with the positions in the string of the start
        of each claim, or ``[0]`` when no claim marker is found at all.
    """
    if not isinstance(claim_text, str):
        claim_text = str(claim_text)

    positions = []
    claim_no = 1
    claim_marker = create_claim_marker(claim_no)
    search_from = 0
    while True:
        hit = claim_text.find(claim_marker, search_from)
        if hit == -1:
            break
        # Whatever happens, the next search resumes one character further on.
        search_from = hit + 1

        marker_end = hit + len(claim_marker)
        preceded_by_digit = hit > 0 and claim_text[hit - 1].isdigit()
        followed_by_digit = (
            marker_end < len(claim_text) and claim_text[marker_end].isdigit()
        )
        if preceded_by_digit or followed_by_digit:
            # Part of a larger number or a decimal value: not a claim heading.
            continue

        positions.append(hit)
        claim_no += 1
        claim_marker = create_claim_marker(claim_no)

    # Unnumbered text is treated as one single claim starting at position 0.
    return positions or [0]


def create_claim_marker(claim_no):
    """Return the string that introduces claim ``claim_no`` in a claims document.

    :param claim_no: The claim number.
    :return: The claim number followed by a period, e.g. ``'3.'``.
    """
    return str(claim_no) + '.'

In [4]:
def convert_claim_text(claim_text):
    """
    Convert the text of the patent claims into a dictionary where the keys are
    the claim numbers (int, starting at 1) and the values are the text of each
    claim (str), with the leading "N." marker removed.

    Claim boundaries come from find_claim_numbers. The marker, plus the single
    whitespace character after it, is stripped only when the marker is really
    present at the reported position. Text without any numbering (for which
    find_claim_numbers falls back to position 0) is therefore returned whole
    as claim 1 instead of losing its first characters.

    :param claim_text: Text of the claims; non-string values are converted
        with ``str()`` first, as find_claim_numbers does.
    :return: dict mapping claim number to claim text.
    """
    if not isinstance(claim_text, str):
        claim_text = str(claim_text)

    starts = find_claim_numbers(claim_text)
    # Each claim runs up to the start of the next one; the last claim runs to
    # the end of the text.
    ends = starts[1:] + [len(claim_text)]

    claim_text_dict = {}
    for claim_number, (start, end) in enumerate(zip(starts, ends), start=1):
        marker = str(claim_number) + '.'
        body_start = start
        if claim_text.startswith(marker, start):
            body_start += len(marker)
            # Drop the separator that follows the marker ("1. An ...").
            if claim_text[body_start:body_start + 1].isspace():
                body_start += 1
        claim_text_dict[claim_number] = claim_text[body_start:end]

    return claim_text_dict

In [5]:
def create_node(root, components, relations):
    """Bundle the three parts of a claim into a single node dictionary.

    :return: dict with the keys 'root', 'components' and 'relations' (in that
        order) holding the arguments as given.
    """
    return {
        'root': root,
        'components': components,
        'relations': relations,
    }

In [6]:
def get_claim_type(claim):
    """Classify a claim as a 'method' or an 'apparatus' claim.

    Only the first 20 characters are inspected: the claim is a 'method' claim
    when they contain the (lower-case) word 'method', otherwise 'apparatus'.
    """
    opening = claim[:20]
    return 'method' if 'method' in opening else 'apparatus'


In [7]:
# spaCy English pipeline; create_subtree reads its noun chunks to tag sub-elements.
nlp = spacy.load("en_core_web_sm")
# Counts the claims that fully matched the punctuation pattern in create_subtree.
punctuation_match = 0
# Collects the nodes that create_subtree appends while it processes claim elements.
res = []
def create_subtree(row):
    """Split claim 1 of a patent into a root, its components and its relations.

    The claim is segmented as follows:

    * If it contains 'comprising' and fully matches the punctuation pattern
      "<root> comprising:, <element>; ... <last element>.,", the root is the
      text up to and including the first 'comprising' and the elements are the
      ';'-separated parts after it (each gets a trailing ';'). Text after
      'wherein' in the last element becomes the relations.
    * Otherwise, if it contains 'comprising', the root is the text before the
      first 'comprising' and the elements are the ';'-separated parts after it.
    * Otherwise, if it contains 'wherein', the root is the text before the
      first 'wherein', the relations are the text after it, and there are no
      elements.
    * Otherwise the whole claim is the root.

    A ' [LB] ' marker is then inserted before the first standalone word 'for'
    in the root, and noun chunks after the first one that start with 'a' are
    prefixed with '[sub] ' inside each element.

    Side effects: prints ``row.name``, increments the module-level
    ``punctuation_match`` counter on a pattern match and appends nodes to the
    module-level ``res`` list.

    :param row: DataFrame row whose 'claims_text_dict' entry maps claim
        numbers to claim text; only claim 1 is used.
    :return: The node built by create_node(root, elements, relations).
    """
    global punctuation_match
    global res
    # Only independent claim 1 is turned into a subtree.
    claim = row['claims_text_dict'][1]
    print(row.name)
    relations = ''
    # With re.VERBOSE the literal spaces inside the pattern are ignored.
    regex_ex = re.compile(r'(.*?) comprising:, ((.*?);){1,}(.*?).,', re.VERBOSE | re.IGNORECASE)

    # Normalise the ending so the pattern can match the whole claim.
    if not claim.endswith('.,'):
        claim = claim + '.,'

    # The pattern ignores case but the slicing below looks for the lower-case
    # word, so both must hold; otherwise claim.index would raise ValueError.
    if 'comprising' in claim and regex_ex.fullmatch(claim):
        punctuation_match += 1
        # Slice on the first 'comprising' instead of splitting: the word may
        # occur several times and must stay in the root.
        split_at = claim.index("comprising") + len('comprising')
        root = claim[:split_at].strip()
        elements = [elt + ';' for elt in claim[split_at:].strip().split(';')]
        if 'wherein' in elements[-1]:
            wherein_end = elements[-1].index('wherein') + len('wherein')
            relations = elements[-1][wherein_end:]
            elements[-1] = elements[-1][:wherein_end]
    elif 'comprising' in claim:
        root = claim.split('comprising')[0]
        elements = claim[claim.index("comprising") + len('comprising'):].strip().split(';')
    elif 'wherein' in claim:
        # No component list: everything after the first 'wherein' is a
        # relation. 'elements' must be a list, never a string, because the
        # loop below would otherwise walk it character by character.
        root, _, relations = claim.partition('wherein')
        elements = []
    else:
        elements = []
        root = claim

    # Mark the purpose of the invention with a line break before the first
    # standalone 'for'. The word-boundary search avoids splitting inside words
    # such as 'platform', and slicing keeps all the text that follows.
    purpose = re.search(r'\bfor\b', root)
    if purpose and len(TextBlob(root).noun_phrases) > 0:
        root = root[:purpose.start()] + ' [LB] for' + root[purpose.end():]

    for elt in elements:
        doc = nlp(elt.strip())
        # The first noun chunk is skipped (presumably the element's own
        # subject - NOTE(review): confirm); the rest are sub-element candidates.
        sub_chunks = [str(chunk) for chunk in list(doc.noun_chunks)[1:]]

        # Tag candidates with '[sub]'.
        # NOTE(review): startswith('a') also accepts chunks such as
        # 'air inlet' or 'at least one ...', not only the articles 'a'/'an'.
        new_elt = elt
        for se in sub_chunks:
            if se.startswith('a'):
                new_elt = new_elt.replace(se, '[sub] ' + se)
        elements = [x.replace(elt, new_elt) for x in elements]
        # NOTE(review): this appends one node per element, not one per claim.
        res.append(create_node(root, elements, relations))
    return create_node(root, elements, relations)

In [8]:
def create_segments_list(claim_tree):
    """Flatten a claim tree into words plus segment-end flags.

    The words of the root, of every component and of the relations are
    concatenated in that order. The flag list has the same length as the word
    list; a flag is 1 when its word ends a segment and 0 otherwise. A segment
    ends at the last word of the root, at the last word of each component and
    at the word that precedes a '[LB]' or '[sub]' marker. The markers
    themselves are not part of the output, and relation words are always
    flagged 0.

    Parts that contain no words (empty root, empty component) contribute
    nothing to either list, so both lists always stay aligned.

    :param claim_tree: dict with the keys 'root' (str), 'components' (list of
        str) and 'relations' (str), or the string representation of such a
        dict (parsed with ast.literal_eval).
    :return: tuple ``(words, flags)``.
    """
    if isinstance(claim_tree, str):
        claim_tree = ast.literal_eval(claim_tree)

    markers = {'[LB]', '[sub]'}

    # (words of the part, whether its last word closes a segment)
    sections = [(claim_tree['root'].split(), True)]
    sections.extend(
        (component.split(), True) for component in claim_tree['components']
    )
    sections.append((claim_tree['relations'].split(), False))

    final_result = []
    final_result_binary = []
    for section_words, closes_segment in sections:
        last_position = len(section_words) - 1
        for position, word in enumerate(section_words):
            if word in markers:
                # A marker ends the segment of the word emitted just before it
                # and is itself dropped from the output.
                if final_result_binary:
                    final_result_binary[-1] = 1
                continue
            final_result.append(word)
            is_last = closes_segment and position == last_position
            final_result_binary.append(1 if is_last else 0)

    return final_result, final_result_binary
    
    
    

# Apply rule-based methods to segment the claims

In [9]:
# Load the patent table and drop every row whose claims_text is NaN.
df = pd.read_csv('./uspto_df').dropna(subset=['claims_text'])

In [10]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,grant_id,patent_title,kind,number_of_claims,inventors,citations_applicant_count,citations_examiner_count,claims_text,abstract
0,0,USD0961886,Candy,Design Patent,1,"['Rhett Vance Barney', 'Chad Taylor Robinson',...",2,10,"The ornamental design for candy, as shown and ...",
1,1,USD0961887,Garment,Design Patent,1,['Wenchang Hu'],0,9,"The ornamental design for a garment, as shown ...",
2,2,USD0961888,Vest,Design Patent,1,['Izzy Benoliel'],0,11,"The ornamental design for a vest, as shown and...",
3,3,USD0961889,Headband with LED lights,Design Patent,1,['Joshua Chen'],0,26,The ornamental design for a headband with LED ...,
4,4,USD0961890,Backless baseball cap,Design Patent,1,['Adrienne Walker'],1,21,"The ornamental design for a backless ball cap,...",


In [11]:
# Keep only the rows from position 150000 onwards.
df = df.iloc[150000:]

In [12]:
# Parse every raw claims string with convert_claim_text and store the
# resulting per-claim dictionaries in a new column.
claim_text_dict = df['claims_text'].apply(convert_claim_text)
df['claims_text_dict'] = claim_text_dict

In [None]:
# Build the claim tree of claim 1 for every row (create_subtree is applied
# row by row) and store it in a new column.
claim_subtree = df.apply(create_subtree,axis=1)
df['claim_tree'] = claim_subtree

In [14]:
# NOTE(review): this repeats the assignment made at the end of the previous cell.
df['claim_tree'] = claim_subtree

In [15]:
# Preview the table with the new claims_text_dict and claim_tree columns.
df.head()

Unnamed: 0.1,Unnamed: 0,grant_id,patent_title,kind,number_of_claims,inventors,citations_applicant_count,citations_examiner_count,claims_text,abstract,claims_text_dict,claim_tree
150001,150001,US11300077,Deployable fairing for door reversers systems ...,Utility Patent Grant (with a published applica...,8,['Timothy Gormley'],27,1,1. An actuation arrangement for a thrust rever...,"A thrust reverser may include a frame, an actu...",{1: 'An actuation arrangement for a thrust rev...,{'root': 'An actuation arrangement [LB] for a...
150002,150002,US11300078,Variable thrust catapult,Utility Patent Grant (with a published applica...,20,"['Jeff Benjamin', 'Matthew D. Salois']",2,4,1. A rocket catapult assembly for use in an ej...,A rocket catapult assembly for an ejection sea...,{1: 'A rocket catapult assembly for use in an ...,{'root': 'A rocket catapult assembly [LB] for...
150003,150003,US11300079,Diagnostic apparatus for evaporative fuel proc...,Utility Patent Grant (with a published applica...,10,"['Daisuke Kugo', 'Masahiro Ono']",13,4,1. A diagnostic apparatus for an evaporative f...,A diagnostic apparatus for an evaporative fuel...,{1: 'A diagnostic apparatus for an evaporative...,{'root': 'A diagnostic apparatus [LB] for an ...
150004,150004,US11300080,Fuel tank protector valve and engine systems h...,Utility Patent Grant (with a published applica...,20,"['Chester E. Duffield, III']",3,25,"1. A fuel tank protector valve comprising:,a h...",A dual chamber fuel tank protector valve has a...,"{1: 'A fuel tank protector valve comprising:,a...",{'root': 'A fuel tank protector valve comprisi...
150005,150005,US11300081,Engine intake bypass system,Utility Patent Grant (no published application...,10,['Il Suk Yang'],1,2,"1. An engine intake bypass system comprising:,...",An engine intake bypass system includes: an in...,{1: 'An engine intake bypass system comprising...,{'root': 'An engine intake bypass system compr...


In [16]:
# Save the table, including the claim trees, to CSV.
df.to_csv('./uspto_df_segmented_claims_4.csv')

# Get the list of segments with output for embeddings

In [17]:
# create_segments_list returns a (segments, binary flags) pair per claim tree;
# the pairs are unpacked into two new columns aligned on df's index.
output = df['claim_tree'].apply(create_segments_list)
df[['claim_segments', 'claim_segments_binary']] = pd.DataFrame(output.tolist(),index=df.index)

In [18]:
# Save the table with the segment columns to CSV.
df.to_csv('./uspto_df_final_4.csv')

## Example: the sequence of segments of a segmented claim

In [4]:
# Pick the claim tree stored under index label 3400 as a worked example.
example_tree = df.loc[3400,'claim_tree']

In [5]:
# Display the example claim tree.
example_tree

"{'root': 'A cartridge configured  [LB] for a non-lethal self-protection system, the cartridge comprising', 'components': [':,a) a housing having [sub] a barrel;', ' b) [sub] a compressed gas vessel carried by the housing and containing [sub] a compressed gas, the compressed gas vessel having two positions comprising: i) [sub] a retained position farther from the barrel, and ii) a released position closer to the barrel;', ' c) a spring carried by the housing to bias the compressed gas vessel towards the released position;', ' d) a retainer carried by the housing and selectively retaining the compressed gas vessel in the retained position;', ' e) a projectile carried by the housing and positioned in front of the compressed gas vessel, the projectile containing [sub] an irritant;', ' f) a diffuser positioned between the projectile and the compressed gas vessel, the diffuser comprising [sub] a passage therethrough configured to spread out gas from the compressed gas vessel behind the proj

In [None]:
# Flatten the example tree into its word list and segment-end flags.
final_result, final_result_binary = create_segments_list(example_tree)

In [None]:
# Sanity check: both lists should have the same length.
len(final_result)== len(final_result_binary)

In [2]:
# Load './220503_df_final.csv' into df (this rebinds the df variable).
import pandas as pd
df = pd.read_csv('./220503_df_final.csv')

In [8]:
# Spot-check row 0: print the stored segments and the raw claim text, then
# compare the number of segments with the number of words in the claim.
print(df['claim_segments'][0])
print(df['claims_text'][0])
# NOTE(review): claim_segments was read back from CSV, so it is presumably a
# str (the repr of the list) rather than a list; if so, len() counts
# characters here - verify before relying on this comparison.
print(len(df['claim_segments'][0])==len(df['claims_text'][0].split()))

['ornamental', 'design', 'for', 'a', 'hot', 'dog', 'pet', 'treat,', 'as', 'shown', 'and', 'described.']
The ornamental design for a hot dog pet treat, as shown and described.
False


# Other useful Methods

In [140]:
def is_only_relation(text):
    """Return True when ``text`` contains no ':' character, False otherwise."""
    return ':' not in text

In [None]:
# Noun phrases TextBlob finds in a sample element, without the first one.
TextBlob('a headband member having a frontal portion and a mirror'.strip()).noun_phrases[1:]

WordList(['frontal portion'])

In [None]:
# Importing the required libraries
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Print the noun phrases TextBlob extracts from a sample claim element.
blob = TextBlob('an eye shield member removably secured to said frontal portion.')
print(blob.noun_phrases)

['eye shield member', 'frontal portion']


In [None]:
# Print the proper nouns found in a sample claim fragment.
sentences = nltk.sent_tokenize('a receive antenna configured to control generation of an ultra-wideband radar signal and reception of one or more resultant signals;the transmit antenna is angled relative to the receive antenna.')
# Build the stop-word set once instead of once per word.
english_stopwords = set(stopwords.words('english'))
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [word for word in words if word not in english_stopwords]
    tagged = nltk.pos_tag(words)
    for (word, tag) in tagged:
        # nltk.pos_tag emits Penn Treebank tags, in which proper nouns are
        # 'NNP' (singular) and 'NNPS' (plural); 'NP' is never produced.
        if tag in ('NNP', 'NNPS'):
            print(word)

In [152]:
# Get the synonyms of a word.
# The word needs to be lemmatised beforehand.
lemmatizer = WordNetLemmatizer()


def synonym_extractor(phrase):
    """Return the lemma names of every WordNet synset found for ``phrase``.

    Names are listed synset by synset, in the order WordNet returns them;
    duplicates are kept.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(phrase)
        for lemma in synset.lemmas()
    ]