In [1]:
import urllib.request, urllib.parse, urllib.error, urllib3, re
import pandas as pd
from IPython.display import display, HTML, Image

In [2]:
# Do not truncate long cell contents (titles, abstracts, HTML anchors) when
# DataFrames are displayed; None removes the column-width limit.
pd.set_option('display.max_colwidth', None)

def render(df):
    """Show *df* in the notebook as an HTML table.

    Missing values are blanked out, URLs are turned into clickable links and
    cell contents are emitted unescaped so that HTML stored in the cells
    (e.g. anchors or ``<div>`` wrappers) is rendered rather than shown as text.
    """
    blanked = df.fillna('')
    markup = blanked.to_html(render_links=True, escape=False)
    display(HTML(markup))

    
    
def _util_add_links(df_in, patent_df_column='index'):
    """Return a copy of ``df_in`` with Google Patents links added.

    :arg df_in: DataFrame holding patent / publication numbers. It is never
        modified; a deep copy is returned.
    :arg patent_df_column: where the patent numbers live. One of

        * ``'index'`` (default) - the numbers are the DataFrame index, which
          is replaced by the links;
        * a column label (non-numeric string);
        * a column position, given as an int or as a digit string (``'0'``).
          Note that a digit string is always treated as a position, even if
          a column happens to carry that label.

        For the column forms a new ``'link'`` column is inserted immediately
        before the source column; the source column is kept so the plain
        number stays searchable.
    :returns: the copied DataFrame with links in the index or in ``'link'``.

    Numbers that start with a digit have ``/`` removed, leading zeroes
    stripped and ``US`` prepended; anything else (e.g. already ``US...``) is
    used as-is after removing ``/``. Empty cells yield an empty string
    instead of a link.
    """
    # Work on a copy so that the original df input doesn't get changed.
    df = df_in.copy(deep=True)

    if patent_df_column == 'index':
        list_of_patents = df.index.tolist()
    # If the column is entered as a string, either
    # a number string ('0') or a column label string ('column'):
    elif isinstance(patent_df_column, str):
        if patent_df_column.isdigit():
            patent_df_column_number = int(patent_df_column)
            list_of_patents = df.iloc[:, patent_df_column_number].tolist()
        else:
            list_of_patents = df[patent_df_column].tolist()
            patent_df_column_number = df.columns.get_loc(patent_df_column)
    # Otherwise it's entered as a number
    else:
        list_of_patents = df.iloc[:, patent_df_column].tolist()
        patent_df_column_number = patent_df_column

    link_template = (
        '<a href="https://patents.google.com/patent/{patent}" '
        'target="_blank">{patent}</a>'
    )

    list_patent_links = []
    for patent in list_of_patents:
        patent = str(patent).replace('/', '').strip()
        if not patent:
            # Nothing to link to; avoid indexing into an empty string.
            list_patent_links.append('')
            continue
        if patent[0].isdigit():
            # Strip leading zeroes textually (rather than via int()) so that
            # numbers with a kind-code suffix such as '7660058B2' still work.
            patent = 'US' + (patent.lstrip('0') or '0')
        list_patent_links.append(link_template.format(patent=patent))

    if patent_df_column == 'index':
        df.index = list_patent_links
    else:
        # The original (non-link) column is deliberately kept so that the
        # plain id remains present and searchable.
        df.insert(patent_df_column_number, 'link', list_patent_links)

    return df

    
    
    
    
def _get_html(page_number, query, uspat, usapp, pct, stem):
    """Fetch one page of FreePatentsOnline search results as text.

    :arg page_number: 1-based result page to fetch. Page 1 uses a slightly
        different URL layout from later pages.
    :arg query: search expression, e.g. ``ttl/"nanopore membrane"``. It is
        URL-quoted here (double quotes are left intact).
    :arg uspat, usapp, pct, stem: ``"on"`` to enable the corresponding
        database / stemming option; any other value is sent as ``"off"``.
    :returns: the response body decoded as UTF-8.
    :raises urllib.error.URLError: if the page cannot be opened or read; the
        underlying error is chained as ``__cause__``.
    """
    # Clean up query before passing to FreePatentsOnline:
    query = urllib.parse.quote_plus(query, safe='"')

    # Anything other than the literal "on" is treated as "off".
    uspat, usapp, pct, stem = (
        flag if flag == "on" else "off" for flag in (uspat, usapp, pct, stem)
    )

    # Can search all years or only last 20 (date_range=last20); all years
    # are searched here.
    other_search_terms = (
        '&uspat=' + uspat + '&usapp=' + usapp + '&pct=' + pct
        + '&date_range=all&stemming=' + stem
    )

    if page_number == 1:
        url = (
            'http://www.freepatentsonline.com/result.html?p=1&edit_alert='
            '&srch=xprtsrch&query_txt=' + query + other_search_terms
            + '&sort=relevance&search=Search'
        )
    else:
        url = (
            "http://www.freepatentsonline.com/result.html?p=" + str(page_number)
            + '&srch=xprtsrch&query_txt=' + query + other_search_terms
            + '&sort=relevance'
        )

    # Fail loudly with the offending URL instead of swallowing the error and
    # then tripping over an unbound response object. OSError covers URLError,
    # HTTPError, timeouts and connection resets.
    try:
        with urllib.request.urlopen(url) as response:
            raw = response.read()
    except OSError as err:
        raise urllib.error.URLError("Could not open " + url) from err

    return raw.decode()



def _get_first_search_page(query, uspat, usapp, pct, stem):
    """Fetch and parse the first results page for ``query``.

    :arg query: search expression passed through to ``_get_html``.
    :arg uspat, usapp, pct, stem: ``"on"``/``"off"`` switches passed through
        to ``_get_html``.
    :returns: ``(num_hits, num_pages, hits)`` where ``num_hits`` is the total
        reported by the site, ``num_pages`` is the number of 50-hit pages
        needed to cover them, and ``hits`` is the list of
        ``(ipnum, title, abstract, score)`` tuples parsed from page 1.
        ``(0, 0, [])`` is returned when no hit count is found in the page.
    """
    hits_per_page = 50

    retrieved_html = _get_html(1, query, uspat, usapp, pct, stem)

    # The hit count appears in html such as
    # "Matches 1 - 50 out of 5409                </td>".
    # Raw string: "\d" and "\s" are regex escapes, not Python string escapes.
    list_num_hits = re.findall(
        r"Matches.*out of (\d+)\s+</td", retrieved_html, re.IGNORECASE
    )
    if not list_num_hits:
        return 0, 0, []

    # The count occurs more than once on a page; the first occurrence is used.
    num_hits = int(list_num_hits[0])

    # Ceiling division in pure integer arithmetic.
    num_pages = (num_hits + hits_per_page - 1) // hits_per_page

    _, first_ipnum_title_abstract_score = _get_ipnum_title_abstract_score(retrieved_html)
    return num_hits, num_pages, first_ipnum_title_abstract_score



def _get_remaining_search_pages(query, num_pages, uspat, usapp, pct, stem):
    """Collect the hits from result pages 2 through ``num_pages``.

    Each page is fetched with ``_get_html`` (the page number is its first
    argument) and parsed with ``_get_ipnum_title_abstract_score``; the parsed
    hit tuples from all pages are returned as one flat list, in page order.
    An empty list is returned when ``num_pages`` is below 2.
    """
    # Lazily fetch + parse one page at a time, in ascending page order.
    parsed_pages = (
        _get_ipnum_title_abstract_score(
            _get_html(page, query, uspat, usapp, pct, stem)
        )
        for page in range(2, num_pages + 1)
    )
    return [hit for _parsed_count, page_hits in parsed_pages for hit in page_hits]




def _get_ipnum_title_abstract_score(retrieved_html):
    """Parse one FreePatentsOnline results page into hit tuples.

    :arg retrieved_html: the page text as returned by ``_get_html``.
    :returns: ``(num_hits_parsed, hits)`` where ``hits`` is a list of
        ``(ipnum, title, abstract, score)`` string tuples, one per
        number/title link found. ``ipnum`` has every ``y`` and ``/``
        removed (so ``y2010/0030167`` becomes ``20100030167``); ``abstract``
        has ``<br/>`` removed and surrounding whitespace stripped.

    The three fields are extracted by independent regexes and paired by
    position. If fewer abstracts or scores than titles are found, the
    missing values are filled with ``""`` rather than raising; a mismatch
    between titles and abstracts is still reported on stdout.
    """
    # Number and title come from html such as (for issued patents):
    # <a href="/7660058.html">Methods for etching layers ...</a>
    # US publications look like y2010/0030167, i.e. US20100030167.
    # Raw strings keep "\d" a regex escape rather than a string escape.
    list_ipnum_title = re.findall(
        r'<a href="/(.+\d+).html">(.+)</a>', retrieved_html
    )
    # Partial abstract, if any is provided (some entries have none):
    list_abstract = re.findall(
        r'</a>.+?&nbsp;(.+?)</td>', retrieved_html, re.DOTALL
    )
    # The relevance "score":
    list_score = re.findall(
        r"<td width='5%'>\n.+?(\d+).+?</td>", retrieved_html, re.DOTALL
    )

    num_hits_parsed = len(list_ipnum_title)

    # ERROR CHECKING: titles and abstracts are expected to line up.
    if num_hits_parsed != len(list_abstract):
        print ("ERROR: number of abstracts retrieved is not equal to number of numbers/titles!")

    def _fit(values):
        # Trim or pad with "" so there is exactly one value per title.
        return values[:num_hits_parsed] + [""] * (num_hits_parsed - len(values))

    ipnum_title_abstract_score = []
    for (ipnum, title), abstract, score in zip(
        list_ipnum_title, _fit(list_abstract), _fit(list_score)
    ):
        ipnum = ipnum.replace("y", "").replace("/", "")
        abstract = abstract.replace("<br/>", "").strip()
        ipnum_title_abstract_score.append((ipnum, title, abstract, score))

    return num_hits_parsed, ipnum_title_abstract_score




def _format_hits(list_of_hits, offset=0):
    """Format hits as indented, line-wrapped text blocks.

    :arg list_of_hits: iterable of ``(ipnum, title, abstract, score)``
        tuples; the score is not included in the output.
    :arg offset: added to the 1-based running number shown for each hit.
    :returns: list of strings, one per hit, laid out as::

         [n]
             <title, wrapped at 70 chars>
             [<ipnum>]
             <abstract, wrapped at 85 chars>

    The layout was originally meant for a wxPython CheckListBox.
    """
    # Imported here because the file's top-level imports do not include it.
    import textwrap

    def _indented(text, width):
        # Wrap text and prefix every resulting line with five spaces.
        return "".join(
            "     " + line + "\n" for line in textwrap.wrap(text, width=width)
        )

    formatted_list_of_hits = []
    for i, (ipnum, title, abstract, _score) in enumerate(list_of_hits):
        hit_text = (
            " [" + str(i + 1 + offset) + "]\n"
            + _indented(title, 70)
            + "     [" + ipnum + "]\n"
            + _indented(abstract, 85)
        )
        formatted_list_of_hits.append(hit_text)

    return formatted_list_of_hits



def _format_hits_as_df(list_of_hits):
    """
    Turn a list of (ipnum, title, abstract, score) hits into a df with the
    columns 'number', 'title', 'abstract' and 'score', one row per hit.
    """
    column_labels = ('number', 'title', 'abstract', 'score')

    # Unpacking insists on exactly four fields per hit.
    rows = [
        (ipnum, title, abstract, score)
        for ipnum, title, abstract, score in list_of_hits
    ]

    frame = pd.DataFrame()
    for position, label in enumerate(column_labels):
        frame[label] = [row[position] for row in rows]

    return frame


def search_fpo(search_string, max_pages=5):
    """Search FreePatentsOnline and return the hits as a DataFrame.

    US patents and US applications are searched (PCT off, stemming off).

    :arg search_string: FreePatentsOnline expert-search query, e.g.
        ``'aclm/metastable and aclm/vanadium'``.
    :arg max_pages: upper bound on the number of 50-hit result pages that
        are retrieved (default 5, i.e. at most 250 hits).
    :returns: DataFrame with columns ``number``, ``title``, ``abstract`` and
        ``score`` whose index is the 1-based result rank, or ``None`` when
        the search reports no hits. A one-line summary is printed either way.
    """
    num_hits, num_pages, first_ipnum_title_abstract_score = _get_first_search_page(
        search_string, uspat="on", usapp="on", pct="off", stem="off"
    )

    if not num_hits > 0:
        print('...', search_string, '... no hits found')
        return None

    print('...', search_string, '...', num_hits, 'hits')

    # Only request pages that actually exist, capped at max_pages.
    last_page = min(num_pages, max_pages)
    remaining_ipnum_title_abstract_score = _get_remaining_search_pages(
        search_string, last_page, uspat="on", usapp="on", pct="off", stem="off"
    )

    # Build a single frame from all hits instead of appending two frames
    # (DataFrame.append no longer exists in pandas 2.x).
    df_combined = _format_hits_as_df(
        first_ipnum_title_abstract_score + remaining_ipnum_title_abstract_score
    )
    # 1-based index so that the index doubles as the result number.
    df_combined.index = range(1, len(df_combined) + 1)

    return df_combined


In [3]:
# Run the search; search_fpo returns a DataFrame indexed by 1-based result
# number (or None when there are no hits).
# NOTE(review): 'aclm/' is a FreePatentsOnline field prefix, presumably
# restricting the term to the claims - confirm against the FPO search help.
search_string = 'aclm/metastable and aclm/vanadium'
df_combined = search_fpo(search_string)

... aclm/metastable and aclm/vanadium ... 54 hits


In [4]:
# Known publication numbers that the search is expected to surface.  For each
# one, report the result number at which it appears and highlight its title.
sentinels = ['20200321613','20200321614','20210130188']

for s in sentinels:
    df_result = df_combined[df_combined['number'] == s]
    if len(df_result)>0:
        # The index of df_combined is the 1-based result number.
        result_number = df_result.index.tolist()[0]
        print(s,'sentinel hit as result number:', result_number)
        # Write through a single .loc call: chained assignment
        # (df[col].iloc[i] = ...) may modify a temporary copy and leave
        # df_combined unchanged.
        df_combined.loc[result_number, 'title'] = (
            '<div class="alert-success">'
            + df_combined.loc[result_number, 'title']
            + '</div>'
        )
    else:
        print('   ', s, 'is NOT in results')

    20200321613 is NOT in results
20200321614 sentinel hit as result number: 8
20210130188 sentinel hit as result number: 31


In [8]:
# Insert a 'link' column of Google Patents anchors built from 'number', then
# display the table without the plain-number column.
df_links = _util_add_links(df_combined, patent_df_column='number')
render(df_links.drop(columns=['number']))

Unnamed: 0,link,title,abstract,score
1,US20080199350,Metastable beta-titanium alloy,"Metastable β-titanium alloy contains, in mass %: from 1.5 to 3.5 aluminum; from 4.5 to 8.0 molybdenum; from 1.0 to 3.5 vanadium; from 1.5 to 3.8 iron; titanium balance. This alloy combines high...",1000
2,US20200232094,"NUCLEAR COMPONENT WITH METASTABLE CR COATING, DLI-MOCVD METHOD FOR PRODUCING SAME, AND USES FOR CONTROLLING OXIDATION/HYDRIDATION","Process for manufacturing a nuclear component comprising i) a support containing a substrate based on a metal (1), the substrate (1) being coated or not coated with an interposed layer (3)...",871
3,US20060039819,Metastable beta-titanium alloy,"Metastable β-titanium alloy contains, in mass %: from 1.5 to 3.5 aluminum; from 4.5 to 8.0 molybdenum; from 1.0 to 3.5 vanadium; from 1.5 to 3.8 iron; titanium balance. This alloy combines high...",844
4,US4347076,Aluminum-transition metal alloys made using rapidly solidified powers and method,A method of fabricating aluminum alloys containing finely dispersed aluminum-transition metal intermetallic phases is disclosed. The alloys are subjected to melt spinning to form a brittle...,841
5,US4851206,Methods and compostions involving high specific surface area carbides and nitrides,Methods and compostions produced thereby are provided concerning the preparation and use of high specific surface area carbides and nitrides. The carbides and nitrides can be obtained by thermal...,832
6,US10889506,Vanadium oxide for infrared coatings and methods thereof,"The present invention relates to vanadium oxide and methods of controlling reaction processes for making such materials (e.g., powders). In particular embodiments, the method includes control of...",831
7,US5135589,Metastable hydrogen storage alloy material,"Disclosed is an improved metastable, multi-component, multi-phase hydrogen storage alloy material formed by rapid solidifcation from a melt. The improved metastable hydrogen storage alloy is...",821
8,US20200321614,SYNTHESIS OF A METASTABLE VANADIUM PENTOXIDE AS A CATHODE MATERIAL FOR ION BATTERIES,"A highly scalable process has been developed for stabilizing large quantities of the zeta-polymorph of V2O5, a metastable kinetically trapped phase, with high compositional and phase purity. The...",820
9,US4745977,Method for resisting corrosion in geothermal fluid handling systems,"A method for resisting corrosion while conducting a flow of hot corrosive geothermal fluid, particularly brine, comprises flowing the fluid through fluid flow conducting elements, such as...",814
10,US20190071319,VANADIUM OXIDE FOR INFRARED COATINGS AND METHODS THEREOF,"The present invention relates to vanadium oxide and methods of controlling reaction processes for making such materials (e.g., powders). In particular embodiments, the method includes control of...",812
