In [45]:
import urllib.request, urllib.parse, urllib.error, urllib3, pandas, re

In [109]:
def _get_html(page_number, query, uspat, usapp, pct, stem):
    """Fetch one page of FreePatentsOnline expert-search results as HTML text.

    Args:
        page_number: 1-based results page to request.  Page 1 uses a slightly
            different URL layout than later pages.
        query: search expression, e.g. ttl/"nanopore membrane".  It is
            URL-quoted here, so pass it unescaped.
        uspat, usapp, pct: database selectors; "on" enables the database,
            any other value is sent as "off".
        stem: "on" to enable stemming; any other value is sent as "off".

    Returns:
        The response body decoded to ``str`` with the default codec (UTF-8).

    Raises:
        RuntimeError: if the URL cannot be opened; the underlying urllib
            error is attached as ``__cause__``.
    """
    # Clean up query before passing to FreePatentsOnline (double quotes are
    # left intact so phrase searches survive):
    query = urllib.parse.quote_plus(query, safe='"')

    def _on_or_off(flag):
        # Anything that is not exactly "on" is normalised to "off".
        return "on" if flag == "on" else "off"

    # Searches all years; the site also accepts date_range=last20 to restrict
    # the search to the last 20 years.
    other_search_terms = (
        '&uspat=' + _on_or_off(uspat)
        + '&usapp=' + _on_or_off(usapp)
        + '&pct=' + _on_or_off(pct)
        + '&date_range=all&stemming=' + _on_or_off(stem)
    )

    if page_number == 1:
        # First page of a search:
        url = ('http://www.freepatentsonline.com/result.html?p=1&edit_alert=&srch=xprtsrch&query_txt='
               + query + other_search_terms + '&sort=relevance&search=Search')
    else:
        # Second (or greater) page:
        url = ("http://www.freepatentsonline.com/result.html?p=" + str(page_number)
               + '&srch=xprtsrch&query_txt=' + query + other_search_terms + '&sort=relevance')

    # Only the open call is guarded, and only for urllib's own errors
    # (HTTPError is a subclass of URLError).  Failing loudly here avoids the
    # previous behaviour of carrying on with an unbound response object.
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as err:
        raise RuntimeError("Could not open " + url) from err

    # The context manager closes the connection once the body has been read.
    with response:
        return response.read().decode()



def _get_first_search_page(query, uspat, usapp, pct, stem):
    """Run a search and return its totals plus the parsed first page of hits.

    Args:
        query: search expression, passed through to ``_get_html``.
        uspat, usapp, pct, stem: "on"/"off" flags, passed through to
            ``_get_html``.

    Returns:
        A tuple ``(num_hits, num_pages, first_ipnum_title_abstract_score)``:
        the total hit count reported by the page, the number of result pages
        at 50 hits per page, and the list of
        ``(ipnum, title, abstract, score)`` tuples parsed from page 1.
    """
    hits_per_page = 50
    retrieved_html = _get_html(1, query, uspat, usapp, pct, stem)

    # Parse the hits on this page first so the count is available as a
    # fallback below.
    num_hits_parsed, first_ipnum_title_abstract_score = _get_ipnum_title_abstract_score(retrieved_html)

    # Total hits come from html such as
    # "Matches 1 - 50 out of 5409                </td>".
    # The number occurs twice on each page; re.search takes the first one.
    match_total = re.search(r"Matches.*out of (\d+)\s+</td", retrieved_html, re.IGNORECASE)
    if match_total is not None:
        num_hits = int(match_total.group(1))
    else:
        # NOTE(review): no total was found on the page (presumably an empty
        # result set -- confirm against the live site).  Fall back to the
        # number of hits actually parsed instead of raising IndexError.
        num_hits = num_hits_parsed

    # Ceiling division in pure integer arithmetic.
    num_pages = (num_hits + hits_per_page - 1) // hits_per_page

    return num_hits, num_pages, first_ipnum_title_abstract_score



def _get_remaining_search_pages(query, num_pages, uspat, usapp, pct, stem):
    """ Take query and number of pages to retrieve, and retrieve all remaining hits number, title, abstracts
    """
    remaining_ipnum_title_abstract_score = []
    for page_number in range(2,num_pages+1):
        # Note that need to set first_page flag (first arg in _get_html) to "y"
        retrieved_html = _get_html(page_number, query, uspat, usapp, pct, stem)
        num_hits_parsed, additionalpage_ipnum_title_abstract_score = _get_ipnum_title_abstract_score(retrieved_html)    
        remaining_ipnum_title_abstract_score = remaining_ipnum_title_abstract_score + additionalpage_ipnum_title_abstract_score
    
    return remaining_ipnum_title_abstract_score




def _get_ipnum_title_abstract_score(retrieved_html):
    """ Take retrieved html from FreePatentsOnline and parse through it
        to obtain ipnum, title, and abstract, which are returned as list
        of tuples of these values in list ipnum_title_abstract
    """
    #
    # Pull out IP document number and title from html such as follows (for issued patents):
    # <a href="/7660058.html">Methods for etching layers within a MEMS device to achieve a tapered edge</a>
    # Note that US publications have HTML such as y2010/0030167, which would correspond to US20100030167
    re_ipnum_title = re.compile('<a href="/(.+\d+).html">(.+)</a>')
    list_ipnum_title = re_ipnum_title.findall(retrieved_html)
    #
    # Now pull out partial abstract, if any is provided (there will be some entries with NO abstracts):
    re_abstract = re.compile('</a>.+?&nbsp;(.+?)</td>', re.DOTALL)
    list_abstract = re_abstract.findall(retrieved_html)
    #
    # Now pull out the "score":
    re_score = re.compile("<td width='5%'>\n.+?(\d+).+?</td>", re.DOTALL)
    list_score = re_score.findall(retrieved_html)
    #
    # ERROR CHECKING: make sure that length of list of numbers and
    # titles is the same as length of the list of abstracts:
    if len(list_ipnum_title) != len(list_abstract):
        print ("ERROR: number of abstracts retrieved is not equal to number of numbers/titles!")
    #
    # Now loop through all the retrieved hits and package together the ipnum, title, and
    # abstract (if there is one) into a single tuple, so that the end result is a list of
    # tuples.  Note that have to do some cleaning up of both the ipnum and abstract fields:
    num_hits_parsed = len(list_ipnum_title)
    ipnum_title_abstract_score = []
    for i in range(num_hits_parsed):
        ipnum,title = list_ipnum_title[i]
        ipnum = ipnum.replace("y","")
        ipnum = ipnum.replace("/","")
        abstract = list_abstract[i]
        abstract = abstract.replace("<br/>","")
        abstract = abstract.lstrip()
        abstract = abstract.rstrip()
        score = list_score[i]
        ipnum_title_abstract_score.append((ipnum,title,abstract,score))
    #    
    return num_hits_parsed, ipnum_title_abstract_score




def _format_hits(list_of_hits,offset=0):
    """ This function formats the output for display in a wxPython CheckListBox.  The
        format is specific to this particular output and could easily be redone for other output.
    """
    formatted_list_of_hits = []
    for i, each_hit in enumerate(list_of_hits):
        ipnum, title, abstract, score = each_hit
        # Format title so that the line-length is less than 85 chars:
        lines_of_title = textwrap.wrap(title,width=70)
        title_text = ""
        for line in lines_of_title:
            title_text = title_text + "     " + line + "\n"
        #
        # Now format abstract text for correct line-lengths:
        abstract_text = ""
        lines_of_abstract = textwrap.wrap(abstract,width=85)
        for line in lines_of_abstract:
            abstract_text = abstract_text + "     " + line + "\n"
        #
        # Following line provides an alternative format to display output:
        #hit_text = " [" + str(i+1) + "]\n     " + ipnum + "\n" + title_text + "     -----\n" + abstract_text
        hit_text = " [" + str(i+1+offset) + "]\n" + title_text + "     [" + ipnum + "]\n" + abstract_text
        formatted_list_of_hits.append(hit_text)
    #
    return formatted_list_of_hits



def _format_hits_as_df(list_of_hits):
    """
    This formats the output as a df
    """
    df = pd.DataFrame()
    
    list_of_numbers = []
    list_of_titles = []
    list_of_abstracts = []
    list_of_scores = []
    for i, each_hit in enumerate(list_of_hits):
        ipnum, title, abstract, score = each_hit
        
        list_of_numbers.append(ipnum)
        list_of_titles.append(title)
        list_of_abstracts.append(abstract)
        list_of_scores.append(score)
    
    df['number'] = list_of_numbers
    df['title'] = list_of_titles
    df['abstract'] = list_of_abstracts
    df['score'] = list_of_scores
      
    return df


def search_fpo(search_string, max_pages=5):
    """Search FreePatentsOnline and return the hits as one DataFrame.

    US patents and US applications are searched; PCT and stemming are off.
    The total hit count reported by the site is printed.

    Args:
        search_string: search expression, e.g.
            'aclm/v2o5 and aclm/metastable'.
        max_pages: upper bound on the number of result pages fetched (50 hits
            per page).  Defaults to 5; pass ``None`` to fetch every page.

    Returns:
        A DataFrame with columns ``number``, ``title``, ``abstract`` and
        ``score``, indexed from 1 in the order the hits were returned.
    """
    # One set of flags for every request, so page 1 and the later pages
    # always query the same databases.
    search_flags = {"uspat": "on", "usapp": "on", "pct": "off", "stem": "off"}

    num_hits, num_pages, first_ipnum_title_abstract_score = _get_first_search_page(search_string, **search_flags)
    print('...', search_string, '...', num_hits,'hits')

    # Fetch only pages that exist, up to the cap.
    last_page = num_pages if max_pages is None else min(num_pages, max_pages)
    remaining_ipnum_title_abstract_score = _get_remaining_search_pages(search_string, last_page, **search_flags)

    # Joining the hit lists before building the frame avoids
    # DataFrame.append, which was removed in pandas 2.0.
    df_combined = _format_hits_as_df(first_ipnum_title_abstract_score + remaining_ipnum_title_abstract_score)
    df_combined.reset_index(inplace=True,drop=True)
    # Number results from 1 so the index reads as a result rank.
    df_combined.index += 1

    return df_combined


In [117]:
# Example search: two aclm/ field terms combined with "and".
# NOTE(review): aclm/ is presumably the claims field of the FPO expert syntax -- confirm.
search_string = 'aclm/v2o5 and aclm/metastable'
# df_combined holds all retrieved hits, indexed from 1.
df_combined = search_fpo(search_string)

... aclm/v2o5 and aclm/metastable ... 9 hits


In [118]:
# Document numbers expected to show up in the search results.
sentinels = [
    '20200321613',
    '20200321614',
    '20210130188',
]

# Report, for each sentinel, the result number (index label) of its first
# matching row in df_combined, or that it is missing.
for s in sentinels:
    is_sentinel = df_combined['number'] == s
    df_result = df_combined[is_sentinel]
    if df_result.empty:
        print('   ', s, 'is NOT in results')
        continue
    result_number = df_result.index.tolist()[0]
    print(s,'sentinel hit as result number:', result_number)


20200321613 sentinel hit as result number: 1
    20200321614 is NOT in results
20210130188 sentinel hit as result number: 5


In [120]:
# Add a link column for the document numbers in 'number' and display the table.
# NOTE(review): `pr` and `render` are not defined in the visible cells --
# presumably imported in an earlier cell; confirm.
df_links = pr._util_add_links(df_combined, patent_df_column='number')
render(df_links)

Unnamed: 0,link,number,title,abstract,score
1,US20200321613,20200321613,ELECTROCHEMICAL STORAGE INCORPORATING SIZE- AND MORPHOLOGY-CONTROLLED METASTABLE VANADIUM PENTOXIDE AS A CATHODE MATERIAL FOR ION BATTERIES,"The Li-ion paradigm of battery technology is fundamentally constrained by the monovalency of the Li-ion. A straightforward solution is to transition to multivalent ion chemistries, with Mg2+ the...",1000
2,US20160111720,20160111720,METASTABLE VANADIUM OXIDE CATHODE MATERIALS FOR RECHARGEABLE MAGNESIUM BATTERY,"A magnesium electrochemical cell having a positive electrode containing as an active ingredient, a material of formula [V2O5]c [MaOb] d and/or a material of formula [V2O5]c[MaOb]d[MgXe]g in a...",826
3,US9819021,9819021,Metastable vanadium oxide cathode materials for rechargeable magnesium battery,"A magnesium electrochemical cell having a positive electrode containing as an active ingredient, a material of formula [V2O5]c [MaOb]d and/or a material of formula [V2O5]c[MaOb]d[MgXe]g in a...",826
4,US10889506,10889506,Vanadium oxide for infrared coatings and methods thereof,"The present invention relates to vanadium oxide and methods of controlling reaction processes for making such materials (e.g., powders). In particular embodiments, the method includes control of...",448
5,US20210130188,20210130188,"QUANTUM MATERIAL/VANADIUM OXIDE HETEROSTRUCTURES, METHODS OF MAKING SAME, AND USES THEREOF","The subject invention pertains to the synthesis and characterization of V2O5/CdE NW/QD heterostructures. The V2O5/CdE heterostructures are versatile new materials constructs for light harvesting,...",438
6,US20190071319,20190071319,VANADIUM OXIDE FOR INFRARED COATINGS AND METHODS THEREOF,"The present invention relates to vanadium oxide and methods of controlling reaction processes for making such materials (e.g., powders). In particular embodiments, the method includes control of...",428
7,US9431474,9431474,Metal-insulator-metal stack and method for manufacturing the same,A method for manufacturing a metal-insulator-metal (MIM) stack is described. The method includes forming a temporary stack by depositing a bottom electrode comprising at least one metal layer;...,372
8,US20130155572,20130155572,Metal-Insulator-Metal Stack and Method for Manufacturing the Same,A method for manufacturing a metal-insulator-metal (MIM) stack is described. The method includes forming a temporary stack by depositing a bottom electrode comprising at least one metal layer;...,348
9,US20070062408,20070062408,Defectivity and process control of electroless deposition in microelectronics applications,"Methods and compositions for electrolessly depositing Co, Ni, or alloys thereof onto a substrate in manufacture of microelectronic devices. Grain refiners, levelers, oxygen scavengers, and...",167
