In [18]:
import os
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
df = pd.read_csv('../../input/us-patent-phrase-to-phrase-matching/train.csv')

anchors = df['anchor'].unique()

# One search query per distinct (anchor, context) pair, spelled
# "<anchor>+<context>". For each anchor only the contexts that occur on
# its own rows are used.
quers = [
    phrase + '+' + ctx
    for phrase in anchors
    for ctx in df.loc[df['anchor'] == phrase, 'context'].unique()
]

In [3]:
def get_patent_ids(q, driver, n_pages=1, page_load_wait=10):
    """Scrape Google Patents search results for one query.

    Every ``<a>`` element on each result page is read. A link whose text
    starts with two capital letters, a digit and one further character is
    treated as a patent id, and the text of the link immediately before it
    is taken as that patent's name.
    NOTE(review): this relies on the title link directly preceding the id
    link in Google's markup - confirm if the page layout changes.

    Args:
        q: query (keyword + CPC code); inserted verbatim into the URL.
        driver: Selenium WebDriver used to load and inspect the pages.
        n_pages: number of result pages to visit, starting at page 0.
        page_load_wait: seconds to sleep after each page load before the
            links are read.

    Returns:
        List of ``(id, name)`` tuples collected over all visited pages.
    """
    # Compiled once; ``match`` anchors the pattern at the start of the text.
    code = re.compile(r'[A-Z][A-Z]\d.')

    # Accumulated across pages (previously reset on every page, so only the
    # last page's results were returned).
    patents = []
    for n in range(n_pages):
        url = f'https://patents.google.com/?q={q}&num=50&oq={q}&page={n}'
        driver.get(url)
        time.sleep(page_load_wait)

        txt = [e.text for e in driver.find_elements(By.TAG_NAME, 'a')]

        # Start at 1: an id in the very first link has no preceding link,
        # and txt[-1] would silently pair it with the last link on the page.
        for i in range(1, len(txt)):
            if code.match(txt[i]):
                patents.append((txt[i], txt[i - 1]))

    return patents

In [4]:
def google_description(pid, driver):
    """Collect the text of a patent's Google Patents page.

    Loads ``https://patents.google.com/patent/<pid>/en`` in ``driver``,
    sleeps 5 seconds, then reads the text of every element carrying the
    ``patent-text`` class.

    Returns:
        A list whose first item is ``pid`` itself, followed by the text of
        each matching element in the order the driver returned them.
    """
    driver.get(f'https://patents.google.com/patent/{pid}/en')
    # Fixed pause before querying the DOM (presumably to let the page render).
    time.sleep(5)
    sections = driver.find_elements(by=By.CLASS_NAME, value='patent-text')
    return [pid, *(section.text for section in sections)]

In [5]:
def search_justify(patent_name, driver):
    """Search patents.justia.com for a patent title.

    Args:
        patent_name: title text to search for. It is URL-encoded with
            ``quote_plus`` so that characters such as ``&``, ``#``, ``+`` or
            ``%`` inside a title cannot truncate or corrupt the query string
            (spaces still become ``+``).
        driver: Selenium WebDriver used to load the search page.

    Returns:
        The last whitespace-separated token of the first element with class
        ``number`` (presumably the patent number - confirm against the page
        markup), or ``''`` when there is no such element or it has no text.
        In the empty case a "No search result" line is printed.
    """
    url = f"https://patents.justia.com/search?q={quote_plus(patent_name)}"
    driver.get(url)
    hits = driver.find_elements(By.CLASS_NAME, 'number')

    # Only the first hit is considered; an empty hit list and a blank first
    # hit are both reported as "no result".
    tokens = hits[0].text.split() if hits else []
    if not tokens:
        print(f'No search result for: {patent_name}')
        return ''
    return tokens[-1]

In [19]:
def load_justify(pid, timeout=30):
    """Download a patent page from patents.justia.com and extract its description.

    Args:
        pid: identifier appended to ``https://patents.justia.com/patent/``.
            A falsy value (e.g. the ``''`` that ``search_justify`` returns
            when nothing was found) short-circuits to an empty result
            without any network traffic.
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller indefinitely.

    Returns:
        A one-element list holding the text of ``<div id="description">``,
        or an empty list when ``pid`` is falsy or the page has no such div.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    if not pid:
        return []

    url = f'https://patents.justia.com/patent/{pid}'
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    div_desc = soup.find('div', {'id': 'description'})
    if div_desc is None:
        return []
    return [div_desc.text]

In [20]:
driver = webdriver.Firefox()
driver.implicitly_wait(3)

# Index into `quers` to start from; raise it to resume an interrupted run.
i = 366

# How many times one query may fail with a WebDriverException before it is
# skipped. Without a cap a persistent failure retried the same query forever.
MAX_ATTEMPTS = 3

# The loop writes into these folders; create them so a fresh checkout works.
os.makedirs('pids', exist_ok=True)
os.makedirs('texts', exist_ok=True)

attempts = 0
try:
    # Bounded by len(quers): the previous `while True` ran off the end of the
    # list and died with IndexError after the last query.
    while i < len(quers):
        q = quers[i]
        print(f'---------Start scrapping {q}, i = {i}------------')

        try:
            # Inside the try so a browser failure during the Google search is
            # handled the same way as one during the Justia lookups.
            pids = get_patent_ids(q, driver)
            with open(f'pids/{q}.txt', 'w', encoding='utf-8') as f:
                f.writelines([p[0]+'\n' for p in pids])

            txt = []
            for pid, pname in tqdm.tqdm(pids):
                pnum = search_justify(pname, driver)
                desc = load_justify(pnum)
                txt += desc
        except WebDriverException:
            print('WebDriverException')
            attempts += 1
            if attempts < MAX_ATTEMPTS:
                # Retry the same query (i is deliberately not advanced).
                continue
            print(f'Skipping {q} after {attempts} failed attempts')
        else:
            # Titles and descriptions contain non-ASCII text, so do not rely
            # on the platform default encoding.
            with open(f'texts/{q}.txt', 'w', encoding='utf-8') as f:
                f.writelines(txt)

        attempts = 0
        i += 1
finally:
    # Always close the browser, including on KeyboardInterrupt, so re-running
    # the cell does not pile up orphaned Firefox processes.
    driver.quit()

---------Start scrapping curing resin composition+C08, i = 365------------


  4%|██▋                                                            | 2/46 [00:05<01:57,  2.68s/it]

No search result for: Thermoplastic polyurethane with very good fire resistance


  7%|████                                                           | 3/46 [00:10<02:40,  3.74s/it]

No search result for: Polypropylenes Having Balanced Strain Hardening, Melt Strength, and Shear …


 15%|█████████▌                                                     | 7/46 [00:21<01:39,  2.54s/it]

No search result for: Polymer comprising one or more 1, 1-disubstituted alkene compounds and polymer …


 22%|█████████████▍                                                | 10/46 [00:29<01:31,  2.54s/it]

No search result for: Branched polyarylene ethers and thermoplastic molding compounds containing the …


 26%|████████████████▏                                             | 12/46 [00:36<01:37,  2.87s/it]

No search result for: Light-cured self-repairing polyurethane acrylic resin and preparation method …


 28%|█████████████████▌                                            | 13/46 [00:41<01:54,  3.46s/it]

No search result for: Polypropylene resin composition excellent in melt tension and method for …


 30%|██████████████████▊                                           | 14/46 [00:46<02:04,  3.90s/it]

No search result for: Cis-1,4-polybutadiene and method for producing the same


 41%|█████████████████████████▌                                    | 19/46 [00:59<01:07,  2.49s/it]

No search result for: Method of producing and using materials which are reinforced against impact and …


 43%|██████████████████████████▉                                   | 20/46 [01:04<01:24,  3.25s/it]

No search result for: Crosslinkable siloxane urea copolymer, process for producing the same, use …


 57%|███████████████████████████████████                           | 26/46 [01:18<00:45,  2.28s/it]

No search result for: Method for preparing asa-based graft copolymer, method for preparing …


 59%|████████████████████████████████████▍                         | 27/46 [01:24<00:59,  3.11s/it]

No search result for: Neutralization of non-activated polymerization catalysts using phosphoric acid …


 61%|█████████████████████████████████████▋                        | 28/46 [01:29<01:06,  3.70s/it]

No search result for: Vulcanizable compositions containing epoxy group-containing ethylene-vinyl …


 74%|█████████████████████████████████████████████▊                | 34/46 [01:43<00:28,  2.34s/it]

No search result for: Lactide copolymer, a preparation method thereof, and a resin composition …


 83%|███████████████████████████████████████████████████▏          | 38/46 [01:54<00:18,  2.32s/it]

No search result for: β-PINENE POLYMER AND PROCESS FOR PRODUCTION THEREOF


 89%|███████████████████████████████████████████████████████▎      | 41/46 [02:03<00:12,  2.57s/it]

No search result for: Novel High Viscosity Liquid Benzoate Ester Compositions And Polymer …


 98%|████████████████████████████████████████████████████████████▋ | 45/46 [02:14<00:02,  2.34s/it]

No search result for: Cyclopentene ring-opening polymer and manufacturing method therefor


100%|██████████████████████████████████████████████████████████████| 46/46 [02:19<00:00,  3.02s/it]


---------Start scrapping cut from stack+A01, i = 366------------


  6%|███▊                                                           | 3/50 [00:05<01:28,  1.88s/it]

No search result for: Method for suppression of weed plants in coniferous seedling nursery


 12%|███████▌                                                       | 6/50 [00:14<01:49,  2.49s/it]

No search result for: Method for breeding Odontobutis obscura through fish-rice symbiosis in pool


 14%|████████▊                                                      | 7/50 [00:19<02:22,  3.31s/it]

No search result for: Apparatus and method for no-till inter-row simultaneous application of …


 16%|██████████                                                     | 8/50 [00:24<02:43,  3.89s/it]

No search result for: Apparatus and method for no-till inter-row simultaneous application of …


 18%|███████████▎                                                   | 9/50 [00:29<02:53,  4.24s/it]

No search result for: Corroding pest or diseased mushroom distribution method, involves carrying out …


 22%|█████████████▋                                                | 11/50 [00:36<02:23,  3.67s/it]

No search result for: Fungicidal active ingredient combinations containing fluoxastrobin


 28%|█████████████████▎                                            | 14/50 [00:45<01:43,  2.89s/it]

No search result for: Technology development of eco-culture by using paddy field in misgurnus …


 32%|███████████████████▊                                          | 16/50 [00:51<01:41,  2.98s/it]

No search result for: The method of rice field ecology cultivation freshwater shrimp


 36%|██████████████████████▎                                       | 18/50 [00:59<01:44,  3.28s/it]


KeyboardInterrupt: 