### Scrape names of patents

In [1]:
# BigQuery
from google.cloud import bigquery
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

# Set BigQuery application credentials
import os
# Raw string: the Windows path contains backslashes, and sequences such as \P or \G
# must be kept literally instead of being parsed as (invalid) escape sequences.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Program Files\GoogleCloud\project-115a4092973b.json"

#### Read <a href="https://cloud.google.com/docs/authentication/getting-started">here</a> how to create the JSON credentials file

In [2]:
# Display the credentials path currently stored in the environment (shown as the cell output).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]

'C:\\Program Files\\GoogleCloud\\project-115a4092973b.json'

In [3]:
#@markdown Insert bigquery project id.
bq_project_id = "abbra-project" #@param {type:"string"}
# BigQuery client bound to the project above; the query cell below runs through it.
client = bigquery.Client(project=bq_project_id)

In [6]:
# The example query below selects a random sample of patents granted after
# Jan 1, 2000 that have "levetiracetam" among their top terms; the embeddings
# are then read from the `embedding_v1` column of the result further down.
# NOTE(review): return_count is not passed to create_query in this cell —
# confirm which sample size is actually intended.
search_term = "levetiracetam" #@param {type:"string"}
return_count = 10000 #@param {type:"integer"}

def create_query(search_term, return_count=1000):
  """Build the BigQuery SQL that samples patents matching a top term.

  The query first collects the publication numbers of patents granted after
  Jan 1, 2000 whose `top_terms` contain `search_term`, then returns a random
  sample of them with their URL and `embedding_v1` column.

  Args:
    search_term: Term that must appear in a patent's `top_terms`.
    return_count: Approximate number of rows to sample (default 1000).
      Rows are kept with probability return_count / matching rows using
      RAND(), so the actual row count varies from run to run.

  Returns:
    The SQL text as a string.

  Raises:
    ValueError: If `return_count` is not a positive number.
  """
  # Local import so this cell does not depend on another cell's imports.
  import json

  sample_size = int(return_count)
  if sample_size <= 0:
    raise ValueError('return_count must be a positive integer.')

  # json.dumps yields a double-quoted literal with quotes, backslashes and
  # control characters escaped, so the term cannot terminate the SQL string.
  # ensure_ascii=False avoids surrogate-pair \u escapes for non-BMP characters.
  # NOTE(review): BigQuery query parameters (@name) would be the more robust
  # option, but they require changing how the caller submits the query.
  term_literal = json.dumps(str(search_term), ensure_ascii=False)

  q = r'''
  WITH 
  pubs as (
    SELECT DISTINCT 
      pub.publication_number
    FROM `patents-public-data.patents.publications` pub
      INNER JOIN `patents-public-data.google_patents_research.publications` gpr ON
        pub.publication_number = gpr.publication_number
    WHERE 
      {term} IN UNNEST(gpr.top_terms)
      AND pub.grant_date > 20000101
  )

  SELECT
    publication_number, url, embedding_v1
  FROM 
    `patents-public-data.google_patents_research.publications`
  WHERE
    publication_number in (SELECT publication_number from pubs)
    AND RAND() <= {count}/(SELECT COUNT(*) FROM pubs)
  '''.format(term=term_literal, count=sample_size)

  return q

# Build the SQL for the chosen term, run it, and pull the whole result into a DataFrame.
query_sql = create_query(search_term)
df = client.query(query_sql).to_dataframe()

# Guard clause: abort when nothing matched, otherwise report how many rows came back.
row_count = len(df)
if row_count == 0:
  raise ValueError('No results for your search term. Retry with another term.')
print('Search complete for search term: \"{}\". {} random assets selected.'
      .format(search_term, row_count))

# Lookup table from publication number to the value of its embedding_v1 column.
embedding_dict = {
  pub_number: embedding
  for pub_number, embedding in zip(df.publication_number.tolist(),
                                   df.embedding_v1.tolist())
}

# Last expression of the cell: preview the first rows.
df.head()

Forbidden: 403 GET https://bigquery.googleapis.com/bigquery/v2/projects/abbra-project/queries/400b6634-a750-45fd-a9b2-d94ea341e114?maxResults=0&location=US: Quota exceeded: Your project exceeded quota for free query bytes scanned. For more information, see https://cloud.google.com/bigquery/troubleshooting-errors

(job ID: 400b6634-a750-45fd-a9b2-d94ea341e114)

                            -----Query Job SQL Follows-----                            

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:  WITH 
   3:  pubs as (
   4:    SELECT DISTINCT 
   5:      pub.publication_number
   6:    FROM `patents-public-data.patents.publications` pub
   7:      INNER JOIN `patents-public-data.google_patents_research.publications` gpr ON
   8:        pub.publication_number = gpr.publication_number
   9:    WHERE 
  10:      "levetiracetam" IN UNNEST(gpr.top_terms)
  11:      AND pub.grant_date < 20000101
  12:  )
  13:
  14:  SELECT
  15:    publication_number, url, 
  16:    embedding_v1
  17:  FROM 
  18:    `patents-public-data.google_patents_research.publications`
  19:  WHERE
  20:    publication_number in (SELECT publication_number from pubs)
  21:    AND RAND() <= 250/(SELECT COUNT(*) FROM pubs)
  22:  
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |

In [5]:
# (rows, columns) of the query result.
df.shape

(3444, 3)

In [96]:
# Preview the first two rows of the result.
df.head(2)

Unnamed: 0,publication_number,url,embedding_v1
0,US-2011207794-A1,https://patents.google.com/patent/US20110207794A1,"[-0.119191304, -0.20370719, 0.020101016, 0.015..."
1,EP-3096790-B1,https://patents.google.com/patent/EP3096790B1,"[0.041941997, -0.0845463, 0.016990302, -0.0900..."


### Save file

In [6]:
# index=False keeps the DataFrame's row index out of the file, so loading the CSV
# again does not produce an extra "Unnamed: 0" column.
df.to_csv('patents3444.csv', index=False)