In [None]:
import requests
import pandas as pd

In [None]:
# Base URL of the USPTO patent application search endpoint
base_url = 'https://api.uspto.gov/api/v1/patent/applications/search'

# Pagination parameters only.
# NOTE: no date filter is sent here, so the results are NOT restricted to
# patents granted in 2009. Add the relevant filter parameter from the API
# documentation if that restriction is required.
params = {
    'start': 0,  # Start at record 0 (for pagination)
    'rows': 100  # Number of patents to return per request (adjustable)
}

# Make the request to the USPTO API.
# requests applies no timeout by default, so set one explicitly to keep an
# unresponsive server from blocking the kernel indefinitely.
response = requests.get(base_url, params=params, timeout=30)

In [None]:
# Inspect the decoded JSON body of the response (raises if the body is not valid JSON)
response.json()

In [None]:

# Report failures first; on success list basic information for every hit
if response.status_code != 200:
    print(f"Error: {response.status_code}, {response.text}")
else:
    patents = response.json()['results']
    for record in patents:
        # Number and title are required keys; the date is optional (prints None if absent)
        print(f"Patent Number: {record['patentApplicationNumber']}")
        print(f"Title: {record['inventionTitle']}")
        print(f"Date: {record.get('publicationDate')}")
        print("-" * 80)



In [None]:
import requests

# Base URL queried by fetch_patents below.
# NOTE: this rebinds base_url to a different endpoint than the one used by the
# single-request cell earlier in the notebook.
base_url = 'https://developer.uspto.gov/ibd-api/v1/application/publications'

# Application numbers already collected; used to drop records seen on earlier pages
downloaded_patents = set()

# Function to make a query and update the downloaded_patents set
def fetch_patents(start, rows=100, timeout=30):
    """Fetch one page of results and return only the records not seen before.

    Parameters
    ----------
    start : int
        Offset of the first record to request.
    rows : int, optional
        Number of records to request (default 100).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        requests applies no timeout on its own, so without this a stalled
        connection would block forever.

    Returns
    -------
    list of dict
        Records whose 'patentApplicationNumber' was not yet in the
        module-level ``downloaded_patents`` set. That set is updated as a
        side effect. An empty list is returned when the request fails, the
        status code is not 200, or the page contains nothing new.
    """
    params = {
        'start': start,
        'rows': rows
    }
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Same best-effort contract as a non-200 status: report and return nothing
        print(f"Error: request failed, {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return []

    new_patents = []
    for patent in response.json()['results']:
        number = patent['patentApplicationNumber']
        # Record the number immediately so a duplicate later in this same
        # page is filtered out as well.
        if number not in downloaded_patents:
            downloaded_patents.add(number)
            new_patents.append(patent)
    return new_patents

# Page through the API until a request yields no previously unseen records
all_patents = []
start = 0
new_patents = fetch_patents(start)
while new_patents:
    all_patents.extend(new_patents)
    start += 100  # advance by one page (matches the default rows=100 of fetch_patents)
    new_patents = fetch_patents(start)

# Print the number of unique patents downloaded
print(f"Total unique patents downloaded: {len(all_patents)}")

In [None]:

# One row per downloaded record
patents_df = pd.DataFrame(all_patents)

# The text fields may arrive as lists; unwrap those that hold exactly one
# element and leave every other value untouched.
text_columns = ['abstractText', 'claimText', 'descriptionText']
for text_col in text_columns:
    patents_df[text_col] = patents_df[text_col].apply(
        lambda cell: cell[0] if (isinstance(cell, list) and len(cell) == 1) else cell
    )




In [None]:
# Cache the downloaded records in the working directory so the download cells can be skipped later
patents_df.to_pickle('all_patents.pkl')

### Start from here to load the previously downloaded data:

In [None]:
# Reload the cached records written by the to_pickle cell.
# NOTE: unpickling can execute arbitrary code; only load a file you created yourself.
patents_df = pd.read_pickle('all_patents.pkl')

In [None]:
# Display the first few rows of the DataFrame as a quick sanity check of the loaded data
patents_df.head()

In [None]:
# List the available fields (column names) of the patent records
patents_df.columns

In [None]:
# Number of patent records in the DataFrame
len(patents_df)

In [None]:
# Peek at the first five description texts
patents_df['descriptionText'][0:5]

In [None]:
# Pretty-print the first five descriptions, hard-wrapped at 100 characters per line
for i, desc in enumerate(patents_df['descriptionText'][:5]):
    # Slice the text into consecutive 100-character pieces
    pieces = [desc[offset:offset + 100] for offset in range(0, len(desc), 100)]
    desc = '\n'.join(pieces)
    # Label-based lookup; relies on the DataFrame carrying labels 0..4
    print("Application Number: " + patents_df['patentApplicationNumber'][i])
    print("Invention Title: " + patents_df['inventionTitle'][i])
    print(f"{'-'*80}")
    print(f"Description {i+1}:\n{desc}\n{'-'*80}\n")


In [None]:
# word2vec requires sentences as input; set up the shared preprocessing resources
from string import punctuation

from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans(dict.fromkeys(punctuation))
# English stopwords as a set for fast membership tests
# (presumably requires the NLTK 'stopwords' corpus to be installed locally; verify)
stoplist = set(stopwords.words('english'))
# English Snowball stemmer applied to every kept token
stemmer = SnowballStemmer('english')

def normalize_text(doc):
    """Turn a raw string into a list of normalized, stemmed tokens.

    Steps, in order: replace line breaks with spaces, lower-case, strip
    punctuation via ``translator``, split on whitespace, drop tokens found in
    ``stoplist``, replace all-digit tokens with '#', and stem the remainder
    with ``stemmer``.
    """
    single_line = doc.replace('\r', ' ').replace('\n', ' ')
    tokens = single_line.lower().translate(translator).split()

    cleaned = []
    for token in tokens:
        if token in stoplist:
            continue  # stopwords carry no signal for the embedding
        # Collapse every number onto one placeholder before stemming
        cleaned.append(stemmer.stem('#' if token.isdigit() else token))
    return cleaned

def get_sentences(doc):
    """Split ``doc`` into sentences and return one normalized token list per sentence."""
    return [normalize_text(raw_sentence) for raw_sentence in sent_tokenize(doc)]


In [None]:
from tqdm.notebook import tqdm
import gensim
from gensim.models import Word2Vec

from random import shuffle

# Gather the tokenized sentences of the first 5000 descriptions into one flat list
sentences = []
for doc in patents_df['descriptionText'][0:5000]:
    sentences.extend(get_sentences(doc))

shuffle(sentences) # stream in sentences in random order



# Create progress bar callback
class ProgressBar(gensim.models.callbacks.CallbackAny2Vec):
    """gensim training callback that advances a tqdm bar once per finished epoch.

    Parameters
    ----------
    total_epochs : int
        Value used as the bar's total. It should equal the number of epochs
        the model is actually trained for; otherwise the bar will not reach
        100 %.
    """

    def __init__(self, total_epochs):
        self.tqdm = None  # created lazily when training starts
        self.epoch = 0  # number of epochs completed so far
        self.total_epochs = total_epochs

    def on_train_begin(self, model):
        # Reset the counter so the same instance can be reused for another run
        self.epoch = 0
        self.tqdm = tqdm(total=self.total_epochs, desc="Training Progress")

    def on_epoch_end(self, model):
        self.epoch += 1
        self.tqdm.update(1)

    def on_train_end(self, model):
        # Close when training ends rather than when epoch >= total_epochs,
        # so the bar is released even if fewer epochs ran than announced.
        if self.tqdm is not None:
            self.tqdm.close()


# Number of training epochs. The same value is passed to both the progress
# bar and Word2Vec so the bar's total matches the epochs actually run.
total_epochs = 10

# Create a callback instance
progress_bar = ProgressBar(total_epochs=total_epochs)

# train the model
w2v = Word2Vec(sentences,  # list of tokenized sentences
               workers = 8, # Number of threads to run in parallel
               vector_size=300,  # Word vector dimensionality
               min_count =  25, # Minimum word count
               window = 5, # Context window size
               sample = 1e-3, # Downsample setting for frequent words
               epochs=total_epochs, # Was omitted: gensim's default epoch count applied and the bar never reached its total
               callbacks=[progress_bar] # Add the callback to the model
               )

# Training is finished: L2-normalize the word vectors in place.
# This is the non-deprecated gensim 4 equivalent of init_sims(replace=True).
# The model cannot sensibly be trained further afterwards.
w2v.wv.unit_normalize_all()

w2v.save('w2v-vectors.pkl')


In [None]:
# Every token that survived the min_count cut-off, in the order held by key_to_index
vocabulary = list(w2v.wv.key_to_index)
print(vocabulary)

In [None]:
# Vector format of word 'invent'.
# Tokens were stemmed by normalize_text before training, so lookups must use the stemmed form;
# the lookup fails if the token is not in the trained vocabulary.
w2v.wv['invent']

In [None]:
# Length of the vector (should equal the vector_size passed to Word2Vec)
len(w2v.wv['invent'])

In [None]:
# Nearest neighbours of the stemmed token 'invent' in the patent-trained embedding
w2v.wv.most_similar('invent') # most similar words

In [None]:
import gensim.downloader as api

# Catalogue of what the downloader offers; `info` is not used again in the cells shown here
info = api.info()  # show info about available models/datasets
# Pre-trained GloVe vectors used as a general-purpose baseline next to the patent-trained model
model = api.load("glove-wiki-gigaword-300")  # download the model and return as object ready for use
# Neighbours of "invent" in the pre-trained space, for comparison with the w2v result
model.most_similar("invent")


In [None]:
import fasttext
import fasttext.util
# Expects 'cc.en.300.bin' in the working directory; the commented-out call below
# presumably fetches it when missing (TODO confirm before relying on it).
# fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')

# Example usage
# print(ft.get_word_vector("king"))      # 300-dim vector
# print(ft.get_nearest_neighbors("queen"))

In [None]:
# Cosine similarity
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as the dot product divided by the product of the Euclidean norms.
    No guard is applied for zero-length vectors.
    """
    inner_product = dot(v1, v2)
    norm_product = norm(v1) * norm(v2)
    return inner_product / norm_product


# Compare the same word pair across the three embeddings.
# The patent-trained Word2Vec model is queried with the stemmed tokens
# ('machin', 'devic') because its corpus was stemmed; the pre-trained models use the plain words.
print("Word2Vec similarity:", w2v.wv.similarity('machin', 'devic')) # similarity between two words
print("GloVe similarity:", model.similarity('machine', 'device'))
print("FastText similarity:", cosine_similarity(ft.get_word_vector('machine'), ft.get_word_vector('device')))

In [None]:
# Same three-way comparison for the pair 'may' / 'june'.
# cosine_similarity must still be the two-vector helper defined in this notebook when this cell runs.
print("Word2Vec similarity:", w2v.wv.similarity('may', 'june')) # similarity between two words
print("GloVe similarity:", model.similarity('may', 'june'))
print("FastText similarity:", cosine_similarity(ft.get_word_vector('may'), ft.get_word_vector('june')))


In [None]:
# Word2Vec: K-Means Clusters
from sklearn.cluster import KMeans

# Fix the seed: K-Means initialisation is random, so without it the cluster
# assignments (and the words printed below) change from run to run.
kmw = KMeans(n_clusters=50, random_state=0)
kmw.fit(w2v.wv.vectors)

# Print the words that share a cluster with 'invent', considering only
# vocabulary indices 0..100. Slicing avoids scanning the whole vocabulary.
invent_clust = kmw.labels_[w2v.wv.key_to_index['invent']]
for i, cluster in enumerate(kmw.labels_[:101]):
    if cluster == invent_clust:
        print(w2v.wv.index_to_key[i])

In [None]:
# Example of one tokenized training sentence (a list of stemmed tokens)
sentences[0]

In [None]:
# Build `docs`: one flat token list per document for the first 1000 descriptions
docs = []
for doc in patents_df['descriptionText'][:1000]:
    # Use a dedicated name here: reusing `sentences` would overwrite the
    # notebook-level training corpus with the last document's sentences.
    doc_sentences = get_sentences(doc)
    # flatten list of sentences into one list
    docs.append([token for sentence_tokens in doc_sentences for token in sentence_tokens])


In [None]:
# Flat token list of the first document
docs[0]

In [None]:
###
# Make document vectors by averaging word embeddings in a document
##
import numpy as np

# One entry per element of `docs`: a (1, dim) array holding the mean of the
# in-vocabulary word vectors, or np.nan when no token is in the vocabulary.
sentvecs = []
for doc_tokens in docs:
    known_vectors = [w2v.wv[token] for token in doc_tokens if token in w2v.wv]
    if known_vectors:
        sentvecs.append(np.mean(known_vectors, axis=0).reshape(1, -1))
    else:
        sentvecs.append(np.nan)
# First 30 components of the first document vector:
sentvecs[0][0][:30]

In [None]:
# NOTE: this import rebinds the name cosine_similarity, replacing the
# two-vector helper defined earlier in the notebook with scikit-learn's
# pairwise version. Re-running the earlier comparison cells after this one
# will call this version instead.
from sklearn.metrics.pairwise import cosine_similarity
# Similarity between the averaged vectors of the first two documents.
# Each entry of sentvecs was reshaped to one row, so [0][0] selects the single score.
cosine_similarity(sentvecs[0],
                  sentvecs[1])[0][0]

In [None]:
# Calculate the pairwise cosine similarity between all sentences:


In [None]:
# Keep only real vectors; the np.nan placeholders (plain floats) are discarded.
# NOTE: dropping entries shifts positions, so row k of the matrix below matches
# entry k of sentvecs only if no placeholder precedes it.
filtered_sentvecs = list(filter(lambda vec: not isinstance(vec, float), sentvecs))

# Stack the (1, dim) rows into one 2-D array
sentvecs_array = np.vstack(filtered_sentvecs)

# All-pairs cosine similarity between the stacked vectors
similarity_matrix = cosine_similarity(sentvecs_array)

# Display the first 5x5 block of the similarity matrix
similarity_matrix[:5, :5]

In [None]:
# Brute-force scan for the largest off-diagonal entry of the similarity matrix
max_similarity = -1
most_similar_sentences = None
n_rows, n_cols = similarity_matrix.shape
for i in range(n_rows):
    current_row = similarity_matrix[i]
    for j in range(n_cols):
        if i == j:
            continue  # self-similarity is not of interest
        candidate = current_row[j]
        if candidate > max_similarity:
            max_similarity = candidate
            most_similar_sentences = (i, j)


In [None]:
# Mask the diagonal on a copy, so similarity_matrix itself keeps its
# self-similarity entries and earlier cells show the same values when re-run.
masked_similarity = similarity_matrix.copy()
np.fill_diagonal(masked_similarity, -np.inf)

# Find the indices of the maximum value in the masked matrix
i, j = np.unravel_index(np.argmax(masked_similarity), masked_similarity.shape)

# The most similar sentences
most_similar_sentences = (i, j)
max_similarity = masked_similarity[i, j]

print(f"Most similar sentences are at indices: {most_similar_sentences} with similarity score: {max_similarity}")

In [None]:
# Extract BOTH indices of the most similar pair.
# The previous code took only the first index and compared it against row 0.
row_i, row_j = most_similar_sentences

# Entries without any in-vocabulary token were dropped before the similarity
# matrix was built, so translate matrix positions back to positions in
# sentvecs. These match patents_df row positions, because docs were built from
# the leading rows of patents_df in order.
kept_positions = [pos for pos, vec in enumerate(sentvecs) if not isinstance(vec, float)]
i, j = kept_positions[row_i], kept_positions[row_j]

# Display the titles and abstracts of the most similar pair
print("Title 1:", patents_df.iloc[i]['inventionTitle'])
print("Abstract 1:", patents_df.iloc[i]['abstractText'])
print("\nTitle 2:", patents_df.iloc[j]['inventionTitle'])
print("Abstract 2:", patents_df.iloc[j]['abstractText'])

In [None]:
# Display the titles and abstracts of rows 934 and 992.
# NOTE(review): these positions are hard-coded, presumably the pair reported by an
# earlier run. They may not match the current result after retraining; confirm.
print("Title 1:", patents_df.iloc[934]['inventionTitle'])
print("Abstract 1:", patents_df.iloc[934]['abstractText'])
print("\nTitle 2:", patents_df.iloc[992]['inventionTitle'])
print("Abstract 2:", patents_df.iloc[992]['abstractText'])