In [1]:
from google.colab import drive
from zipfile import ZipFile
import torch
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import json
import ast
import random
import xgboost as xgb
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split

# Mount Google Drive into the Colab filesystem at /content/gdrive.
# This prompts for an authorization code the first time it runs.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Read the notebook configuration from config.json in the working directory.
# Later cells look up embedding, citation and output file locations in it.
# (A stray bare `config` expression used to sit here; it raised NameError on
# a fresh kernel because it ran before `config` was assigned.)
with open("config.json") as f:
  config = json.load(f)

In [135]:
# Loading all embeddings

def load_embeddings(filepath):
  """Parse the JSON file at `filepath` and return its contents as a DataFrame."""
  with open(filepath) as handle:
    parsed = json.load(handle)
  frame = pd.DataFrame(parsed)
  return frame
  
# Embedding file locations are taken from the notebook config.
universal_embedding_path = config["universalEmbeddingLocation"]
bert_embedding_path = config["bertEmbeddingLocation"]

# NOTE(review): the BERT load below is commented out, yet later cells reference
# `bert_embeddings`. On a fresh kernel those cells raise NameError unless this
# line is re-enabled.
# bert_embeddings = load_embeddings(bert_embedding_path)
universal_embeddings = load_embeddings(universal_embedding_path)

# del universal_embeddings
# Preview the first rows of the loaded frame.
universal_embeddings.head()

Unnamed: 0,embedding
1446563.txt,"[[-0.012815116], [0.0148477275], [0.0148712862..."
1525015.txt,"[[0.0154291857], [-0.0182424523], [-0.00245026..."
1863216.txt,"[[-0.0030075712], [0.0033099782], [-0.00538926..."
1978631.txt,"[[0.00690126], [0.0033768883], [0.0246417094],..."
2027962.txt,"[[-0.015760392], [0.0233437903], [0.04123668],..."


In [137]:
# Checking all embeddings to ensure none are missing, empty or too short
def check_embeddings(embedding_df, min_length=512):
  """Return the index labels whose embedding is missing, empty or too short.

  Args:
    embedding_df: DataFrame with an `embedding` column holding one sized
      object (e.g. a list) per index label.
    min_length: Smallest acceptable `len()` of an embedding. Defaults to 512,
      the value that used to be hard-coded.

  Returns:
    List of index labels, in order of first appearance, whose embedding has
    no length (None, NaN or another scalar), is empty, or has fewer than
    `min_length` entries.
  """
  def _is_invalid(embedding):
    try:
      size = len(embedding)
    except TypeError:
      # None, NaN and other scalars have no length: treat them as missing
      # instead of letting len() abort the whole scan.
      return True
    return size == 0 or size < min_length

  # Look each row up once instead of twice per patent.
  return [
      patent
      for patent in embedding_df.index.unique()
      if _is_invalid(embedding_df.loc[patent].embedding)
  ]

# invalid_use = check_embeddings(universal_embeddings)
# invalid_bert_embeddings = check_embeddings(bert_embeddings)
# NOTE(review): `bert_embeddings` is not assigned by any visible cell (its load
# is commented out earlier), so this call depends on leftover kernel state.
# The result is displayed as the cell output rather than stored.
check_embeddings(bert_embeddings)

[]

In [138]:
## Getting nearest patents from dependents

# Load the citation dependency file. The code below reads a `patent` column and
# a `citations` column whose cells are Python-literal lists of cited patents.
# The read is active so the cell works on a fresh kernel; it was commented out,
# which left `df` undefined.
citation_dependency_file = config["patentsDictFileLocation"]
df = pd.read_csv(citation_dependency_file)

# Positive examples are (patent, cited patent) pairs, one per citation.
positive_examples = [
    (row.patent, cited)
    for row in df.itertuples(index=False)
    for cited in ast.literal_eval(row.citations)
]

# Every patent id that appears on either side of a pair, normalised to str.
all_1000_patents = {
    str(patent_id) for pair in positive_examples for patent_id in pair
}

# Embedding index labels look like "<patent id>.txt"; keep the part before the dot.
patents_with_valid_embeddings = {
    name.split(".")[0] for name in universal_embeddings.index
}

all_1000_patents_with_embeddings = all_1000_patents & patents_with_valid_embeddings

# Keep only the pairs where both patents have an embedding.
positive_examples_with_text = [
    pair
    for pair in positive_examples
    if (
        str(pair[0]) in all_1000_patents_with_embeddings and
        str(pair[1]) in all_1000_patents_with_embeddings
    )
]

# Maps each patent that has an embedding to the cited patents that also have one.
dependency_dict = {}
def assign_to_dependency_dict(key, value):
  """Record `key` -> filtered citations in `dependency_dict`.

  Args:
    key: Patent identifier. Skipped when `str(key)` has no embedding.
    value: Iterable of cited patent identifiers.

  Only citations with an embedding are kept. Identifiers are compared as
  strings because `all_1000_patents_with_embeddings` holds strings; without
  that, non-string ids (e.g. ints) would never match and every list would
  come out empty.
  """
  if str(key) not in all_1000_patents_with_embeddings:
    return
  dependency_dict[key] = [
      x for x in value if str(x) in all_1000_patents_with_embeddings
  ]

# A plain loop: the calls are made only for their side effect on dependency_dict.
for row in df.itertuples(index=False):
  assign_to_dependency_dict(row.patent, ast.literal_eval(row.citations))
[]

[]

In [0]:
# Index the BERT embeddings by the "patents" column. Guarded so the cell can be
# re-run: once "patents" has become the index it is no longer a column, and a
# second set_index call would raise KeyError.
if "patents" in bert_embeddings.columns:
  bert_embeddings = bert_embeddings.set_index("patents")

In [140]:
def get_knn_patents(dependency_dict, embeddings_df, k=10):
  """Rank each patent's dependents by Euclidean distance to the patent itself.

  Args:
    dependency_dict: Mapping of patent id -> list of dependent patent ids.
    embeddings_df: Frame whose index labels have the form "<patent id>.txt";
      rows are looked up with `.loc` and converted with `np.array`.
    k: Maximum number of nearest dependents to keep per patent.

  Returns:
    Dict mapping `str(patent id)` to a list of `(dependent id, distance)`
    tuples sorted by increasing distance, at most `k` long. Patents with no
    dependents map to an empty list. Distances are plain Python floats so the
    result can be passed to `json.dump`.
  """
  knn_patents = {}
  for key, val in tqdm(dependency_dict.items()):
    if not val:
      # There is nothing to rank, and the distance computation would
      # otherwise be handed an empty candidate matrix.
      knn_patents[str(key)] = []
      continue

    curr_embedding = np.array([embeddings_df.loc[f"{key}.txt"]]).reshape(1, -1)
    # Infer the embedding width instead of hard-coding 512, so frames with a
    # different dimensionality work too.
    dependents_embeddings = np.array(
        [embeddings_df.loc[f"{x}.txt"] for x in val]
    ).reshape(len(val), -1)
    dists = euclidean_distances(curr_embedding, dependents_embeddings)[0]

    top_k = dists.argsort()[:k]
    knn_patents[str(key)] = [(val[i], float(dists[i])) for i in top_k]
  return knn_patents

# Nearest dependents per patent using the universal embeddings (default k=10).
nearest_patents = get_knn_patents(dependency_dict, universal_embeddings)

100%|██████████| 994/994 [01:22<00:00, 12.08it/s]


In [None]:
# Write the BERT-based nearest dependents to the configured output file.
# json.dump needs the open file handle as its second argument; without it the
# call raises TypeError and leaves an empty file behind.
with open(config["bertNearest10"], "w") as f:
    json.dump(get_knn_patents(dependency_dict, bert_embeddings), f)

# Uncomment the following two lines for Universal Embedding

# with open(config["universalNearest10"], "w") as f:
#     json.dump(get_knn_patents(dependency_dict, universal_embeddings), f)