In [2]:
from google.colab import drive
from zipfile import ZipFile
import torch
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import json
import ast
import random
import xgboost as xgb
from sklearn.model_selection import train_test_split

drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [14]:
# Load citation dependency file and extract positive examples
citation_dependency_file = "gdrive/Shared drives/SWM 20/patent_dict_1000.csv"
df = pd.read_csv(citation_dependency_file)
positive_examples = []
edge_list_df = df.apply(
    lambda x: [positive_examples.append((x.patent, y)) for y in ast.literal_eval(x.citations)], 
    axis=1
)
all_1000_patents = []
for item in positive_examples:
  all_1000_patents.append(str(item[0]))
  all_1000_patents.append(str(item[1]))
all_1000_patents = set(all_1000_patents)
len(all_1000_patents)

70698

In [4]:
# Load embeddings file and check if all embeddings are valid
embeddings_file = "gdrive/Shared drives/SWM 20/Embeddings/universal_sentence_embeddings_2.json"
with open(embeddings_file) as f:
  dat = json.load(f)
  patents_embeddings_df = pd.DataFrame(dat)

patents_with_embeddings = list(patents_embeddings_df.index.unique())
patents_without_valid_embeddings = [x for x in patents_with_embeddings if (
    not patents_embeddings_df.loc[x].embedding or 
    len(patents_embeddings_df.loc[x].embedding) == 500
    )]

if len(patents_without_valid_embeddings) > 0:
  print("Some patents without embeddings found")
  [patents_embeddings_df.drop(x, inplace=True) for x in patents_without_valid_embeddings]

patents_with_valid_embeddings = set([x.split(".")[0] for x in patents_embeddings_df.index])
patents_with_valid_embeddings

del patents_without_valid_embeddings
del patents_with_embeddings
len(patents_embeddings_df)

Some patents without embeddings found


70621

Create Dataset containing positive and negative sample

In [5]:
# Filter patents in all_1000_patents that have embeddings
all_1000_patents_with_embeddings = set(all_1000_patents).intersection(patents_with_valid_embeddings)
len(all_1000_patents_with_embeddings)  / len(all_1000_patents)

# Find pairs that have embeddings
positive_examples_with_text = ([
  x
  for x in positive_examples
  if (
      str(x[0]) in all_1000_patents_with_embeddings and 
      str(x[1]) in all_1000_patents_with_embeddings
  )                               
])

# Contains 99.28% of positive_examples
# len(positive_examples_with_text) / len(positive_examples)

# Check if all embeddings have shape 512
sum([len(x) for x in patents_embeddings_df.embedding]) == len(patents_embeddings_df) * 512

True

In [0]:
# Extract features for positive case

# feature1 = np.array([x[0] for x in positive_examples_with_text])
# feat_1_embedding = np.array([np.array(patents_embeddings_df.loc[f"{x}.txt"].embedding) for x in feature1])
# feat_1_embedding.shape

# feature2 = np.array([x[1] for x in positive_examples_with_text])
# feat_2_embedding = np.array([np.array(patents_embeddings_df.loc[f"{x}.txt"].embedding) for x in feature2])
# feat_2_embedding.shape

# np.save("gdrive/My Drive/Embeddings/New Embeddings Universal/feature2embedding", feat_2_embedding)
# np.save("gdrive/My Drive/Embeddings/New Embeddings Universal/feature1embedding", feat_1_embedding)

In [55]:
## create negative examples:

# dependency_dict = {}
# def assign_to_dependency_dict(key, value):
#   dependency_dict[key] = value
# df.apply(
#     lambda x: assign_to_dependency_dict(x.patent, ast.literal_eval(x.citations)),
#     axis=1
# )
all_false_edges = []
count = 0
for patent, dependents in dependency_dict.items():
  patent = str(patent)
  if patent not in all_1000_patents_with_embeddings:
    # print(patent)
    continue
  # print("here")
  false_edges = []
  num_edges = max((70 * len(dependents)) // 100, 50)
  while len(false_edges) < num_edges:
    candidate = random.choice(all_1000_patents)
    if candidate != patent and candidate not in dependents and candidate in all_1000_patents_with_embeddings:
      false_edges.append(candidate)
      all_false_edges.append((str(patent), candidate))

# "3860003" in all_1000_patents_with_embeddings
len(all_false_edges)

96888

In [0]:
# neg_feature1 = np.array([x[0] for x in all_false_edges])
# neg_feat_1_embedding = np.array([np.array(patents_embeddings_df.loc[f"{x}.txt"].embedding) for x in neg_feature1])
# print(neg_feat_1_embedding.shape)

# neg_feature2 = np.array([x[1] for x in all_false_edges])
# neg_feat_2_embedding = np.array([np.array(patents_embeddings_df.loc[f"{x}.txt"].embedding) for x in neg_feature1])
# print(neg_feat_2_embedding.shape)

np.save("gdrive/My Drive/Embeddings/New Embeddings Universal/negfeature2embedding", neg_feat_2_embedding)
np.save("gdrive/My Drive/Embeddings/New Embeddings Universal/negfeature1embedding", neg_feat_1_embedding)

In [0]:
feat_1_embedding = np.load("gdrive/My Drive/Embeddings/New Embeddings Universal/feature1embedding.npy")
feat_2_embedding = np.load("gdrive/My Drive/Embeddings/New Embeddings Universal/feature2embedding.npy")
neg_feat_1_embedding = np.load("gdrive/My Drive/Embeddings/New Embeddings Universal/negfeature1embedding.npy")
neg_feat_2_embedding = np.load("gdrive/My Drive/Embeddings/New Embeddings Universal/negfeature2embedding.npy")

positives = np.concatenate((feat_1_embedding, feat_2_embedding), axis=1).reshape((138856, 1024))
negatives = np.concatenate((neg_feat_1_embedding, neg_feat_2_embedding), axis=1).reshape((96888, 1024))
features = np.concatenate((positives, negatives), axis=0)
labels = np.concatenate(
  (
      np.array([1] * positives.shape[0]),
      np.array([0] * negatives.shape[0])
  )  
)

In [0]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
   features, labels, test_size=0.33, random_state=42)
model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
model.fit(X_train, y_train)

test score:  0.8486084027654671
train score:  0.845454779166024


In [12]:
print("test score: ", model.score(X_train, y_train))
print("train score: ", model.score(X_test, y_test))

test score:  0.8486084027654671
train score:  0.845454779166024


In [0]:
model.save_model("gdrive/My Drive/Embeddings/New Embeddings Universal/xgb_model")