In [14]:
import numpy as np
import pandas as pd

df = pd.read_csv('patents.csv')

patent_number = df['publication_number'].to_numpy()
patent_features = df['patent_embedding'].to_numpy()
temp = []
for i in range(patent_features.size):
    s = str(patent_features[i]) 
    s1 = s.replace(r'\n', '')
    temp.append(
        np.array(s.split()[1:-1], dtype='float')[:16]
    )

patent_features = np.stack(temp)
print(patent_features)
patent_category = df['category']

[[ 0.00135472  0.01564001 -0.04858465 ... -0.00588488  0.03504592
  -0.05058029]
 [ 0.02487706 -0.05628379  0.01121092 ...  0.00514467 -0.07313328
  -0.00974522]
 [-0.0204547   0.00863679  0.096082   ...  0.03805348 -0.04626897
  -0.00725784]
 ...
 [ 0.02963509 -0.02319208  0.07132088 ...  0.0203206   0.00058703
  -0.03183366]
 [ 0.01027578 -0.02018255  0.05681729 ...  0.02291106 -0.04222927
  -0.05573821]
 [ 0.03079691 -0.02468791  0.08207965 ...  0.04529444 -0.04848425
  -0.02298722]]


### max_distance_patent

In [15]:

euclidean_distances = np.linalg.norm(patent_features, axis=1)


max_distance_patent_index = np.argmax(euclidean_distances)
max_distance_patent = df.iloc[max_distance_patent_index]

print("max_distance_patent")
print(max_distance_patent)

max_distance_patent
publication_number                                          CH-527846-A
title                                             penicillanylaldehydes
cpc_code                                                     C07D499/00
patent_embedding      [-1.58957148e-03  3.83572765e-02 -1.47625625e-...
category                                                              5
Name: 10839, dtype: object


## Two patents with the maximum Euclidean distance

In [27]:
patent_features=patent_features[:5000]
# Calculate Euclidean distances between all pairs of patents
distances = np.linalg.norm(patent_features[:, np.newaxis] - patent_features, axis=-1)
num_neighbors = np.argmax(distances, axis=1)

print(num_neighbors)

# Find the indices of the two patents with the maximum distance
max_distance_indices = np.unravel_index(np.argmax(distances), distances.shape)

patent1_index, patent2_index = max_distance_indices
patent1 = df.iloc[patent1_index]
patent2 = df.iloc[patent2_index]

print("Two patents with the maximum Euclidean distance: ",max_distance_indices)
print("Patent 1:")
print(patent1)
print("Patent 2:")
print(patent2)


[1433 4905 2635 ... 1661 4012  592]
Two patents with the maximum Euclidean distance:  (1661, 4012)
Patent 1:
publication_number                                      KR-100793527-B1
title                                                          abrasive
cpc_code                                                       C09G1/02
patent_embedding      [-3.89078408e-02 -3.91889922e-02 -1.55463070e-...
category                                                              5
Name: 1661, dtype: object
Patent 2:
publication_number                                      JP-2018025642-A
title                                                        microscope
cpc_code                                                     G02B21/088
patent_embedding      [ 8.91560167e-02  5.98845594e-02 -4.88110306e-...
category                                                              2
Name: 4012, dtype: object


In [22]:
# Calculate the distance matrix between all patents
distances = np.linalg.norm(patent_features[:, np.newaxis] - patent_features, axis=-1)
print(distances)


# Find the number of closest neighbors for each patent
num_neighbors = np.argmin(distances, axis=1)

print(num_neighbors)

# Calculate the number of patents that have the closest neighbor in the same category
num_same_category = sum(patent_category[i] == patent_category[j] for i, j in enumerate(num_neighbors))

print("Number of patents with the closest neighbor in the same category:", num_same_category)

[[0.         0.245627   0.26009143 ... 0.15029349 0.20039899 0.10186195]
 [0.245627   0.         0.22291979 ... 0.31589119 0.23150106 0.26945603]
 [0.26009143 0.22291979 0.         ... 0.27262679 0.16427144 0.25019372]
 ...
 [0.15029349 0.31589119 0.27262679 ... 0.         0.22647368 0.13184871]
 [0.20039899 0.23150106 0.16427144 ... 0.22647368 0.         0.21055265]
 [0.10186195 0.26945603 0.25019372 ... 0.13184871 0.21055265 0.        ]]
[   0    1    2 ... 4997 4998 4999]
Number of patents with the closest neighbor in the same category: 5000
