In [None]:
!pip install sentence-transformers
!pip install qdrant-client



In [None]:
# pip install qdrant-client
# pip install sentence-transformers

import os
from collections import Counter

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

In [None]:
#QdrantSetup
# The connection settings are read from the QDRANT_URL / QDRANT_API_KEY
# environment variables so that real credentials never have to be saved in the
# notebook. The original placeholders are kept as fallbacks; either set the
# variables or replace the placeholders before running.
client = QdrantClient(
    url=os.environ.get("QDRANT_URL", "URL_DE_SEU_CLUSTER"),
    api_key=os.environ.get("QDRANT_API_KEY", "SUA_API_KEY"),
)

In [None]:
# Load the dataset from the local JSON file into a pandas DataFrame.
patent_ds = pd.read_json('2015_USPTO.json')
# Bare last expression: the notebook renders a preview of the frame
# (the saved output shows the columns Subclass_labels, Abstract, Title and No).
patent_ds

Unnamed: 0,Subclass_labels,Abstract,Title,No
0,"[B64D, B64G]",a method of countering the effects of g forces...,aircrew ensembles,US08925112
1,"[A63B, A41D]",a protective hand covering has a membrane to c...,protective hand covering,US08925113
2,[A41C],a perspiration concealing brassiere having a b...,perspiration concealing brassiere,US08925114
3,"[A45F, A41D]",a first aid systems for an ultra compact first...,low profile medical kit,US08925115
4,"[A43B, A41D, A43C]",a decorative and or promotional accessory to b...,accessory for shoe laces hat brims and the like,US08925116
...,...,...,...,...
49895,"[G09G, G03B, H01S, H04N]",a wavelength conversion device includes an exc...,wavelength conversion device and image display...,US08976203
49896,"[H05K, H01R]",an electronic device including an electronic u...,cable assembly and electronic device,US08976510
49897,"[H04J, G01R]",a current detection module generates a first c...,circuit and method for metering electricity,US08976819
49898,"[H04Q, H04B, H04L]",embodiments enable a network operator to use a...,unified network management of hybrid fiber coa...,US08977126


In [None]:
# Name handed to Qdrant as `collection_name` by search(); also printed by the
# evaluation cell.
index_name = "patent-200" # Qdrant collection name

# Two candidate SBERT encoders, each paired with the size of the vectors it produces.
# NOTE(review): both models are instantiated here although only the one selected
# at the bottom of this cell is bound to `model`.
dimensions_768 = 768 #Size of embeddings
model_768 = SentenceTransformer('all-distilroberta-v1') #Loads the pre-trained SBERT model

dimensions_384 = 384 #Size of embeddings
model_384 = SentenceTransformer('all-MiniLM-L6-v2') #Loads the pre-trained SBERT model

# Active choice: `model` is what the evaluation loop uses to encode queries.
# Keep `dimensions` and `model` pointing at the same variant.
# NOTE(review): presumably the collection named above must hold vectors of this
# size; verify against how the collection was created.
dimensions = dimensions_384
model = model_384



In [None]:
def histogram(classes:list, k:int):
    """Return the ``k`` most frequent values of ``classes`` with their counts.

    Values are ranked by count, highest first; values with the same count are
    ranked by the value itself in descending order.

    Args:
        classes: Values to count. They must be hashable, and values that share
            a count must be comparable with each other.
        k: Maximum number of entries to return. A non-positive ``k`` yields an
            empty dict.

    Returns:
        A dict mapping value -> count whose insertion order goes from the most
        frequent to the least frequent of the selected values.
    """
    # Single pass over the input instead of one list.count() per distinct value.
    counts = Counter(classes)
    ranked = sorted(((count, value) for value, count in counts.items()), reverse=True)
    # max(k, 0) keeps a negative k from being read as "all but the last |k|".
    return {value: count for count, value in ranked[:max(k, 0)]}

In [None]:
def process_result(accuracy_dict, k, n, type):
    """Add one to the counter that ``accuracy_dict`` keeps for ``(k, n, type)``.

    The counter lives under the string key ``"<k>-<n>-<type>"`` and is created
    with the value 1 the first time that key is seen. The dict is modified in
    place; nothing is returned.
    """
    counter_key = "{}-{}-{}".format(k, n, type)
    accuracy_dict[counter_key] = accuracy_dict.get(counter_key, 0) + 1

In [None]:
def get_process_result(accuracy_dict, k, n, type):
    """Read the counter that ``accuracy_dict`` keeps for ``(k, n, type)``.

    Looks up the string key ``"<k>-<n>-<type>"`` and returns the stored value,
    or 0 when that key has never been recorded.
    """
    return accuracy_dict.get("{}-{}-{}".format(k, n, type), 0)

In [None]:
def print_process_result(accuracy_dict, k_list, n_list):
    """Print one summary line per ``(k, n)`` combination.

    For every ``k`` in ``k_list`` and ``n`` in ``n_list`` the 'positive' and
    'negative' counters are read from ``accuracy_dict`` and the accuracy
    ``positive / (positive + negative)`` is printed next to them.

    Args:
        accuracy_dict: Counter dict filled by ``process_result``.
        k_list: Iterable of k values to report.
        n_list: Iterable of n values to report.
    """
    for k in k_list:
        for n in n_list:
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            # With no recorded outcome the ratio is undefined: report NaN
            # instead of failing with ZeroDivisionError.
            accuracy = positive / total if total else float('nan')
            print("k={} - n={} - Positive: {} - Negative: {} - "
                "Accuracy: {} ".format(k,n,positive,negative,accuracy))

In [None]:
def transform_process_result(accuracy_dict, k_list, n_list):
    """Build the accuracy matrix for every ``(k, n)`` combination.

    Args:
        accuracy_dict: Counter dict filled by ``process_result``.
        k_list: Sequence of k values; one matrix row per entry, in order.
        n_list: Sequence of n values; one matrix column per entry, in order.

    Returns:
        A float NumPy array of shape ``(len(k_list), len(n_list))`` where cell
        ``[i][j]`` holds ``positive / (positive + negative)`` for
        ``k_list[i]`` and ``n_list[j]``. Cells with no recorded outcome are NaN
        (the ratio is undefined there) rather than raising ZeroDivisionError.
    """
    matrix = np.full((len(k_list), len(n_list)), np.nan)
    for i, k in enumerate(k_list):
        for j, n in enumerate(n_list):
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            if total:
                matrix[i][j] = positive / total
    return matrix

In [None]:
#Method to approximate search
def search(vector, limit=100):
    """Query the Qdrant collection ``index_name`` with an embedding vector.

    Args:
        vector: Query embedding, forwarded as ``query_vector``.
        limit: Maximum number of hits to request. Defaults to 100, the value
            that used to be hard-coded and the largest entry of the ``n_list``
            used by the evaluation loop.

    Returns:
        The result of ``client.search`` unchanged; the evaluation loop iterates
        over it and reads ``hit.payload`` from each element.
    """
    return client.search(
        collection_name=index_name,
        query_vector=vector,
        limit=limit
    )

In [None]:
#Performs queries to group returned patent subclasses for each input patent
print("Index name: ",index_name)
k_list = [1,2,3,4,5,6,7,8,9,10]  # how many of the most frequent subclasses count as predictions
n_list = [10,25,50,75,100]       # how many of the top search hits are taken into account
max_k = max(k_list)
max_queries = None  # set to an int (e.g. 2000) to evaluate only the first N patents
accuracy_dict = {}
hit_list = []
ctr_queries = 0
for index, row in patent_ds.iterrows():
    if max_queries is not None and ctr_queries >= max_queries:
        break
    ctr_queries += 1
    embedding = model.encode(row.Title+" "+row.Abstract).tolist()

    hits = search(embedding)

    # Subclass label payload of every hit, in the order the hits were returned.
    hit_list = [hit.payload['subclass_labels'] for hit in hits]
    hits_count = len(hit_list)

    print("Query id: "+str(ctr_queries)+" - Patent No: "+row.No+" - Subclasses: "+
        ';'.join(map(str, row.Subclass_labels))+" - Hits: "+str(hits_count)) #This last part concatenate the subclasses using the ";" character

    # The ranking of the subclasses found in the first n hits does not depend
    # on k, so it is computed once per n with the largest k; the top-k entries
    # for any smaller k are simply a prefix of that ranking.
    ranked_by_n = {n: list(histogram(hit_list[:n], max_k)) for n in n_list}

    for k in k_list:
        for n in n_list:
            top_k = set(ranked_by_n[n][:k])
            # One outcome per true subclass of the query patent: positive when
            # it is among the top-k subclasses of the first n hits.
            for subclass in row.Subclass_labels:
                outcome = 'positive' if subclass in top_k else 'negative'
                process_result(accuracy_dict, k, n, outcome)


print_process_result(accuracy_dict, k_list, n_list)
print("Accuracy by k and n")
matrix = transform_process_result(accuracy_dict, k_list, n_list)
print(matrix)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Query id: 44962 - Patent No: US08970192 - Subclasses: G05F;H02M - Hits: 100
Query id: 44963 - Patent No: US08970193 - Subclasses: G05F - Hits: 100
Query id: 44964 - Patent No: US08970194 - Subclasses: G05F - Hits: 100
Query id: 44965 - Patent No: US08970195 - Subclasses: G05F;H02M - Hits: 100
Query id: 44966 - Patent No: US08970196 - Subclasses: H02M - Hits: 100
Query id: 44967 - Patent No: US08970197 - Subclasses: G05F - Hits: 100
Query id: 44968 - Patent No: US08970199 - Subclasses: H02M - Hits: 100
Query id: 44969 - Patent No: US08970200 - Subclasses: G09G;H02M - Hits: 100
Query id: 44970 - Patent No: US08970201 - Subclasses: G01N;G01M;E02D - Hits: 100
Query id: 44971 - Patent No: US08970202 - Subclasses: G06F;G01R;B60K;G01N;F16H;G01B - Hits: 100
Query id: 44972 - Patent No: US08970203 - Subclasses: H03M - Hits: 100
Query id: 44973 - Patent No: US08970204 - Subclasses: G01R - Hits: 100
Query id: 44974 - Patent 