In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from datasets import load_from_disk
from transformers import LongformerTokenizerFast

In [20]:
tokenized_data = load_from_disk("../data/refined_patents/tokenized/longformer_tokenizer/train")

df = pd.DataFrame(columns=['patent_id','text', 'labels'])

df['patent_id'] = tokenized_data['patent_id'][:10000]
df['labels'] = tokenized_data['labels'][:10000]
df['text'] = tokenized_data['input_ids'][:10000]
del tokenized_data

In [21]:
general = pd.read_pickle('../data/refined_patents/tfidf/longformer_tokenizer/train_tfidf.pkl')

In [22]:
general = general[:10000]

In [3]:
label_based = pd.read_pickle('../data/refined_patents/tfidf/label_based/train_tfidf.pkl')

In [16]:
s=0
for i in tqdm(range(general.shape[0])):
    id = [general.loc[i].patent_id]
    general_tfidf_sum = general.loc[i][1:].sum()
    label_based_tfidf_sum = label_based[label_based.patent_id.isin(id)].iloc[0][1:].sum()
    s += abs(general_tfidf_sum-label_based_tfidf_sum)
print(s/general.shape[0])

100%|██████████| 10000/10000 [04:43<00:00, 35.30it/s]

0.16010481436252594





In [23]:
kmeans = KMeans(n_clusters=8, random_state=42)

In [24]:
X = general.drop(columns=['patent_id']).to_numpy()

In [25]:
kmeans.fit(X)

KMeans(random_state=42)

In [26]:
clusters = kmeans.labels_

In [27]:
pca = PCA(n_components=2, random_state=42)

In [28]:
pca_vecs = pca.fit_transform(X)

In [29]:
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]

In [30]:
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1

In [31]:
terms = general.columns[1:]

In [32]:
data = pd.DataFrame(X).groupby(clusters).mean()

In [15]:
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length=4096)

In [19]:
#Preprocessed
for i,r in data.iterrows():
    print('\nCluster {}'.format(i))
    ids = [int(terms[t]) for t in np.argsort(r)[:20]]

    print(tokenizer.convert_ids_to_tokens(ids))


Cluster 0
['latable', 'Ġdont', 'Ġvirtues', 'rella', 'Ġdocks', 'Ġmarriage', 'Ġcaves', 'erous', 'Ġseaf', 'Ġeerie', 'Ġpunishments', 'Ġbona', 'Ġstripe', 'Ġnefarious', 'iscover', 'Ġfamous', 'Ġclerks', 'testing', 'Ġcollapses', 'Ġrarity']

Cluster 1
['latable', 'relations', 'Ġengulf', 'Ġacad', 'Ġship', 'Ġhops', 'thening', 'Ġrecollection', 'rapnel', 'ateral', 'Ġanew', 'Ġfoliage', 'Ġnumer', 'itably', 'der', 'Ġfatig', 'wolf', 'vice', 'Ġhilar', 'Ġeyed']

Cluster 2
['latable', 'Ġbearer', 'Ġbegins', 'Ġflips', 'Ġcurl', 'rael', 'udden', 'Ġcirculate', 'Ġraven', 'Ġbraces', 'cerned', 'Ġmisunderstand', 'Ġreinforcement', 'Ġmonths', 'Ġuploading', 'allion', 'Ġecstasy', 'fail', 'Ġgorilla', 'orc']

Cluster 3
['Ġfleet', 'Ġeste', 'orie', 'Ġbalcony', 'Ġincredibly', 'Ġinsightful', 'Ġsolemn', 'ahon', 'Ġsmuggled', 'Ġhated', 'Ġcondemnation', 'Ġeater', 'Ġstaffer', 'intelligence', 'Ġsubmitting', 'uten', 'Ġsubsistence', 'arrell', 'Ġhardships', 'Ġinning']

Cluster 4
['latable', 'Ġhilar', 'Ġeyed', 'ool', 'indu', 'Ġgende

In [33]:
#Classic
for i,r in data.iterrows():
    print('\nCluster {}'.format(i))
    ids = [int(terms[t]) for t in np.argsort(r)[:20]]

    print(tokenizer.convert_ids_to_tokens(ids))



Cluster 0
['Ġnav', 'Ġsparkling', 'Ġ1945', 'allas', 'existence', 'Ġfasc', 'Ġexpectations', 'stration', 'Ġdissemination', 'Ġlantern', 'alis', 'irled', 'utra', 'dule', '467', 'enforcement', 'Ġtweet', 'Ġlegends', 'Ġdab', 'Ġpric']

Cluster 1
['Ġnav', 'Ġdirectives', 'olerance', 'Ġassailants', 'ussen', 'Ġaggrav', 'Ġtame', 'Ġunfit', 'Ġpastry', 'cards', 'Ġdefy', 'Ġquickest', 'Ġupheaval', 'Ġspoiled', 'baby', 'athom', 'umbs', 'affles', 'ryan', 'Ġepidem']

Cluster 2
['eland', 'neau', 'Ġsir', 'Ġjealousy', 'ardon', 'Ġpsych', 'Ġunin', 'quist', 'eful', 'iscovery', 'idy', 'yx', 'Ġinvaluable', 'Ġtranscription', 'Ġproxies', 'Ġservings', 'alian', 'Ġhonor', 'Ġthreatened', 'Ġpra']

Cluster 3
['eland', 'Ġdistressed', 'istent', 'container', 'Ġpayable', 'ounters', 'Ġclimate', 'bably', 'izabeth', 'Ġquint', 'Ġstray', 'inctions', 'Ġelusive', 'Ġgrandma', 'Ġgadgets', 'Ġlandslide', 'assert', 'Ġfaintly', 'license', 'gets']

Cluster 4
['Ġnav', '393', 'irled', 'pun', 'Ġlantern', 'É', 'Ġpigs', 'approved', 'Ġdisseminati