In [1]:
# !pip install simcse -q

In [3]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from simcse import SimCSE

In [4]:
# import model for embeddings 
bert_sup = SimCSE('princeton-nlp/sup-simcse-bert-base-uncased')

In [5]:
from preprocessing_datasets.preprocessing_itunes_amazon import clean_amazon_itunes
table, pairs = clean_amazon_itunes()

In [6]:
table.head()

Unnamed: 0,Song_Name,Artist_Name,Album_Name,Genre,Price,CopyRight,Time,Released
0,runaway train,cam,welcome to cam country - ep,"country , music , contemporary country , honky...",$ 0.99,2015 sony music entertainment,3:01,31-mar-15
1,track 14,omi,me 4 u,"pop/rock , music , pop , dance , r&b / soul",album only,"2015 ultra records , llc under exclusive licen...",3:41,unk
2,lips are movin,meghan trainor,title,"pop , music , pop/rock , dance , rock , teen pop",$ 1.29,"2014 , 2015 epic records , a division of sony ...",3:01,9-jan-15
3,i want a hippopotamus for christmas,a great big world,i 'll be home for christmas,"holiday , music",$ 1.29,"compilation ( p ) 2014 epic records , a divisi...",2:20,24-nov-14
4,credit,meghan trainor,title ( deluxe ),"pop , music , rock , pop/rock , dance , teen pop",$ 1.29,"2014 , 2015 epic records , a division of sony ...",2:51,9-jan-15


In [8]:
attributes = ['Song_Name', 'Artist_Name', 'Album_Name']
record_set = table[attributes].agg(' '.join, axis=1)

In [9]:
start = time.time()
vectors = bert_sup.encode(list(record_set), device="cuda", max_length=256)
print("TIME: {0}".format(time.time() - start))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 982/982 [00:24<00:00, 39.66it/s]

TIME: 29.063883066177368





# best run


In [8]:
algorithm = "louvain"
time_list = []
for num_clusters in [14]*10:
    """loop to evaluate cluster performance"""
    # make imports
    from evaluation import calc_index
    from graph_clustering.knn_graph_clusteriser import all_in_one_clusteriser
    key_values = {}
    print()
    key_values["num_clusters"] = num_clusters
    print(f"building blocks with: ",key_values["num_clusters"], "clusters")
    start = time.time()
    data = all_in_one_clusteriser(vectors, algorithm, num_clusters)
    reduction_ratio,pair_completeness,reference_metric,pair_quality,fmeasure=calc_index(data,table,pairs)
    print("(RR) Reduction ratio is: {0}".format(reduction_ratio))
    print("(PC) Pair completeness is: {0}".format(pair_completeness))
    print("(RM) Reference metric (Harmonic mean RR and PC) is: {0}".format(reference_metric))
    print("(PQ) Pair quality - Precision is: {0}".format(pair_quality))
    print("(FM) Fmeasure is: {0}".format(fmeasure))
    end = time.time()
    blocking_time = end - start
    time_list.append(blocking_time) # blocking time for 30 loops
    print(">> Blocking time was roughly {} seconds for {} tuples!".format(blocking_time,vectors.shape[0]))
    print("*"*50)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool'}
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}

building blocks with:  14 clusters
using louvian clusterizer
Graph with 62830 nodes and 626965 edges
graph constructed in: 144.18136620521545  seconds
(RR) Reduction ratio is: 0.9925718617389055
(PC) Pair completeness is: 0.8712121212121212
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9279408397445579
(PQ) Pair quality - Precision is: 7.843694137125098e-06
(FM) Fmeasure is: 1.568724703887866e-05
>> Blocking time was roughly 156.17388010025024 seconds for 62830 tuples!
**********************

In [9]:
print("average time :", np.average(time_list))

average time : 160.94419350624085


# KNN graph clustering

using original vectors

### LOUVAIN_clusteriser

In [14]:
vectors.shape

torch.Size([62830, 768])

In [15]:
# clusterize using louvain or leiden 
algorithm = "louvain"
dataset = 'DBLP-Scholar-clean'
model = "bert_supCon"
stage = "stage_1"
results = pd.DataFrame()
time_list = []
RR = []
PC = []
RM = []
for num_clusters in range(2,30):
    """loop to evaluate cluster performance"""
    # make imports
    from evaluation import calc_index
    from graph_clustering.knn_graph_clusteriser import all_in_one_clusteriser
    key_values = {}
    print()
    key_values["num_clusters"] = num_clusters
    print(f"building blocks with: {num_clusters} clusters")
    start = time.time()
    data = all_in_one_clusteriser(vectors, algorithm, num_clusters)
    reduction_ratio,pair_completeness,reference_metric,pair_quality,fmeasure=calc_index(data,table,pairs)
    print("(RR) Reduction ratio is: {0}".format(reduction_ratio))
    print("(PC) Pair completeness is: {0}".format(pair_completeness))
    print("(RM) Reference metric (Harmonic mean RR and PC) is: {0}".format(reference_metric))
    print("(PQ) Pair quality - Precision is: {0}".format(pair_quality))
    print("(FM) Fmeasure is: {0}".format(fmeasure))
    end = time.time()
    blocking_time = end - start
    RR.append(reduction_ratio)
    PC.append(pair_completeness)
    RM.append(reference_metric)
    time_list.append(blocking_time) # blocking time for 30 loops
    print(">> Blocking time was roughly {} seconds for {} tuples!".format(blocking_time,vectors.shape[0]))
    print("*"*50)
    
results["RR"] = RR
results["PC"] = PC
results["RM"] = RM
results["n_neigh"] = range(2,30)
print("Average blocking time is: {}".format(np.average(time_list)))
print("-----------------<<<>>>------------------")
print(f'best results as follows; \n{results.iloc[results["RM"].idxmax()]}')
print(f'dumping or updating results......')
# handle to automatically create paths and save results to csv
# in specified path
directory = "f-measure-results/{}/{}/{}".format(model,dataset, stage)
if not os.path.exists(directory):
    os.makedirs(directory)
    
results.to_csv(directory+"/{}.csv".format(algorithm))
scheme_df = pd.DataFrame(data={"louvain":time_list}) # running times
print(f'done!')
print("check {}.csv file at: {}".format(algorithm,directory))

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool'}
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}

building blocks with: 2 clusters
using louvian clusterizer
Graph with 62830 nodes and 111886 edges
graph constructed in: 80.89920711517334  seconds
(RR) Reduction ratio is: 0.9998772705900302
(PC) Pair completeness is: 0.4318181818181818
(RM) Reference metric (Harmonic mean RR and PC) is: 0.6031522755782367
(PQ) Pair quality - Precision is: 0.00023530383091149273
(FM) Fmeasure is: 0.0004703513607182348
>> Blocking time was roughly 81.05881977081299 seconds for 62830 tuples!
*************************

Graph with 62830 nodes and 754655 edges
graph constructed in: 154.33047938346863  seconds
(RR) Reduction ratio is: 0.9895953042037582
(PC) Pair completeness is: 0.8636363636363636
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9223352965797532
(PQ) Pair quality - Precision is: 5.551089795791963e-06
(FM) Fmeasure is: 1.1102108231921094e-05
>> Blocking time was roughly 175.9934527873993 seconds for 62830 tuples!
**************************************************

building blocks with: 18 clusters
using louvian clusterizer
Graph with 62830 nodes and 797772 edges
graph constructed in: 159.84939217567444  seconds
(RR) Reduction ratio is: 0.9898104814265031
(PC) Pair completeness is: 0.8712121212121212
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9267323114710433
(PQ) Pair quality - Precision is: 5.71803703070392e-06
(FM) Fmeasure is: 1.1435999003377548e-05
>> Blocking time was roughly 181.3801772594452 seconds for 62830 tuples!
***********************************************

## LEDEN_clusteriser

In [17]:

# clusterise using louvain, leiden or markov
algorithm = "leiden"
dataset = 'DBLP-Scholar-clean'
model = "bert_supCon"
stage = "stage_1"
results = pd.DataFrame()
time_list = []
RR = []
PC = []
RM = []
for num_clusters in range(2,30):
    """loop to evaluate cluster performance"""
    # make imports
    from evaluation import calc_index
    from graph_clustering.knn_graph_clusteriser import all_in_one_clusteriser
    key_values = {}
    print()
    key_values["num_clusters"] = num_clusters
    print(f"building blocks with: {num_clusters} clusters")
    start = time.time()
    data = all_in_one_clusteriser(vectors, algorithm, num_clusters)
    reduction_ratio,pair_completeness,reference_metric,pair_quality,fmeasure=calc_index(data,table,pairs)
    print("(RR) Reduction ratio is: {0}".format(reduction_ratio))
    print("(PC) Pair completeness is: {0}".format(pair_completeness))
    print("(RM) Reference metric (Harmonic mean RR and PC) is: {0}".format(reference_metric))
    print("(PQ) Pair quality - Precision is: {0}".format(pair_quality))
    print("(FM) Fmeasure is: {0}".format(fmeasure))
    end = time.time()
    blocking_time = end - start
    RR.append(reduction_ratio)
    PC.append(pair_completeness)
    RM.append(reference_metric)
    time_list.append(blocking_time) # blocking time for 30 loops
    print(">> Blocking time was roughly {} seconds for {} tuples!".format(blocking_time,vectors.shape[0]))
    print("*"*50)
    
results["RR"] = RR
results["PC"] = PC
results["RM"] = RM
results["n_neigh"] = range(2,30)
print("Average blocking time is: {}".format(np.average(time_list)))
print("-----------------<<<>>>------------------")
print(f'best results as follows; \n{results.iloc[results["RM"].idxmax()]}')
print(f'dumping or updating results......')
# handle to automatically create paths and save results to csv
# in specified path
directory = "f-measure-results/{}/{}/{}".format(model,dataset, stage)
if not os.path.exists(directory):
    os.makedirs(directory)
    
results.to_csv(directory+"/{}.csv".format(algorithm)) # precison, recall and f1- score
print(f'done!')
print("check {}.csv file at: {}".format(algorithm,directory))


building blocks with: 2 clusters
using leiden clusterizer
Graph with 62830 nodes and 111886 edges
graph constructed in: 71.33163857460022  seconds
blocking completed in: 4.2601165771484375  seconds
(RR) Reduction ratio is: 0.9998772705900302
(PC) Pair completeness is: 0.4318181818181818
(RM) Reference metric (Harmonic mean RR and PC) is: 0.6031522755782367
(PQ) Pair quality - Precision is: 0.00023530383091149273
(FM) Fmeasure is: 0.0004703513607182348
>> Blocking time was roughly 84.98406744003296 seconds for 62830 tuples!
**************************************************

building blocks with: 3 clusters
using leiden clusterizer
Graph with 62830 nodes and 158845 edges
graph constructed in: 74.46150279045105  seconds
blocking completed in: 2.0944221019744873  seconds
(RR) Reduction ratio is: 0.9985968954125468
(PC) Pair completeness is: 0.5909090909090909
(RM) Reference metric (Harmonic mean RR and PC) is: 0.7424696587880173
(PQ) Pair quality - Precision is: 2.8164843775388982e-05
(F

Graph with 62830 nodes and 711768 edges
graph constructed in: 93.27806305885315  seconds
blocking completed in: 12.944490671157837  seconds
(RR) Reduction ratio is: 0.9903166135816623
(PC) Pair completeness is: 0.8636363636363636
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9226484700660927
(PQ) Pair quality - Precision is: 5.964586991329949e-06
(FM) Fmeasure is: 1.1929091596012522e-05
>> Blocking time was roughly 125.72199821472168 seconds for 62830 tuples!
**************************************************

building blocks with: 17 clusters
using leiden clusterizer
Graph with 62830 nodes and 754655 edges
graph constructed in: 91.97013783454895  seconds
blocking completed in: 13.6752450466156  seconds
(RR) Reduction ratio is: 0.989020417942836
(PC) Pair completeness is: 0.8560606060606061
(RM) Reference metric (Harmonic mean RR and PC) is: 0.9177498520399705
(PQ) Pair quality - Precision is: 5.2142929028905316e-06
(FM) Fmeasure is: 1.0428522285313755e-05
>> Blocking time was 