In [2]:
import blocking_hash as hash 
import blocking_ngram as ngram
import blocking_structured_and_sort as ss
import matchers as m
import similarity as sim
import cluster as c
import csv

In [2]:
import pandas as pd
# Locations of the two bibliographic datasets, relative to this notebook.
dblp_csv, acm_csv = '../CSV-files/dblp.csv', '../CSV-files/acm.csv'

# Read each CSV into its own DataFrame.
dblp, acm = (pd.read_csv(csv_path) for csv_path in (dblp_csv, acm_csv))

In [3]:
def similar_pairs_to_csv(similar_pairs, output_csv_file):
    """Write matched index pairs to a CSV file.

    The file starts with the header row ``dblp_index,acm_index`` and then
    contains one row per element of ``similar_pairs``.

    Args:
        similar_pairs: iterable of row-like items (e.g. ``(dblp_idx, acm_idx)``
            tuples); each item becomes one CSV row.
        output_csv_file: path of the file to create or overwrite.
    """
    column_names = ('dblp_index', 'acm_index')
    # newline='' lets the csv module control the line terminators itself.
    with open(output_csv_file, mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(column_names)
        out.writerows(similar_pairs)

def evaluate_similarity(baseline, comparison):
    """Score ``comparison`` against ``baseline`` with precision, recall and F-measure.

    Both arguments are turned into sets, so duplicates are ignored and the
    items must be hashable (e.g. index tuples).

    Args:
        baseline: iterable of pairs treated as the ground truth.
        comparison: iterable of pairs produced by the method under test.

    Returns:
        The ``str()`` of a dict with the keys ``precision``, ``recall`` and
        ``f_measure``. A metric whose denominator is zero is reported as the
        integer ``0``.
    """
    expected = set(baseline)
    predicted = set(comparison)

    true_pos = len(expected & predicted)
    false_pos = len(predicted - expected)
    false_neg = len(expected - predicted)

    def _ratio(numerator, denominator):
        # Guarded division: fall back to 0 when there is nothing to divide by.
        return numerator / denominator if denominator > 0 else 0

    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    f_measure = _ratio(2 * precision * recall, precision + recall)

    return str({'precision': precision, 'recall': recall, 'f_measure': f_measure})

For the baselines we make a row-wise comparison in which we compare the respective columns of the two datasets.
We deliberately do not use the id column, because the two datasets use different ids even for corresponding entities.

In [None]:
# Baselines at thresholds 0.7 and 0.85 (Jaccard similarity)

# Cast year to string before it is handed to the similarity function.
dblp['year'] = dblp['year'].astype(str)
acm['year'] = acm['year'].astype(str)

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']

# The results are stored under '../baselines/', the directory the notebook
# later reads the cached baselines from.
base_7_jac = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.jaccard_similarity)
similar_pairs_to_csv(base_7_jac, '../baselines/base_7_jac.csv')

base_85_jac = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.jaccard_similarity)
similar_pairs_to_csv(base_85_jac, '../baselines/base_85_jac.csv')



In [8]:
# Baselines at thresholds 0.7 and 0.85 (n-gram similarity).
# NOTE(review): the 'year' column is only cast to str in the Jaccard baseline
# cell — confirm this cell is always run after it.
selected_columns = ['author_names','paper_title', 'year', 'publication_venue']

# The results are stored under '../baselines/', the directory the notebook
# later reads the cached baselines from.
base_7_n = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.n_gram_similarity)
similar_pairs_to_csv(base_7_n, '../baselines/base_7_n.csv')

base_85_n = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.n_gram_similarity)
similar_pairs_to_csv(base_85_n, '../baselines/base_85_n.csv')

In [9]:
# Baselines at thresholds 0.7 and 0.85 using sim.levensthein_distance.
# NOTE(review): the function is named "distance" but its result is compared
# against a similarity threshold — confirm it returns a normalised similarity.
selected_columns = ['author_names','paper_title', 'year', 'publication_venue']

# The results are stored under '../baselines/', the directory the notebook
# later reads the cached baselines from.
base_7_lev = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.levensthein_distance)
similar_pairs_to_csv(base_7_lev, '../baselines/base_7_lev.csv')

base_85_lev = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.levensthein_distance)
similar_pairs_to_csv(base_85_lev, '../baselines/base_85_lev.csv')

In [4]:
# The baselines take a while to compute, so instead of recomputing them we
# load the saved CSV files and turn them back into lists of index pairs.
def reconstructed_pairs(path):
    """Load a saved pair file and return its rows as ``(dblp_index, acm_index)`` tuples.

    Args:
        path: CSV file containing the columns ``dblp_index`` and ``acm_index``.

    Returns:
        A list with one 2-tuple per CSV row, in file order.
    """
    pairs_frame = pd.read_csv(path)
    left_indices = pairs_frame['dblp_index']
    right_indices = pairs_frame['acm_index']
    return [pair for pair in zip(left_indices, right_indices)]

# Cached baseline files, in the same order as the names they are bound to below.
baseline_files = [
    '../baselines/base_7_jac.csv',
    '../baselines/base_85_jac.csv',
    '../baselines/base_7_n.csv',
    '../baselines/base_85_n.csv',
    '../baselines/base_7_lev.csv',
    '../baselines/base_85_lev.csv',
]

(
    base_7_jac, base_85_jac,
    base_7_n, base_85_n,
    base_7_lev, base_85_lev,
) = map(reconstructed_pairs, baseline_files)

In [5]:
# Read both datasets again from disk for this run.
dblp_csv, acm_csv = '../CSV-files/dblp.csv', '../CSV-files/acm.csv'
dblp = pd.read_csv(dblp_csv)
acm = pd.read_csv(acm_csv)

threshold = 0.9

# Eleven year values and ten labels; presumably bin edges and bin names for
# the year blocking — TODO confirm against block_by_year_and_publisher.
year_block = list(range(1995, 2006))
labels = [
    "1995", "1996", "1997", "1998", "1999",
    "2000", "2001", "2002", "2003", "2004",
]

hash_indices = ['hash_value']
ngram_indices = ['ngram_values']
selected_columns = ['author_names', 'paper_title']

# Block each dataset by year and publisher, then match within the blocks.
dblp_s, acm_s = (
    ss.block_by_year_and_publisher(frame, year_block, labels)
    for frame in (dblp, acm)
)
sorted_ap = m.apply_similarity_sorted(
    dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns
)

Processing time: 3.3457040786743164 seconds. Number of similar pairs: 1665


In [36]:
# Run every blocking strategy (n-gram with n=2 and n=3, hash, year/publisher)
# against freshly loaded data and collect the matched pairs of each one.
# The statements are order-dependent: names such as dblp_n2 / dblp_h are
# rebound between runs, and each matcher call is expected to print one
# "Processing time" line.

# Reload both datasets from disk (this yields fresh DataFrames, independent of
# any in-place column changes made in earlier cells).
dblp_csv = '../CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)

acm_csv = '../CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)

# NOTE(review): the threshold is 0.9, yet the result names below carry an
# `_07_` suffix and the results are later compared to the 0.7 baseline —
# confirm which value is intended.
threshold = 0.9

# Eleven year values and ten labels; presumably bin edges and bin names for
# the year blocking — TODO confirm against block_by_year_and_publisher.
year_block = [1995,1996,1997, 1998, 1999,2000,2001, 2002, 2003, 2004,2005]
labels = ["1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", "2004"]

# Keys handed to the matcher as the fields to compare inside each block.
hash_indices = ['hash_value']
ngram_indices = ['ngram_values']





# Columns the blocking functions build their keys from.
selected_columns = ['author_names', 'paper_title']

# --- 2-grams, initial_ngram variant ---
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_ap_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# --- 2-grams, n_gram_blocking variant (rebinds dblp_n2 / acm_n2) ---
# NOTE(review): this run passes `selected_columns` as the comparison keys,
# whereas the initial_ngram run above passes `ngram_indices`. The recorded
# output of this cell reports 0 similar pairs on its 2nd and 4th lines, which
# would correspond to the two n_gram_blocking runs — verify the keys against
# the structure n_gram_blocking returns.
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_ap_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# --- 3-grams, initial_ngram variant ---
dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_ap_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# --- 3-grams, n_gram_blocking variant (rebinds dblp_n3 / acm_n3) ---
# NOTE(review): same `selected_columns` vs `ngram_indices` question as for the
# 2-gram n_gram_blocking run.
dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_ap_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# --- hash blocking, initial_hash variant ---
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_ap_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# --- hash blocking, hash_blocking variant (rebinds dblp_h / acm_h) ---
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_ap_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# --- year/publisher blocking ---
# NOTE(review): this cell calls apply_similarity_sorted2, while the earlier
# sorted-blocking cell calls apply_similarity_sorted — confirm the difference
# is intended.
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_ap = m.apply_similarity_sorted2(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)




Processing time: 10.310678005218506 seconds. Number of similar pairs: 1235
Processing time: 27.914609909057617 seconds. Number of similar pairs: 0
Processing time: 9.807164907455444 seconds. Number of similar pairs: 1235
Processing time: 33.01195693016052 seconds. Number of similar pairs: 0
Processing time: 8.09684133529663 seconds. Number of similar pairs: 1235
Processing time: 13.279062986373901 seconds. Number of similar pairs: 1099
Processing time: 5.1999640464782715 seconds. Number of similar pairs: 1665


In [37]:
# Score each blocking result against the 0.7 Jaccard baseline, one line per method.
candidates = [
    initial_n2_ap_07_jac,
    initial_n3_ap_07_jac,
    initial_h_ap_jac,
    h_ap_jac,
    sorted_ap,
]
result_combined = "\n".join(
    evaluate_similarity(base_7_jac, candidate) for candidate in candidates
)

print(result_combined)


{'precision': 0.9433198380566802, 'recall': 0.7609405617243632, 'f_measure': 0.8423716558206797}
{'precision': 0.9433198380566802, 'recall': 0.7609405617243632, 'f_measure': 0.8423716558206797}
{'precision': 0.9433198380566802, 'recall': 0.7609405617243632, 'f_measure': 0.8423716558206797}
{'precision': 0.9770220588235294, 'recall': 0.6943174395819726, 'f_measure': 0.8117602138220695}
{'precision': 0.8994581577363034, 'recall': 0.9758327890267798, 'f_measure': 0.9360902255639099}


Cosine Similarity: 0.0


In [7]:
def blocks_to_df(blocks):
    """Convert every block into its own pandas DataFrame.

    Args:
        blocks: iterable of blocks; each block must be something
            ``pd.DataFrame`` accepts (e.g. a list of record dicts).

    Returns:
        A list of DataFrames, one per block, in iteration order.
    """
    return [pd.DataFrame(block) for block in blocks]

def block_dfs(dataframes, blocking_function, *args):
    """Apply a blocking function to each DataFrame and merge the results.

    Args:
        dataframes: iterable of DataFrames to block.
        blocking_function: callable invoked as ``blocking_function(df, *args)``;
            its result is merged with ``dict.update``.
        *args: extra positional arguments forwarded to ``blocking_function``.

    Returns:
        One dict holding the entries of all per-DataFrame results. When two
        results share a key, the later one replaces the earlier one.
    """
    merged = {}
    for frame in dataframes:
        # NOTE(review): identical block keys coming from different DataFrames
        # overwrite each other here — confirm that is the intended behaviour.
        merged.update(blocking_function(frame, *args))
    return merged

In [6]:
import time
def apply_similarity_sw(blocks1, blocks2, threshold, similarity_function, indices):
    """Compare the records of two block dictionaries and collect similar pairs.

    The i-th block of ``blocks1`` is compared with the i-th block of
    ``blocks2`` (dictionary iteration order); every record of one block is
    compared with every record of the other.

    Args:
        blocks1: dict mapping a block key to an iterable of records. Each
            record must support ``.get(field, default)`` and ``['index']``.
        blocks2: dict with the same structure for the second dataset.
        threshold: minimum score for a pair to be reported.
        similarity_function: callable taking the two lists of field values
            (one list per record, ordered like ``indices``) and returning a
            numeric similarity.
        indices: names of the record fields that are compared; a missing
            field is replaced by ``''``.

    Returns:
        List of ``(record1['index'], record2['index'])`` tuples whose score is
        at least ``threshold``.

    Side effects:
        Prints one line with the elapsed time and the number of pairs found.
    """
    similar_pairs = []

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()

    # NOTE(review): blocks are paired by position, the block keys are not
    # compared — confirm both dictionaries are guaranteed to share key order.
    for block1, block2 in zip(blocks1.values(), blocks2.values()):
        for elem1 in block1:
            # The values of elem1 do not depend on elem2; extract them once.
            value_block1 = [elem1.get(i, '') for i in indices]

            for elem2 in block2:
                value_block2 = [elem2.get(i, '') for i in indices]

                average_similarity = similarity_function(value_block1, value_block2)

                # NOTE(review): a single similarity value is divided by the
                # number of fields; kept as in the original — confirm whether a
                # per-field average was intended.
                if len(indices) > 1:
                    average_similarity /= len(indices)

                if average_similarity >= threshold:
                    similar_pairs.append((elem1['index'], elem2['index']))

    elapsed_time = time.perf_counter() - start_time
    print(f"Processing time: {elapsed_time} seconds. Number of similar pairs: {len(similar_pairs)}")

    return similar_pairs

In [8]:
import blocking_hash as hash 
import blocking_ngram as ngram
import blocking_structured_and_sort as ss
import matchers as m
import similarity as sim
import csv


# Two-stage blocking: first block by year and publisher (the sorted-neighbourhood
# step), then block each of those blocks again with n-grams.
dblp_csv, acm_csv = '../CSV-files/dblp.csv', '../CSV-files/acm.csv'
dblp = pd.read_csv(dblp_csv)
acm = pd.read_csv(acm_csv)

threshold = 0.7

year_block = list(range(1995, 2006))
labels = [
    "1995", "1996", "1997", "1998", "1999",
    "2000", "2001", "2002", "2003", "2004",
]

n = 2
ngram_indices = ['ngram_values']
selected_columns = ['author_names', 'paper_title']

# DBLP: year/publisher blocks -> DataFrames -> n-gram blocks.
db = block_dfs(
    blocks_to_df(ss.block_by_year_and_publisher(dblp, year_block, labels)),
    ngram.initial_ngram, n, selected_columns,
)

# ACM: same pipeline.
ac = block_dfs(
    blocks_to_df(ss.block_by_year_and_publisher(acm, year_block, labels)),
    ngram.initial_ngram, n, selected_columns,
)

sorted_ap = m.apply_similarity_sorted_dictionary(
    db, ac, threshold, sim.jaccard_similarity_ngrams, ngram_indices
)

Processing time: 0.41853904724121094 seconds. Number of similar pairs: 2184


In [None]:
import csv

def read_matched_entities(file_path):
    """Read a pair file and return its data rows.

    The first line is treated as a header and skipped. Values are returned as
    the csv module reads them, i.e. as strings: a row looks like
    ``['1232', '2323']``.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        List of rows, each row a list of strings. An empty file yields ``[]``.
    """
    # newline='' is what the csv module expects for files it parses.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        return list(reader)

def print_clusters(clusters):
    """Print every cluster on its own line, numbered from 1.

    Args:
        clusters: iterable of clusters; each one is printed with its default
            string representation.
    """
    for number, cluster in enumerate(clusters, start=1):
        print(f'Cluster {number}: {cluster}')


# Read the matched pairs saved in the base_7_jac baseline file, build clusters
# from them with cluster.build_clusters and print the result.
file_path = '../baselines/base_7_jac.csv'
# Rows come back as lists of strings (csv.reader output), not integers.
matched_entities = read_matched_entities(file_path)
clusters = c.build_clusters(matched_entities)
print_clusters(clusters)


TODO: The other ER is too chaotic -> ask on Wednesday how to do it properly, then do it here.
Planned structure: the baseline first, then the respective matchers to compare against it, then compare the results and keep the best ones.