Code für Blocking mit Intervall + Key (publisher)

In [1]:
import pandas as pd
def block_by_year_ranges(df):
    year_ranges = []
    year_block = [1995, 1998, 2001, 2005]
    labels = ["1995-1997", "1998-2000", "2001-2004"]

    df["year_range"] = pd.cut(df['year'], bins=year_block, labels=labels, right=False)
    for label in labels:
        intervals = df[df["year_range"] == label].to_numpy()
        year_ranges.append(intervals)

    return year_ranges


def block_by_year_ranges_key_dynamic(df, year_block, labels):
    year_ranges = []
    df["year_range"] = pd.cut(df['year'], bins=year_block, labels=labels, right=False)
    publishers = ['sigmod', 'vldb']  # List of publishers to separate data based on

    for label in labels:
        publisher_blocks = []
        intervals = df[df["year_range"] == label]

        for publisher in publishers:
            publisher_block = intervals[intervals["publication_venue"].str.contains(publisher)]
            if publisher_block.size > 0:
                # publisher_block.iterrows() for  all columns?
                publisher_blocks.append(publisher_block)

        year_ranges.extend(publisher_blocks)

    return year_ranges


dblp_csv = '/Users/aliaslan/Downloads/DIA_Exercise/CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
dblp_blocks = block_by_year_ranges(dblp)
print(len(dblp_blocks))
acm_csv = '/Users/aliaslan/Downloads/DIA_Exercise/CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)
acm_block = block_by_year_ranges(acm)

year_block = [1995, 1998, 2001, 2005]
labels = ["1995-1997", "1998-2000", "2001-2004"]
dblp_blocks = block_by_year_ranges_key_dynamic(dblp, year_block, labels)
length = len(dblp_blocks[0]) + len(dblp_blocks[1]) + len(dblp_blocks[2]) + len(dblp_blocks[3]) + len(dblp_blocks[4]) + len(dblp_blocks[5])
print(length)
print(len(dblp_blocks))

3
2153
6


In [8]:
def block_by_year_ranges_key(df):
    year_ranges = []
    year_block = [1995, 1998, 2001, 2005]
    labels = ["1995-1997", "1998-2000", "2001-2004"]

    df["year_range"] = pd.cut(df['year'], bins=year_block, labels=labels, right=False)
    publishers = ['sigmod', 'vldb']  # List of publishers to separate data based on

    for label in labels:
        publisher_blocks = []
        intervals = df[df["year_range"] == label]

        for publisher in publishers:
            publisher_block = intervals[intervals["publication_venue"].str.contains(publisher)]
            if publisher_block.size > 0:
                publisher_blocks.append(publisher_block)

        year_ranges.extend(publisher_blocks)

    return year_ranges
    

dblp = pd.read_csv(dblp_csv)
dblp_blocks = block_by_year_ranges_key(dblp)
length = len(dblp_blocks[0]) + len(dblp_blocks[1]) + len(dblp_blocks[2]) + len(dblp_blocks[3]) + len(dblp_blocks[4]) + len(dblp_blocks[5])


Blocking with Hash - choose columns freely

In [None]:
import hashlib

def hash_blocking(dataframe, blocking_columns):
    dataframe['hash_key'] = dataframe[blocking_columns].astype(str).agg(''.join, axis=1)
    dataframe['hash_value'] = dataframe['hash_key'].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
    blocks = dataframe.groupby('hash_value').apply(lambda x: {'hash_value': x['hash_value'].iloc[0], 'index': x['index'].tolist()}).to_dict()
    return blocks

Blocking by initials combined and the hashed

In [31]:
import hashlib
import pandas as pd



def initial_hash(df, columns):
    df ['publication_venue'] = df['publication_venue'].apply(lambda x: 'sigmod' if 'sigmod' in x else 'vldb')
    blocks = {}
    for index, row in df.iterrows():
        components_values = []

        for column in columns:
            if column == 'author_names':
                author_initials = []
                for name in row['author_names'].split():
                    parts = name.split()
                    if len(parts) > 1:
                        author_initials.append(parts[0][0] + parts[-1][0])
                    else:
                        author_initials.append(name[0])
                component_value = "".join(author_initials)
            elif column == 'paper_title':
                component_value = "".join(word[0] for word in row['paper_title'].split())
            elif column == 'publication_venue':
                component_value = row['publication_venue']
            elif column == 'year':
                component_value = str(row["year"])
            

            components_values.append(component_value)

        blocking_key = ''.join(components_values)
        blocking_key_hash = hashlib.md5(blocking_key.encode()).hexdigest()
        block_data = {'ngram_values': blocking_key_hash, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

Blocking with hash

N-Gram blocking - wähl die Columns freiwillig aus 

In [None]:
import hashlib

def ngrams_tuple(s, n):
    return [tuple(s[i:i+n]) for i in range(len(s)-n+1)]

def hash_ngram_values(ngram_values):
    return hashlib.sha256(str(hash(tuple(sorted(ngram_values.items())))).encode()).hexdigest()

def n_gram_blocking(dataframe, columns, n):
    # n-gram for each selected column 
    for column in columns:
        key_name = f'ngram_key_{column}'
        dataframe[key_name] = dataframe[column].apply(lambda x: ''.join(map(str, x)))

        ngram_col_name = f'ngram_values_{column}'
        dataframe[ngram_col_name] = dataframe[key_name].apply(lambda x: tuple(ngrams_tuple(x, n)))

    # block them and each block consists of n-gram values and the respective index (id) 
    blocks = {}
    for index, row in dataframe.iterrows():
        ngram_values = {column: row[f'ngram_values_{column}'] for column in columns}
        hashable_ngram_values = hash_ngram_values(ngram_values)
        
        if hashable_ngram_values in blocks:
            blocks[hashable_ngram_values]['index'].add(row['index'])
        else:
            blocks[hashable_ngram_values] = {'index': {row['index']}, **ngram_values}

    return blocks

# if you want to try out
""""
selected_columns = ['author_names', 'paper_title']
n_value = 2
result_blocks = n_gram_blocking(dblp, selected_columns, n_value)

for key, block in result_blocks.items():
    print(f"Block key: {key}")
    print(f"Block content: {block}")
    print(f"Values for selected columns: {[block.get(i, '') for i in selected_columns]}")
    print("-----")
"""

Initials als combined string mit n-gram wobei Jahr komplett genommen wird und beim publisher_venue nur das Stichwort sigmod or vldb

In [32]:


def ngrams_string(s, n):
    return [s[i:i+n] for i in range(len(s)-n+1)]

# created this afterwards so we dont need to use the other initial_..._ngram methods (didnt delete the others for know, 
# because I used them for matching can be changed after we decide our baseline)
def initial_ngram(df, n, columns):
    df ['publication_venue'] = df['publication_venue'].apply(lambda x: 'sigmod' if 'sigmod' in x else 'vldb')
    blocks = {}
    for index, row in df.iterrows():
        components_values = []

        for column in columns:
            if column == 'author_names':
                author_initials = []
                for name in row['author_names'].split():
                    parts = name.split()
                    if len(parts) > 1:
                        author_initials.append(parts[0][0] + parts[-1][0])
                    else:
                        author_initials.append(name[0])
                component_value = "".join(author_initials)
            elif column == 'paper_title':
                component_value = "".join(word[0] for word in row['paper_title'].split())
            elif column == 'publication_venue':
                component_value = row['publication_venue']
            elif column == 'year':
                component_value = str(row["year"])
        
            components_values.append(component_value)

        blocking_key = ''.join(components_values)
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

In [12]:
import pandas as pd

def ngrams_string(s, n):
    return [s[i:i+n] for i in range(len(s)-n+1)]

# ap = author + paper title

def initial_ap_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        author_initials = []
        for name in row['author_names'].split():
            parts = name.split()
            if len(parts) > 1:
                author_initials.append(parts[0][0] + parts[-1][0])
            else:
                author_initials.append(name[0])

        combined_author_initials = "".join(author_initials)
        paper_title_initials = "".join(word[0] for word in row['paper_title'].split())

        blocking_key = combined_author_initials + paper_title_initials
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

# author + publication venue
def initial_apv_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        author_initials = []
        for name in row['author_names'].split():
            parts = name.split()
            if len(parts) > 1:
                author_initials.append(parts[0][0] + parts[-1][0])
            else:
                author_initials.append(name[0])

        combined_author_initials = "".join(author_initials)

        venue_key = row["publication_venue"] 

        publisher_keyword = "sigmod" if "sigmod" in venue_key else "vldb"
        blocking_key = combined_author_initials + publisher_keyword
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks


# author + year
def initial_ay_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        author_initials = []
        for name in row['author_names'].split():
            parts = name.split()
            if len(parts) > 1:
                author_initials.append(parts[0][0] + parts[-1][0])
            else:
                author_initials.append(name[0])

        combined_author_initials = "".join(author_initials)
        
        year_key = str(row["year"])
    
        blocking_key = combined_author_initials + year_key
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

def initial_apy_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        author_initials = []
        for name in row['author_names'].split():
            parts = name.split()
            if len(parts) > 1:
                author_initials.append(parts[0][0] + parts[-1][0])
            else:
                author_initials.append(name[0])

        combined_author_initials = "".join(author_initials)
        paper_title_initials = "".join(word[0] for word in row['paper_title'].split())
        year_key = str(row["year"])

        blocking_key = combined_author_initials + paper_title_initials + year_key
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

   

def initial_apvy_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        author_initials = []
        for name in row['author_names'].split():
            parts = name.split()
            if len(parts) > 1:
                author_initials.append(parts[0][0] + parts[-1][0])
            else:
                author_initials.append(name[0])

    
        combined_author_initials = "".join(author_initials)
        year_key = str(row["year"])
        venue_key = row["publication_venue"] 

     
        publisher_keyword = "sigmod" if "sigmod" in venue_key else "vldb"
        blocking_key = combined_author_initials + publisher_keyword + year_key
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

def initial_appvy_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        author_initials = []
        for name in row['author_names'].split():
            parts = name.split()
            if len(parts) > 1:
                author_initials.append(parts[0][0] + parts[-1][0])
            else:
                author_initials.append(name[0])

         
        combined_author_initials = "".join(author_initials)
        paper_title_initials = "".join(word[0] for word in row['paper_title'].split())

        year_key = str(row["year"])
        venue_key = row["publication_venue"] 

        # Directly use "sigmod" or "vldb" in the blocking key
        publisher_keyword = "sigmod" if "sigmod" in venue_key else "vldb"
        blocking_key =  combined_author_initials + paper_title_initials + year_key  + publisher_keyword
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

def initial_py_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        paper_title_initials = "".join(word[0] for word in row['paper_title'].split())
        year_key = str(row["year"])

        blocking_key = paper_title_initials + year_key
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

# pp = paper_tite and publication venue
def initial_ppv_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        paper_title_initials = "".join(word[0] for word in row['paper_title'].split())
               
        venue_key = row["publication_venue"] 

        # Directly use "sigmod" or "vldb" in the blocking key
        publisher_keyword = "sigmod" if "sigmod" in venue_key else "vldb"
        blocking_key = paper_title_initials + publisher_keyword
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks

# like method before but with year
def initial_ppvy_ngram(df, n):
    blocks = {}
    for index, row in df.iterrows():
        paper_title_initials = "".join(word[0] for word in row['paper_title'].split())
               
        year_key = str(row["year"])
        venue_key = row["publication_venue"] 

        # Directly use "sigmod" or "vldb" in the blocking key
        publisher_keyword = "sigmod" if "sigmod" in venue_key else "vldb"
        blocking_key = paper_title_initials + year_key  + publisher_keyword
        blocking_key_ngrams = list(ngrams_string(blocking_key, n))
        block_data = {'ngram_values': blocking_key_ngrams, 'index': [row['index']]}

        if blocking_key in blocks:
            unique_indices = list(set(blocks[blocking_key]['index'] + block_data['index']))
            blocks[blocking_key]['index'] = unique_indices
        else:
            blocks[blocking_key] = block_data

    return blocks


Similarity methods

In [44]:

def apply_similarity(df1, df2, threshold, selected_columns, similarity_function):
    similar_pairs = []

    for index1, row1 in df1.iterrows():
        for index2, row2 in df2.iterrows():
            average_similarity = 0.0

            for col_df1, col_df2 in zip(selected_columns, selected_columns):
                value_df1 = set(row1[col_df1].lower().split())
                value_df2 = set(row2[col_df2].lower().split())

                similarity = similarity_function(value_df1, value_df2)
                average_similarity += similarity

            if len(selected_columns) > 1:
                average_similarity /= len(selected_columns)

            if average_similarity >= threshold:
                pair = (row1[selected_columns].to_dict(), row2[selected_columns].to_dict(), average_similarity)
                similar_pairs.append(pair)

    return similar_pairs



def apply_similarity_blocks(blocks1, blocks2, threshold, similarity_function, indices):
    similar_pairs = []

    for key1, block1 in blocks1.items():
        for key2, block2 in blocks2.items():
            average_similarity = 0.0

            value_block1 = [block1.get(i, '') for i in indices]
            value_block2 = [block2.get(i, '') for i in indices]
            similarity = similarity_function(value_block1, value_block2)

            average_similarity += similarity

            if len(indices) > 1:
                average_similarity /= len(indices)

            if average_similarity > threshold:
                index_pairs = [(i, j) for i in block1['index'] for j in block2['index']]
                similar_pairs.extend(index_pairs)
                
                
    return similar_pairs



In [14]:
# get intersection and union then divide it
#def jaccard_similarity(set1, set2):
#    intersection = len(set1 & set2)
#    union = len(set1 | set2)
#    return intersection / union if union > 0 else 0


def jaccard_similarity(set1, set2):
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)
    
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    
    return intersection / union if union > 0 else 0

In [15]:
import numpy as np
def levensthein_distance(set1, set2):
    # make the subcriptable 
    list1, list2 = list(set1), list(set2)
    m, n = len(list1), len(list2)

    # create matrix
    matrix = np.zeros((m + 1, n + 1))

    # fill row with length of string 
    for i in range(1, m + 1):
        matrix[i][0] = i

    # fill column with lenth of string 
    for j in range(1, n + 1):
        matrix[0][j] = j
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if list1[i - 1] ==  list2[j - 1]:
                cost = 0
            else:
                cost = 1
            
            matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost)
    similarity = 1 - (matrix[m][n] / (m + n))

    return similarity

In [16]:
# get n_gram
def get_n_grams(string, n):
    characters = list(string.lower())
    n_grams = [tuple(characters[i:i+n]) for i in range(len(characters) - (n-1))]
    return n_grams

# transform respective columns to n_gram
def transform_columns_to_n_gram(df, n, columns):
    for column in columns:
        df[f'{column}'] = df[column].apply(lambda x: get_n_grams(str(x), n))
    return df


# as in lecture
"""ArithmeticErrordef n_gram_similarity(df1, df2):
    df1, df2 = set(df1), set(df2)
    intersection = len(df1) & len(df2)
    return 2 * intersection / len(df1) + len(df2)"""

def n_gram_similarity(df1, df2):
    set_df1 = {item for sublist in df1 for item in sublist}
    set_df2 = {item for sublist in df2 for item in sublist}
    intersection = len(set_df1 & set_df2)
    return 2 * intersection / (len(set_df1) + len(set_df2))

Matching of all initial with n_gram with two different n and treshholds!

In [17]:
import csv

def similar_pairs_to_csv(similar_pairs, output_csv_file):
    header = ['dblp_index', 'acm_index']
    with open(output_csv_file, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for pair in similar_pairs:
            writer.writerow(pair)


In [41]:
dblp_csv = 'CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
acm_csv = 'CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)

threshold = 0.7
n = 2
selected_indices = ['ngram_values']
selected_columns = ['author_names', 'paper_title']


# damit können wir alle anderen matching aufrufe mit initial_..._ngram ersetzen (kann man später machen!!!
db = initial_ngram(dblp, n, selected_columns)
am = initial_ngram(acm, n, selected_columns)
similairty =apply_similarity_blocks(db, am, threshold, n_gram_similarity, selected_indices)
print(len(similairty))

# Ergebnis ist gleich wie beim unteren sim_ap funktioniert

1775


In [45]:
dblp_csv = 'CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
acm_csv = 'CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)

threshold = 0.7

selected_indices = ['ngram_values']
selected_columns = ['author_names', 'paper_title']

db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_ap =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_ap))

selected_columns = ['author_names', 'paper_title','year']
db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_apy =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_apy))

selected_columns = ['author_names', 'paper_title', 'publication_venue']
db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_appv =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_appv))

selected_columns = ['author_names', 'publication_venue','year']
db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_apvy =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_apvy))

selected_columns = ['author_names', 'paper_title', 'publication_venue','year']
db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_appvy =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_appvy))


selected_columns = ['paper_title', 'publication_venue']
db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_ppv =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_appv))

selected_columns = ['paper_title', 'publication_venue','year']
db = initial_hash(dblp, selected_columns)
am = initial_hash(acm, selected_columns)
similairty_ppvy =apply_similarity_blocks(db, am, threshold, jaccard_similarity, selected_indices)
print(len(similairty_ppvy))


1235
1210
1234
1536
1210
1234
1554


In [46]:
dblp_csv = 'CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
acm_csv = 'CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)


threshold = 0.7
n = 2
selected_indices = ['ngram_values']

dblp_ap = initial_ap_ngram(dblp, n)
acm_ap = initial_ap_ngram(acm, n)
sim_ap = apply_similarity_blocks(dblp_ap, acm_ap, threshold, n_gram_similarity, selected_indices)
print(len(sim_ap))
similar_pairs_to_csv(sim_ap, 'similarity_th_07/sim_ap.csv')

dblp_apv = initial_apv_ngram(dblp, n)
acm_apv = initial_apv_ngram(acm, n)
sim_apv = apply_similarity_blocks(dblp_apv, acm_apv, threshold, n_gram_similarity, selected_indices)
print(len(sim_apv))
similar_pairs_to_csv(sim_apv, 'similarity_th_07/sim_apv.csv')

dblp_ay = initial_ay_ngram(dblp, n)
acm_ay = initial_ay_ngram(acm, n)
sim_ay = apply_similarity_blocks(dblp_ay, acm_ay, threshold, n_gram_similarity, selected_indices)
print(len(sim_ay))
similar_pairs_to_csv(sim_ay, 'similarity_th_07/sim_ay.csv')

dblp_apy = initial_apy_ngram(dblp, n)
acm_apy = initial_apy_ngram(acm, n)
sim_apy = apply_similarity_blocks(dblp_apy, acm_apy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apy))
similar_pairs_to_csv(sim_apy, 'similarity_th_07/sim_apy.csv')


dblp_apvy = initial_apvy_ngram(dblp, n)
acm_apvy = initial_apvy_ngram(acm, n)
sim_apvy = apply_similarity_blocks(dblp_apvy, acm_apvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apvy))
similar_pairs_to_csv(sim_apvy, 'similarity_th_07/sim_apvy.csv')

dblp_appvy = initial_appvy_ngram(dblp, n)
acm_appvy = initial_appvy_ngram(acm, n)
sim_appvy = apply_similarity_blocks(dblp_appvy, acm_appvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_appvy))
similar_pairs_to_csv(sim_appvy, 'similarity_th_07/sim_appvy.csv')

dblp_py = initial_py_ngram(dblp, n)
acm_py = initial_py_ngram(acm, n)
sim_py = apply_similarity_blocks(dblp_py, acm_py, threshold, n_gram_similarity, selected_indices)
print(len(sim_py))
similar_pairs_to_csv(sim_py, 'similarity_th_07/sim_py.csv')

dblp_ppv = initial_ppv_ngram(dblp, n)
acm_ppv = initial_ppv_ngram(acm, n)
sim_ppv = apply_similarity_blocks(dblp_ppv, acm_ppv, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppv))
similar_pairs_to_csv(sim_ppv, 'similarity_th_07/sim_ppv.csv')

dblp_ppvy = initial_ppvy_ngram(dblp, n)
acm_ppvy = initial_ppvy_ngram(acm, n)
sim_ppvy = apply_similarity_blocks(dblp_ppvy, acm_ppvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppvy))
similar_pairs_to_csv(sim_ppvy, 'similarity_th_07/sim_ppvy.csv')

1775
121667
6195
1801
128842
2016
2243
11216
11701


In [None]:
threshold = 0.9
n = 2
selected_indices = ['ngram_values']

dblp_ap = initial_ap_ngram(dblp, n)
acm_ap = initial_ap_ngram(acm, n)
sim_ap2 = apply_similarity_blocks(dblp_ap, acm_ap, threshold, n_gram_similarity, selected_indices)
print(len(sim_ap2))
similar_pairs_to_csv(sim_ap2, 'similarity_th_09/sim_ap.csv')

dblp_apv = initial_apv_ngram(dblp, n)
acm_apv = initial_apv_ngram(acm, n)
sim_apv2 = apply_similarity_blocks(dblp_apv, acm_apv, threshold, n_gram_similarity, selected_indices)
print(len(sim_apv2))
similar_pairs_to_csv(sim_apv2, 'similarity_th_09/sim_apv.csv')

dblp_ay = initial_ay_ngram(dblp, n)
acm_ay = initial_ay_ngram(acm, n)
sim_ay2 = apply_similarity_blocks(dblp_ay, acm_ay, threshold, n_gram_similarity, selected_indices)
print(len(sim_ay2))
similar_pairs_to_csv(sim_ay2, 'similarity_th_09/sim_ay.csv')

dblp_apy = initial_apy_ngram(dblp, n)
acm_apy = initial_apy_ngram(acm, n)
sim_apy2 = apply_similarity_blocks(dblp_apy, acm_apy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apy2))
similar_pairs_to_csv(sim_apy2, 'similarity_th_09/sim_apy.csv')

dblp_apvy = initial_apvy_ngram(dblp, n)
acm_apvy = initial_apvy_ngram(acm, n)
sim_apvy2 = apply_similarity_blocks(dblp_apvy, acm_apvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apvy2))
similar_pairs_to_csv(sim_apvy2, 'similarity_th_09/sim_apvy.csv')

dblp_appvy = initial_appvy_ngram(dblp, n)
acm_appvy = initial_appvy_ngram(acm, n)
sim_appvy2 = apply_similarity_blocks(dblp_appvy, acm_appvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_appvy2))
similar_pairs_to_csv(sim_appvy2, 'similarity_th_09/sim_appvy.csv')

dblp_py = initial_py_ngram(dblp, n)
acm_py = initial_py_ngram(acm, n)
sim_py2 = apply_similarity_blocks(dblp_py, acm_py, threshold, n_gram_similarity, selected_indices)
print(len(sim_py2))
similar_pairs_to_csv(sim_py2, 'similarity_th_09/sim_py.csv')

dblp_ppv = initial_ppv_ngram(dblp, n)
acm_ppv = initial_ppv_ngram(acm, n)
sim_ppv2 = apply_similarity_blocks(dblp_ppv, acm_ppv, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppv2))
similar_pairs_to_csv(sim_ppv2, 'similarity_th_09/sim_ppv.csv')

dblp_ppvy = initial_ppvy_ngram(dblp, n)
acm_ppvy = initial_ppvy_ngram(acm, n)
sim_ppvy2 = apply_similarity_blocks(dblp_ppvy, acm_ppvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppvy2))
similar_pairs_to_csv(sim_ppvy2, 'similarity_th_09/sim_ppvy.csv')

1405
3505
1895
1456
2896
1540
1637
1807
1664


In [None]:
threshold = 0.7
n = 3
selected_indices = ['ngram_values']

dblp_ap = initial_ap_ngram(dblp, n)
acm_ap = initial_ap_ngram(acm, n)
sim_ap3 = apply_similarity_blocks(dblp_ap, acm_ap, threshold, n_gram_similarity, selected_indices)
print(len(sim_ap3))
similar_pairs_to_csv(sim_ap3, 'similarity_th_07_03/sim_ap.csv')

dblp_apv = initial_apv_ngram(dblp, n)
acm_apv = initial_apv_ngram(acm, n)
sim_apv3 = apply_similarity_blocks(dblp_apv, acm_apv, threshold, n_gram_similarity, selected_indices)
print(len(sim_apv3))
similar_pairs_to_csv(sim_apv3, 'similarity_th_07_03/sim_apv.csv')

dblp_ay = initial_ay_ngram(dblp, n)
acm_ay = initial_ay_ngram(acm, n)
sim_ay3 = apply_similarity_blocks(dblp_ay, acm_ay, threshold, n_gram_similarity, selected_indices)
print(len(sim_ay3))
similar_pairs_to_csv(sim_ay3, 'similarity_th_07_03/sim_ay.csv')

dblp_apy = initial_apy_ngram(dblp, n)
acm_apy = initial_apy_ngram(acm, n)
sim_apy3 = apply_similarity_blocks(dblp_apy, acm_apy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apy3))
similar_pairs_to_csv(sim_apy3, 'similarity_th_07_03/sim_apy.csv')

dblp_apvy = initial_apvy_ngram(dblp, n)
acm_apvy = initial_apvy_ngram(acm, n)
sim_apvy3 = apply_similarity_blocks(dblp_apvy, acm_apvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apvy3))
similar_pairs_to_csv(sim_apvy3, 'similarity_th_07_03/sim_apvy.csv')

dblp_appvy = initial_appvy_ngram(dblp, n)
acm_appvy = initial_appvy_ngram(acm, n)
sim_appvy3 = apply_similarity_blocks(dblp_appvy, acm_appvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_appvy3))
similar_pairs_to_csv(sim_appvy3, 'similarity_th_07_03/sim_appvy.csv')

dblp_py = initial_py_ngram(dblp, n)
acm_py = initial_py_ngram(acm, n)
sim_py3 = apply_similarity_blocks(dblp_py, acm_py, threshold, n_gram_similarity, selected_indices)
print(len(sim_py3))
similar_pairs_to_csv(sim_py3, 'similarity_th_07_03/sim_py.csv')

dblp_ppv = initial_ppv_ngram(dblp, n)
acm_ppv = initial_ppv_ngram(acm, n)
sim_ppv3 = apply_similarity_blocks(dblp_ppv, acm_ppv, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppv3))
similar_pairs_to_csv(sim_ppv3, 'similarity_th_07_03/sim_ppv.csv')

dblp_ppvy = initial_ppvy_ngram(dblp, n)
acm_ppvy = initial_ppvy_ngram(acm, n)
sim_ppvy3 = apply_similarity_blocks(dblp_ppvy, acm_ppvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppvy3))
similar_pairs_to_csv(sim_ppvy3, 'similarity_th_07_03/sim_ppvy.csv')

1620
29923
3897
1670
50182
1771
1917
3927
5339


In [None]:
threshold = 0.9
n = 3
selected_indices = ['ngram_values']

dblp_ap = initial_ap_ngram(dblp, n)
acm_ap = initial_ap_ngram(acm, n)
sim_ap4 = apply_similarity_blocks(dblp_ap, acm_ap, threshold, n_gram_similarity, selected_indices)
print(len(sim_ap4))
similar_pairs_to_csv(sim_ap4, 'similarity_th_09_03/sim_ap.csv')

dblp_apv = initial_apv_ngram(dblp, n)
acm_apv = initial_apv_ngram(acm, n)
sim_apv4 = apply_similarity_blocks(dblp_apv, acm_apv, threshold, n_gram_similarity, selected_indices)
print(len(sim_apv4))
similar_pairs_to_csv(sim_apv4, 'similarity_th_09_03/sim_apv.csv')

dblp_ay = initial_ay_ngram(dblp, n)
acm_ay = initial_ay_ngram(acm, n)
sim_ay4 = apply_similarity_blocks(dblp_ay, acm_ay, threshold, n_gram_similarity, selected_indices)
print(len(sim_ay4))
similar_pairs_to_csv(sim_ay4, 'similarity_th_09_03/sim_ay.csv')

dblp_apy = initial_apy_ngram(dblp, n)
acm_apy = initial_apy_ngram(acm, n)
sim_apy4 = apply_similarity_blocks(dblp_apy, acm_apy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apy4))
similar_pairs_to_csv(sim_apy4, 'similarity_th_09_03/sim_apy.csv')

dblp_apvy = initial_apvy_ngram(dblp, n)
acm_apvy = initial_apvy_ngram(acm, n)
sim_apvy4 = apply_similarity_blocks(dblp_apvy, acm_apvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_apvy4))
similar_pairs_to_csv(sim_apvy4, 'similarity_th_09_03/sim_apvy.csv')

dblp_appvy = initial_appvy_ngram(dblp, n)
acm_appvy = initial_appvy_ngram(acm, n)
sim_appvy4 = apply_similarity_blocks(dblp_appvy, acm_appvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_appvy4))
similar_pairs_to_csv(sim_appvy4, 'similarity_th_09_03/sim_appvy.csv')

dblp_py = initial_py_ngram(dblp, n)
acm_py = initial_py_ngram(acm, n)
sim_py4 = apply_similarity_blocks(dblp_py, acm_py, threshold, n_gram_similarity, selected_indices)
print(len(sim_py4))
similar_pairs_to_csv(sim_py4, 'similarity_th_09_03/sim_py.csv')

dblp_ppv = initial_ppv_ngram(dblp, n)
acm_ppv = initial_ppv_ngram(acm, n)
sim_ppv4 = apply_similarity_blocks(dblp_ppv, acm_ppv, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppv4))
similar_pairs_to_csv(sim_ppv4, 'similarity_th_09_03/sim_ppv.csv')

dblp_ppvy = initial_ppvy_ngram(dblp, n)
acm_ppvy = initial_ppvy_ngram(acm, n)
sim_ppvy4 = apply_similarity_blocks(dblp_ppvy, acm_ppvy, threshold, n_gram_similarity, selected_indices)
print(len(sim_ppvy4))
similar_pairs_to_csv(sim_ppvy4, 'similarity_th_09_03/sim_ppvy.csv')

1291
3312
1685
1279
1873
1345
1592
1754
1598


Bemerkung: Wir haben einige Outliner mit Matches mit einer Anzahl von ca 30k-120k Paaren haben. Die Ursache hierbei liegt beim Prozess und die Werte: Beispielsweise nehmen wir bei den initialen Methoden das ganze Jahr und für publication venue extrahieren wir sigmod oder vdlb. Da es beim venue nur die zwei Möglichkeiten gibt kriegen wir für die Ähnlichkeit entweder 1 (identisch) oder x und beim Jahr haben wir 1995-2004 bedeutet bei identischen 1, bei ähnlichen 1995-1999 oder 2000-2004 kriegt man ein hohen Wert.
Natürlich werden die Werte nicht einzeln, sondern als kombinierter String verglichen, aber wenn man nicht genügend andere Werte oder lange = (Anzahl an Characters) Werte hat wie paper_title, dann wird es eben als sehr ähnlich gewertet.

Matching with hashblock

In [None]:
dblp_csv = 'CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
acm_csv = 'CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)

# make it so it either sigmod or vdlb (could have been done for initial_ngrams too but to lazy to change now)
dblp['publication_venue'] = dblp['publication_venue'].apply(lambda x: 'sigmod' if 'sigmod' in x else 'vldb')
acm['publication_venue'] = acm['publication_venue'].apply(lambda x: 'sigmod' if 'sigmod' in x else 'vldb')

threshold = 0.65
indices = ['hash_value']
set_indices = set(indices)

blocking_columns = ['author_names','paper_title']
dblp_hap = hash_blocking(dblp, blocking_columns)
acm_hap = hash_blocking(acm,blocking_columns)
sim_hap = apply_similarity_blocks(dblp_hap, acm_hap, threshold, jaccard_similarity, set_indices)
print(len(sim_ap))
similar_pairs_to_csv(sim_hap, 'similarity_hash_0.65/sim.hap.csv')

blocking_columns = ['author_names','publication_venue']
dblp_hapv = hash_blocking(dblp, blocking_columns)
acm_hapv = hash_blocking(acm,blocking_columns)
sim_hapv = apply_similarity_blocks(dblp_hapv, acm_hapv, threshold, jaccard_similarity, set_indices)
print(len(sim_hapv))
similar_pairs_to_csv(sim_hapv, 'similarity_hash_0.65/sim.hapv.csv')

blocking_columns = ['author_names','year']
dblp_hay = hash_blocking(dblp, blocking_columns)
acm_hay = hash_blocking(acm,blocking_columns)
sim_hay = apply_similarity_blocks(dblp_hay, acm_hay, threshold, jaccard_similarity, set_indices)
print(len(sim_hay))
similar_pairs_to_csv(sim_hay, 'similarity_hash_0.65/sim.hay.csv')

blocking_columns = ['author_names','paper_title', 'publication_venue']
dblp_happv = hash_blocking(dblp, blocking_columns)
acm_happv = hash_blocking(acm,blocking_columns)
sim_happv = apply_similarity_blocks(dblp_happv, acm_happv, threshold, jaccard_similarity, set_indices)
print(len(sim_happv))
similar_pairs_to_csv(sim_happv, 'similarity_hash_0.65/sim.happv.csv')

blocking_columns = ['author_names','paper_title', 'year']
dblp_hapy = hash_blocking(dblp, blocking_columns)
acm_hapy = hash_blocking(acm,blocking_columns)
sim_hapy = apply_similarity_blocks(dblp_hapy, acm_hapy, threshold, jaccard_similarity, set_indices)
print(len(sim_hapy))
similar_pairs_to_csv(sim_hapy, 'similarity_hash_0.65/sim.hapy.csv')

blocking_columns = ['author_names', 'publication_venue','year']
dblp_hapvy = hash_blocking(dblp, blocking_columns)
acm_hapvy = hash_blocking(acm,blocking_columns)
sim_hapvy = apply_similarity_blocks(dblp_hapvy, acm_hapvy, threshold, jaccard_similarity, set_indices)
print(len(sim_hapvy))
similar_pairs_to_csv(sim_hapvy, 'similarity_hash_0.65/sim.hapvy.csv')

blocking_columns = ['author_names','paper_title', 'publication_venue','year']
dblp_happy = hash_blocking(dblp, blocking_columns)
acm_happy = hash_blocking(acm,blocking_columns)
sim_happy = apply_similarity_blocks(dblp_happy, acm_happy, threshold, jaccard_similarity, set_indices)
print(len(sim_happy))
similar_pairs_to_csv(sim_happy, 'similarity_hash_0.65/sim.happy.csv')

blocking_columns = ['paper_title', 'publication_venue']
dblp_hppv = hash_blocking(dblp, blocking_columns)
acm_hppv = hash_blocking(acm,blocking_columns)
sim_hppv = apply_similarity_blocks(dblp_hppv, acm_hppv, threshold, jaccard_similarity, set_indices)
print(len(sim_hppv))
similar_pairs_to_csv(sim_hppv, 'similarity_hash_0.65/sim.hppv.csv')

blocking_columns = ['paper_title', 'year']
dblp_hpy = hash_blocking(dblp, blocking_columns)
acm_hpy = hash_blocking(acm,blocking_columns)
sim_hpy = apply_similarity_blocks(dblp_hpy, acm_hpy, threshold, jaccard_similarity, set_indices)
print(len(sim_hpy))
similar_pairs_to_csv(sim_hpy, 'similarity_hash_0.65/sim.hppvy.csv')

blocking_columns = ['paper_title', 'publication_venue','year']
dblp_hppvy = hash_blocking(dblp, blocking_columns)
acm_hppvy = hash_blocking(acm,blocking_columns)
sim_hppvy = apply_similarity_blocks(dblp_hppvy, acm_hppvy, threshold, jaccard_similarity, set_indices)
print(len(sim_hppvy))
similar_pairs_to_csv(sim_hppvy, 'similarity_hash_0.65/sim.hppvy.csv')


1730
1116
1172
1056
1057
1184
1058
1480
1489
1490


In [None]:
import hashlib

def ngrams_tuple(s, n):
    return [tuple(s[i:i+n]) for i in range(len(s)-n+1)]

def hash_ngram_values(ngram_values):
    return hashlib.sha256(str(hash(tuple(sorted(ngram_values.items())))).encode()).hexdigest()

def n_gram_blocking(dataframe, columns, n):
    # n-gram for each selected column 
    for column in columns:
        key_name = f'ngram_key_{column}'
        dataframe[key_name] = dataframe[column].apply(lambda x: ''.join(map(str, x)))

        ngram_col_name = f'ngram_values_{column}'
        dataframe[ngram_col_name] = dataframe[key_name].apply(lambda x: tuple(ngrams_tuple(x, n)))

    # block them and each block consists of n-gram values and the respective index (id) 
    blocks = {}
    for index, row in dataframe.iterrows():
        ngram_values = {column: row[f'ngram_values_{column}'] for column in columns}
        hashable_ngram_values = hash_ngram_values(ngram_values)
        
        if hashable_ngram_values in blocks:
            blocks[hashable_ngram_values]['index'].add(row['index'])
        else:
            blocks[hashable_ngram_values] = {'index': {row['index']}, **ngram_values}

    return blocks


dblp_csv = 'CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
acm_csv = 'CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)




In [None]:
dblp_csv = 'CSV-files/dblp.csv'
dblp = pd.read_csv(dblp_csv)
acm_csv = 'CSV-files/acm.csv'
acm = pd.read_csv(acm_csv)

n = 2
selected_columns = ['author_names', 'paper_title']

# bei 0.5 = 0 gemeinsame paare und probiere nur dieses Beispiel, weil es lange dauert
threshold = 0.3

db = n_gram_blocking(dblp, selected_columns,n)
ac = n_gram_blocking(acm, selected_columns, n)
ap_sim = apply_similarity_blocks(db, ac, threshold, n_gram_similarity, selected_columns)
print(len(ap_sim))

selected_columns = ['paper_title']


threshold = 0.7
sim = apply_similarity_blocks(db, ac, threshold, n_gram_similarity, selected_columns)
print(len(sim))

3664
2463


Die Methode braucht nur die jeweiligen Match methoden: Verwende eins als Baseline und das andere als Comparison. Müssten uns für eine Match methode entscheiden lass z.B sagen similarity_appvy (a = author, p = paper_title, pv= publication_venue, y = year) als Baseline und vergleichbare Matching-Methoden (alle anderen initials similarities) evaluieren. Natürlich kann man auch die gleiche Methode, aber mit anderen n (n-gram) wert und treshold vergleichen. Müssten uns für eine Baseline entscheiden?

In [None]:
def evaluate_similarity(baseline, comparison):
    baseline_set, comparison_set = set(baseline), set(comparison)

    tp = len(baseline_set.intersection(comparison_set))
    fp = len(comparison_set - baseline_set)
    fn = len(baseline_set - comparison_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


    return precision, recall, f_measure

print(evaluate_similarity(sim_appvy, sim_ap))



(0.9779785431959345, 0.8686058174523571, 0.9200531208499335)


ToDo = - Eventuell weitere Blocking Methoden
       - kombinieren mit block ranges, welches Dataframe nach Jahren und publisher aufteilt in blöcke und dann jedes der Blöcke zusätlich mit einem anderen blocking oder direkt vergelichen
       - Aufgabe 2.3) die gewählte Baseline als CSV und kmeans darauf !