In [1]:
import pandas as pd
import sys
import re, unicodedata
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

Functions

In [2]:
def explore_df(name, df, max_rows=5):
    print(f"\n{name} - DATA EXPLORATION")
    print("=" * 60)

    # basic shape
    print("Shape (rows, cols):", df.shape)

    # columns + dtypes
    print("\nColumns & dtypes:")
    for c in df.columns:
        print(f"  - {c:15s}  {str(df[c].dtype)}")

    # missing values
    na_counts = df.isna().sum().sort_values(ascending=False)
    na_counts = na_counts[na_counts > 0]
    if len(na_counts) > 0:
        print("\nMissing values (top):")
        print(na_counts.head(10).to_string())
    else:
        print("\nMissing values: none")

    # duplicates by id (if present)
    if "id" in df.columns:
        dup_ids = df["id"].duplicated().sum()
        print("\nDuplicate IDs:", dup_ids)

    # show a few rows (selected key columns if present)
    key_cols = [c for c in ["id", "title", "authors", "venue", "year"] if c in df.columns]
    print("\nSample rows:")
    if key_cols:
        display(df[key_cols].head(max_rows))
    else:
        display(df.head(max_rows))

In [3]:
def quick_value_checks(name, df):
    print(f"\n{name} - QUICK VALUE CHECKS")
    print("-" * 60)

    for col in ["title", "authors", "year"]:
        if col in df.columns:
            non_empty = (df[col].astype(str).str.strip() != "").sum()
            print(f"{col}: non-empty = {non_empty}/{len(df)}")

    if "year" in df.columns:
        # show unique year examples
        sample_years = df["year"].dropna().astype(str).unique()[:10]
        print("Year examples:", sample_years)

In [4]:
def clean_colname(c: str) -> str:
    c = str(c).strip().lower()
    c = c.replace("\ufeff", "")          # BOM
    c = c.replace('"', "").replace("'", "")
    c = re.sub(r"\s+", "", c)
    return c

In [5]:
def normalize_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[^\w\s]", " ", text)   # removes punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [6]:
def tokenize_title(title):
    return set(normalize_text(title).split())

In [7]:
# IMPORTANT: split authors BEFORE normalization (otherwise commas get removed)
def extract_author_lastnames(authors):
    if pd.isna(authors):
        return set()
    author_list = re.split(r",|\band\b", str(authors))  # split raw string
    lastnames = set()
    for a in author_list:
        a = normalize_text(a).strip()
        if not a:
            continue
        parts = a.split()
        lastnames.add(parts[-1])
    return lastnames

In [8]:
def parse_year(y):
    try:
        return int(y)
    except:
        return None

In [9]:
def jaccard(a, b):
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

In [10]:
def preprocess(df):
    df = df.copy()
    df["title_norm"] = df["title"].apply(normalize_text)
    df["title_tokens"] = df["title"].apply(tokenize_title)
    df["author_lastnames"] = df["authors"].apply(extract_author_lastnames)
    df["year_parsed"] = df["year"].apply(parse_year)
    return df

In [11]:
def make_title_prefix(tokens, k=3):
    toks = sorted(tokens)
    return " ".join(toks[:k]) if toks else ""

In [12]:
def deduplicate(df):
    df = df.copy()
    df["dedup_block"] = df["title_tokens"].apply(lambda t: make_title_prefix(t, k=3))

    visited = set()
    groups = []
    blocks = defaultdict(list)

    # block indices by prefix
    for _, r in df.iterrows():
        blocks[r["dedup_block"]].append(r["id"])

    # for quick lookup by id
    row_by_id = df.set_index("id")

    for block_ids in blocks.values():
        for i in block_ids:
            if i in visited:
                continue

            group = [i]
            visited.add(i)
            r1 = row_by_id.loc[i]

            for j in block_ids:
                if j in visited:
                    continue
                r2 = row_by_id.loc[j]

                title_sim = jaccard(r1["title_tokens"], r2["title_tokens"])
                author_overlap = len(r1["author_lastnames"] & r2["author_lastnames"]) > 0

                year_ok = (
                    r1["year_parsed"] is None or r2["year_parsed"] is None or
                    abs(r1["year_parsed"] - r2["year_parsed"]) <= 1
                )

                # STRICT or RELAXED
                if (title_sim >= 0.95 and author_overlap) or (title_sim >= 0.85 and author_overlap and year_ok):
                    group.append(j)
                    visited.add(j)

            groups.append(group)

    return groups

In [13]:
def build_mapping(groups):
    m = {}
    for g in groups:
        canon = g[0]
        for rid in g:
            m[rid] = canon
    return m

In [14]:
def unique_df(df, mapping):
    # keep first occurrence of each canonical id (stable)
    df2 = df.copy()
    df2["canon_id"] = df2["id"].map(mapping)
    df2 = df2.sort_values("id")
    return df2.drop_duplicates("canon_id", keep="first")

In [15]:
def report_dedup(name, df, groups, unique_df):
    dup_groups = sum(1 for g in groups if len(g) > 1)
    print(f"\n{name} 1.1 REPORT")
    print("-"*35)
    print("Input records:", len(df))
    print("Duplicate groups:", dup_groups)
    print("Final unique records:", len(unique_df))

Execution

In [16]:
# ---------- Load ----------
def load_csv(path):
    try:
        return pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin-1")

dblp = load_csv("DBLP1.csv")
scholar = load_csv("Scholar.csv")
gt = load_csv("DBLP-Scholar_perfectMapping.csv")

In [17]:
# run exploration
explore_df("DBLP", dblp)
explore_df("Scholar", scholar)
explore_df("Ground Truth", gt)


DBLP - DATA EXPLORATION
Shape (rows, cols): (2616, 5)

Columns & dtypes:
  - id               object
  - title            object
  - authors          object
  - venue            object
  - year             int64

Missing values (top):
authors    218
venue      211

Duplicate IDs: 0

Sample rows:


Unnamed: 0,id,title,authors,venue,year
0,conf/vldb/RusinkiewiczKTWM95,Towards a Cooperative Transaction Model - The ...,"M Rusinkiewicz, W Klas, T Tesch, J Wäsch, P Muth",VLDB,1995
1,journals/sigmod/EisenbergM02,SQL/XML is Making Good Progress,"A Eisenberg, J Melton",SIGMOD Record,2002
2,conf/vldb/AmmannJR95,Using Formal Methods to Reason about Semantics...,"P Ammann, S Jajodia, I Ray",VLDB,1995
3,journals/sigmod/Liu02,Editor's Notes,L Liu,SIGMOD Record,2002
4,journals/sigmod/Hammer02,Report on the ACM Fourth International Worksho...,,,2002



Scholar - DATA EXPLORATION
Shape (rows, cols): (64263, 5)

Columns & dtypes:
  - id               object
  - title            object
  - authors          object
  - venue            object
  - year             float64

Missing values (top):
year       34790
venue      14997
authors        1

Duplicate IDs: 0

Sample rows:


Unnamed: 0,id,title,authors,venue,year
0,aKcZKwvwbQwJ,11578 Sorrento Valley Road,QD Inc,"San Diego,",
1,ixKfiTHoaDoJ,Initiation of crazes in polystyrene,"AS Argon, JG Hannoosh","Phil. Mag,",
2,3BxllB4wwcIJ,Immunogold labelling is a quantitative method ...,"GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n","The Histochemical Journal,",1992.0
3,d2WWxwKMex4J,The Burden of Infectious Disease Among Inmates...,"TM Hammett, P Harmon, W Rhodes",see,
4,cZCX-AQpjccJ,The Role of Faculty Advising in Science and En...,JR Cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995.0



Ground Truth - DATA EXPLORATION
Shape (rows, cols): (5347, 2)

Columns & dtypes:
  - idDBLP           object
  - idScholar        object

Missing values: none

Sample rows:


Unnamed: 0,idDBLP,idScholar
0,conf/sigmod/AbadiC02,f2Lea-RN8dsJ
1,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,eBnT7lhV2LwJ
2,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,gBVNSFeS4P8J
3,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,VuY9Y49GqXgJ
4,conf/sigmod/AbiteboulBCMM03,AxpQwgyRyLgJ


In [18]:

quick_value_checks("DBLP", dblp)
quick_value_checks("Scholar", scholar)


DBLP - QUICK VALUE CHECKS
------------------------------------------------------------
title: non-empty = 2616/2616
authors: non-empty = 2616/2616
year: non-empty = 2616/2616
Year examples: ['1995' '2002' '1994' '2001' '2003' '1999' '2000' '1998' '1997' '1996']

Scholar - QUICK VALUE CHECKS
------------------------------------------------------------
title: non-empty = 64263/64263
authors: non-empty = 64263/64263
year: non-empty = 64263/64263
Year examples: ['1992.0' '1995.0' '2002.0' '1987.0' '1996.0' '1998.0' '1997.0' '1999.0'
 '2001.0' '2000.0']


In [19]:
dblp.columns = [clean_colname(c) for c in dblp.columns]
scholar.columns = [clean_colname(c) for c in scholar.columns]
gt.columns = [clean_colname(c) for c in gt.columns]

# ensure id column exists
if "id" not in dblp.columns:
    dblp["id"] = dblp.index.astype(int)
if "id" not in scholar.columns:
    scholar["id"] = scholar.index.astype(int)

print("DBLP cols:", dblp.columns.tolist())
print("Scholar cols:", scholar.columns.tolist())
print("GT cols:", gt.columns.tolist())

DBLP cols: ['id', 'title', 'authors', 'venue', 'year']
Scholar cols: ['id', 'title', 'authors', 'venue', 'year']
GT cols: ['iddblp', 'idscholar']


In [20]:
dblp_p = preprocess(dblp)
dblp_p

Unnamed: 0,id,title,authors,venue,year,title_norm,title_tokens,author_lastnames,year_parsed
0,conf/vldb/RusinkiewiczKTWM95,Towards a Cooperative Transaction Model - The ...,"M Rusinkiewicz, W Klas, T Tesch, J Wäsch, P Muth",VLDB,1995,towards a cooperative transaction model the co...,"{activity, towards, a, transaction, cooperativ...","{sch, tesch, muth, klas, rusinkiewicz}",1995
1,journals/sigmod/EisenbergM02,SQL/XML is Making Good Progress,"A Eisenberg, J Melton",SIGMOD Record,2002,sql xml is making good progress,"{good, xml, sql, is, making, progress}","{melton, eisenberg}",2002
2,conf/vldb/AmmannJR95,Using Formal Methods to Reason about Semantics...,"P Ammann, S Jajodia, I Ray",VLDB,1995,using formal methods to reason about semantics...,"{methods, about, decompositions, of, reason, s...","{ray, ammann, jajodia}",1995
3,journals/sigmod/Liu02,Editor's Notes,L Liu,SIGMOD Record,2002,editor s notes,"{s, editor, notes}",{liu},2002
4,journals/sigmod/Hammer02,Report on the ACM Fourth International Worksho...,,,2002,report on the acm fourth international worksho...,"{workshop, warehousing, fourth, dolap, acm, 20...",{},2002
...,...,...,...,...,...,...,...,...,...
2611,conf/vldb/ShuklaDNR96,Storage Estimation for Multidimensional Aggreg...,"A Shukla, P Deshpande, J Naughton, K Ramasamy",VLDB,1996,storage estimation for multidimensional aggreg...,"{multidimensional, estimation, of, storage, pr...","{shukla, ramasamy, deshpande, naughton}",1996
2612,journals/sigmod/Aberer03,Call for Book Reviews,,,2003,call for book reviews,"{book, reviews, for, call}",{},2003
2613,conf/vldb/RamakrishnanR96,Modeling Design Versions,"R Ramakrishnan, D Ram",VLDB,1996,modeling design versions,"{modeling, versions, design}","{ramakrishnan, ram}",1996
2614,conf/vldb/ShaferAM96,SPRINT: A Scalable Parallel Classifier for Dat...,"J Shafer, R Agrawal, M Mehta",VLDB,1996,sprint a scalable parallel classifier for data...,"{classifier, a, parallel, data, scalable, spri...","{shafer, agrawal, mehta}",1996


In [21]:
scholar_p = preprocess(scholar)
scholar_p

Unnamed: 0,id,title,authors,venue,year,title_norm,title_tokens,author_lastnames,year_parsed
0,aKcZKwvwbQwJ,11578 Sorrento Valley Road,QD Inc,"San Diego,",,11578 sorrento valley road,"{valley, sorrento, 11578, road}",{inc},
1,ixKfiTHoaDoJ,Initiation of crazes in polystyrene,"AS Argon, JG Hannoosh","Phil. Mag,",,initiation of crazes in polystyrene,"{initiation, crazes, of, polystyrene, in}","{hannoosh, argon}",
2,3BxllB4wwcIJ,Immunogold labelling is a quantitative method ...,"GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n","The Histochemical Journal,",1992.0,immunogold labelling is a quantitative method ...,"{as, studies, a, immunogold, labelling, quanti...","{m, hansen, n, wetterberg}",1992.0
3,d2WWxwKMex4J,The Burden of Infectious Disease Among Inmates...,"TM Hammett, P Harmon, W Rhodes",see,,the burden of infectious disease among inmates...,"{infectious, burden, from, among, of, releasee...","{harmon, rhodes, hammett}",
4,cZCX-AQpjccJ,The Role of Faculty Advising in Science and En...,JR Cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995.0,the role of faculty advising in science and en...,"{engineering, advising, of, faculty, science, ...",{cogdell},1995.0
...,...,...,...,...,...,...,...,...,...
64258,TX4SGKJQUiEJ,Theophylline toxicity.,DS Cooling,"Journal of Emergency Medicine,",1993.0,theophylline toxicity,"{toxicity, theophylline}",{cooling},1993.0
64259,url:http://portal.acm.org/citation.cfm%3Fid%3D...,Source ACM Transactions on Database Systems (T...,RW Taylor,"ACM Transactions on Database Systems (TODS),",1983.0,source acm transactions on database systems to...,"{systems, tods, acm, archive, database, transa...",{taylor},1983.0
64260,7Hi3KKsxKNwJ,Range Nesting: A Fast Method to Evaluate Quant...,"M Jarke, J Koch",,,range nesting a fast method to evaluate quanti...,"{evaluate, a, quantified, queries, range, meth...","{jarke, koch}",
64261,url:http://portal.acm.org/citation.cfm%3Fid%3D...,Source ACM Transactions on Database Systems (T...,W Kim,"ACM Transactions on Database Systems (TODS),",1993.0,source acm transactions on database systems to...,"{systems, tods, acm, archive, database, transa...",{kim},1993.0


In [22]:
# list of groups, where each group is a list of same publications
dblp_groups = deduplicate(dblp_p)

In [23]:
print(dblp_groups)
print(len(dblp_groups))

[['conf/vldb/RusinkiewiczKTWM95'], ['journals/sigmod/EisenbergM02'], ['conf/vldb/AmmannJR95'], ['journals/sigmod/Liu02', 'journals/sigmod/Liu02a', 'journals/sigmod/Liu02b', 'journals/sigmod/Liu01a', 'journals/sigmod/Liu01b', 'journals/sigmod/Liu03c', 'journals/sigmod/Liu03b', 'journals/sigmod/Liu03a', 'journals/sigmod/Liu03', 'journals/sigmod/Liu02c'], ['journals/sigmod/Liu01'], ['journals/sigmod/Franklin99b', 'journals/sigmod/Franklin00', 'journals/sigmod/Franklin00a', 'journals/sigmod/Franklin00b', 'journals/sigmod/Franklin98a', 'journals/sigmod/Franklin98b', 'journals/sigmod/Franklin99', 'journals/sigmod/Franklin99a', 'journals/sigmod/Franklin97a', 'journals/sigmod/Franklin98', 'journals/sigmod/Franklin97b'], ['journals/sigmod/Franklin97'], ['journals/sigmod/Widom96', 'journals/sigmod/Widom96b', 'journals/sigmod/Widom96a', 'journals/sigmod/Widom95a', 'journals/sigmod/Widom95'], ['journals/sigmod/Segev94a', 'journals/sigmod/Segev94'], ['journals/sigmod/Hammer02'], ['conf/vldb/Ferrand

In [24]:
scholar_groups = deduplicate(scholar_p)

In [25]:
print(scholar_groups)
print(len(scholar_groups))

[['aKcZKwvwbQwJ'], ['ixKfiTHoaDoJ'], ['3BxllB4wwcIJ'], ['d2WWxwKMex4J'], ['cZCX-AQpjccJ'], ['DMhfVNSDYD4J'], ['97wRWOpnANkJ'], ['xSv97kdDZU8J'], ['vj0-A5RoktIJ'], ['6TKMB5gO9EoJ'], ['im0SIHX8SOcJ'], ['vtw_UuPB1Q4J'], ['oxc-6rqkST0J'], ['izK60fJMujQJ'], ['_3_Ey7EseRIJ'], ['UPWEn_-Pd-sJ'], ['B41crPtpTMcJ'], ['vwBjHtowAUoJ'], ['lDTPyBMtHVwJ'], ['B-2FTtkDshoJ'], ['G7CJNSIBp10J'], ['173qcCip5coJ'], ['I3mRuMui5o0J'], ['0GT_dHqK5roJ'], ['6wr9qvjWwy4J'], ['yNwfL9rufUIJ'], ['k9_rZiB4rKoJ'], ['QOmFqwrnSvgJ'], ['UuU6hQ8nKIYJ'], ['0BIwb8RT4OMJ'], ['XLslbG3-JYcJ'], ['v-6ClxSz9doJ'], ['url:http://portal.acm.org/citation.cfm%3Fid%3D171926.172002'], ['ES6QyfgebIcJ'], ['hM920cBVn1wJ'], ['0HMk-YUh4i8J'], ['rgzK3sG-rnQJ'], ['r3sCE4vukG0J'], ['7B7KCnJu4j8J'], ['wGTOB7ImnLYJ'], ['d2QifUxZ3zQJ'], ['pvUeHhayZ88J'], ['pG9TXzucQzcJ'], ['fOH6aNSDFXkJ'], ['url:http://portal.acm.org/citation.cfm%3Fid%3D110307.110302'], ['_-p7FOuTcEIJ'], ['ZpOG6KM-mgoJ'], ['x28I9Elj9KIJ'], ['K_qOxjIJWpQJ'], ['l8WNeX0WeSAJ'], ['url

In [26]:
dblp_map = build_mapping(dblp_groups)
dblp_map

{'conf/vldb/RusinkiewiczKTWM95': 'conf/vldb/RusinkiewiczKTWM95',
 'journals/sigmod/EisenbergM02': 'journals/sigmod/EisenbergM02',
 'conf/vldb/AmmannJR95': 'conf/vldb/AmmannJR95',
 'journals/sigmod/Liu02': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu02a': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu02b': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu01a': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu01b': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu03c': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu03b': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu03a': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu03': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu02c': 'journals/sigmod/Liu02',
 'journals/sigmod/Liu01': 'journals/sigmod/Liu01',
 'journals/sigmod/Franklin99b': 'journals/sigmod/Franklin99b',
 'journals/sigmod/Franklin00': 'journals/sigmod/Franklin99b',
 'journals/sigmod/Franklin00a': 'journals/sigmod/Franklin99b',
 'journals/sigmod/Franklin00b': 'journals/sigmod/Franklin99b',
 

In [27]:

scholar_map = build_mapping(scholar_groups)
scholar_map

{'aKcZKwvwbQwJ': 'aKcZKwvwbQwJ',
 'ixKfiTHoaDoJ': 'ixKfiTHoaDoJ',
 '3BxllB4wwcIJ': '3BxllB4wwcIJ',
 'd2WWxwKMex4J': 'd2WWxwKMex4J',
 'cZCX-AQpjccJ': 'cZCX-AQpjccJ',
 'DMhfVNSDYD4J': 'DMhfVNSDYD4J',
 '97wRWOpnANkJ': '97wRWOpnANkJ',
 'xSv97kdDZU8J': 'xSv97kdDZU8J',
 'vj0-A5RoktIJ': 'vj0-A5RoktIJ',
 '6TKMB5gO9EoJ': '6TKMB5gO9EoJ',
 'im0SIHX8SOcJ': 'im0SIHX8SOcJ',
 'vtw_UuPB1Q4J': 'vtw_UuPB1Q4J',
 'oxc-6rqkST0J': 'oxc-6rqkST0J',
 'izK60fJMujQJ': 'izK60fJMujQJ',
 '_3_Ey7EseRIJ': '_3_Ey7EseRIJ',
 'UPWEn_-Pd-sJ': 'UPWEn_-Pd-sJ',
 'B41crPtpTMcJ': 'B41crPtpTMcJ',
 'vwBjHtowAUoJ': 'vwBjHtowAUoJ',
 'lDTPyBMtHVwJ': 'lDTPyBMtHVwJ',
 'B-2FTtkDshoJ': 'B-2FTtkDshoJ',
 'G7CJNSIBp10J': 'G7CJNSIBp10J',
 '173qcCip5coJ': '173qcCip5coJ',
 'I3mRuMui5o0J': 'I3mRuMui5o0J',
 '0GT_dHqK5roJ': '0GT_dHqK5roJ',
 '6wr9qvjWwy4J': '6wr9qvjWwy4J',
 'yNwfL9rufUIJ': 'yNwfL9rufUIJ',
 'k9_rZiB4rKoJ': 'k9_rZiB4rKoJ',
 'QOmFqwrnSvgJ': 'QOmFqwrnSvgJ',
 'UuU6hQ8nKIYJ': 'UuU6hQ8nKIYJ',
 '0BIwb8RT4OMJ': '0BIwb8RT4OMJ',
 'XLslbG3-

In [28]:
dblp_u = unique_df(dblp_p, dblp_map)
dblp_u

Unnamed: 0,id,title,authors,venue,year,title_norm,title_tokens,author_lastnames,year_parsed,canon_id
804,conf/sigmod/2000,Proceedings of the 2000 ACM SIGMOD Internation...,,,2000,proceedings of the 2000 acm sigmod internation...,"{conference, proceedings, acm, of, sigmod, dat...",{},2000,conf/sigmod/2000
454,conf/sigmod/2002,Proceedings of the 2002 ACM SIGMOD Internation...,,,2002,proceedings of the 2002 acm sigmod internation...,"{2002, conference, proceedings, acm, of, sigmo...",{},2002,conf/sigmod/2002
283,conf/sigmod/2003,Proceedings of the 2003 ACM SIGMOD Internation...,,,2003,proceedings of the 2003 acm sigmod internation...,"{california, 12, conference, proceedings, 2003...",{},2003,conf/sigmod/2003
572,conf/sigmod/AbadiC02,Visual COKO: a debugger for query optimizer de...,"D Abadi, M Cherniack",SIGMOD Conference,2002,visual coko a debugger for query optimizer dev...,"{a, query, visual, debugger, for, development,...","{abadi, cherniack}",2002,conf/sigmod/AbadiC02
279,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,Aurora: A Data Stream Management System,"D Abadi, D Carney, U Çetintemel, M Cherniack, ...",SIGMOD Conference,2003,aurora a data stream management system,"{a, stream, aurora, data, management, system}","{maskey, yan, xing, erwin, rasin, singer, zdon...",2003,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03
...,...,...,...,...,...,...,...,...,...,...
2477,journals/vldb/WangW01,Indexing very high-dimensional sparse and quas...,"C Wang, X Wang",VLDB J.,2001,indexing very high dimensional sparse and quas...,"{dimensional, sparse, similarity, vectors, sea...",{wang},2001,journals/vldb/WangW01
2556,journals/vldb/WhangKW94,Dynamic Maintenance of Data Distribution for S...,"K Whang, S Kim, G Wiederhold",VLDB J.,1994,dynamic maintenance of data distribution for s...,"{dynamic, estimation, maintenance, data, of, s...","{kim, whang, wiederhold}",1994,journals/vldb/WhangKW94
2407,journals/vldb/WidomS00,Foreword by the VLDB '98 PC Chairmen: Best Pap...,,,2000,foreword by the vldb 98 pc chairmen best paper...,"{best, foreword, of, by, chairmen, papers, pc,...",{},2000,journals/vldb/WidomS00
2284,journals/vldb/YangW03,Incremental computation and maintenance of tem...,"J Yang, J Widom",VLDB J.,2003,incremental computation and maintenance of tem...,"{incremental, maintenance, of, computation, te...","{widom, yang}",2003,journals/vldb/YangW03


In [29]:
scholar_u = unique_df(scholar_p, scholar_map)
scholar_u

Unnamed: 0,id,title,authors,venue,year,title_norm,title_tokens,author_lastnames,year_parsed,canon_id
11535,--1aoAaGkgQJ,The GMAP: A Versatile Tool for Physical Data I...,"G Odysseas, MH Solomon, YE Ioannidist",,,the gmap a versatile tool for physical data in...,"{physical, a, data, for, independence, tool, t...","{solomon, odysseas, ioannidist}",,zt5Ck1PrGf0J
36373,--3ZzSMFdA0J,"Molina, R. Aranha, and Y. Cho. Extracting semi...","J Hammer, H Garcia","Technical report, Computer Science Department,...",,molina r aranha and y cho extracting semistruc...,"{web, molina, y, the, aranha, semistructured, ...","{garcia, hammer}",,--3ZzSMFdA0J
63420,--K4eygDie4J,Dynamic Query Evaluation Plans: Some Course Co...,G Graefe,,,dynamic query evaluation plans some course cor...,"{plans, dynamic, query, some, corrections, eva...",{graefe},,--K4eygDie4J
50310,--LF_WC8c5sJ,Encouraging and Evaluating Scholarship for the...,PA Lacey,"New Directions for Teaching and Learning,",1990.0,encouraging and evaluating scholarship for the...,"{the, evaluating, teacher, for, college, schol...",{lacey},1990.0,--LF_WC8c5sJ
38075,--PsNHvlpbIJ,Carnot and InfoSleuth: database technology and...,D Woelk,Proceedings of the ACM SIGMOD International Co...,1995.0,carnot and infosleuth database technology and ...,"{technology, web, the, wide, world, database, ...",{woelk},1995.0,--PsNHvlpbIJ
...,...,...,...,...,...,...,...,...,...,...
22087,zzoKlsL_J-oJ,"SEER cancer statistics review, 1973â??1991: ta...","CL Kosary, LAG Ries, BA Miller, BF Hankey, A","MD,",,seer cancer statistics review 1973a 1991 table...,"{statistics, 1991, cancer, nih, 2789, pub, nat...","{kosary, a, miller, hankey, ries}",,zzoKlsL_J-oJ
16647,zzpXCkKwo_8J,Nonlinear dynamics of two types of network wit...,P De Wilde,"BT TECHNOLOGY JOURNAL,",,nonlinear dynamics of two types of network wit...,"{two, with, of, types, nodes, nonlinear, intel...",{wilde},,zzpXCkKwo_8J
52139,zzs-zmjbNDkJ,Atlas and Catalogue of Interacting Galaxies,BA Vorontsov-Velyaminov,,,atlas and catalogue of interacting galaxies,"{interacting, catalogue, atlas, of, galaxies, ...",{velyaminov},,zzs-zmjbNDkJ
9693,zztneAuQXrgJ,Molecular selforganisation in a developmental ...,"H Bolouri, R Adams, S George, AG Rust",Procs of the Int. Conf. on Neural Information ...,,molecular selforganisation in a developmental ...,"{a, the, large, of, molecular, artificial, dev...","{bolouri, george, adams, rust}",,zztneAuQXrgJ


In [30]:
report_dedup("DBLP", dblp_p, dblp_groups, dblp_u)
report_dedup("Scholar", scholar_p, scholar_groups, scholar_u)


DBLP 1.1 REPORT
-----------------------------------
Input records: 2616
Duplicate groups: 28
Final unique records: 2563

Scholar 1.1 REPORT
-----------------------------------
Input records: 64263
Duplicate groups: 761
Final unique records: 63382


In [31]:
# gold pairs (original ids) -> canonical ids
gold_pairs = set(
    (dblp_map.get(d, d), scholar_map.get(s, s))
    for d, s in zip(gt["iddblp"], gt["idscholar"])
)

# Use canonical ids for candidate pairs
dblp_u = dblp_u.copy()
scholar_u = scholar_u.copy()
dblp_u["canon_id"] = dblp_u["id"].map(dblp_map)
scholar_u["canon_id"] = scholar_u["id"].map(scholar_map)

In [32]:
print(len(gold_pairs))
print(len(gt))

4805
5347


In [33]:
dblp_u

Unnamed: 0,id,title,authors,venue,year,title_norm,title_tokens,author_lastnames,year_parsed,canon_id
804,conf/sigmod/2000,Proceedings of the 2000 ACM SIGMOD Internation...,,,2000,proceedings of the 2000 acm sigmod internation...,"{conference, proceedings, acm, of, sigmod, dat...",{},2000,conf/sigmod/2000
454,conf/sigmod/2002,Proceedings of the 2002 ACM SIGMOD Internation...,,,2002,proceedings of the 2002 acm sigmod internation...,"{2002, conference, proceedings, acm, of, sigmo...",{},2002,conf/sigmod/2002
283,conf/sigmod/2003,Proceedings of the 2003 ACM SIGMOD Internation...,,,2003,proceedings of the 2003 acm sigmod internation...,"{california, 12, conference, proceedings, 2003...",{},2003,conf/sigmod/2003
572,conf/sigmod/AbadiC02,Visual COKO: a debugger for query optimizer de...,"D Abadi, M Cherniack",SIGMOD Conference,2002,visual coko a debugger for query optimizer dev...,"{a, query, visual, debugger, for, development,...","{abadi, cherniack}",2002,conf/sigmod/AbadiC02
279,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,Aurora: A Data Stream Management System,"D Abadi, D Carney, U Çetintemel, M Cherniack, ...",SIGMOD Conference,2003,aurora a data stream management system,"{a, stream, aurora, data, management, system}","{maskey, yan, xing, erwin, rasin, singer, zdon...",2003,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03
...,...,...,...,...,...,...,...,...,...,...
2477,journals/vldb/WangW01,Indexing very high-dimensional sparse and quas...,"C Wang, X Wang",VLDB J.,2001,indexing very high dimensional sparse and quas...,"{dimensional, sparse, similarity, vectors, sea...",{wang},2001,journals/vldb/WangW01
2556,journals/vldb/WhangKW94,Dynamic Maintenance of Data Distribution for S...,"K Whang, S Kim, G Wiederhold",VLDB J.,1994,dynamic maintenance of data distribution for s...,"{dynamic, estimation, maintenance, data, of, s...","{kim, whang, wiederhold}",1994,journals/vldb/WhangKW94
2407,journals/vldb/WidomS00,Foreword by the VLDB '98 PC Chairmen: Best Pap...,,,2000,foreword by the vldb 98 pc chairmen best paper...,"{best, foreword, of, by, chairmen, papers, pc,...",{},2000,journals/vldb/WidomS00
2284,journals/vldb/YangW03,Incremental computation and maintenance of tem...,"J Yang, J Widom",VLDB J.,2003,incremental computation and maintenance of tem...,"{incremental, maintenance, of, computation, te...","{widom, yang}",2003,journals/vldb/YangW03


In [34]:
# Blocking rule 1: title prefix (first 3 tokens)

dblp_u["title_prefix"] = dblp_u["title_tokens"].apply(lambda t: make_title_prefix(t, k=3))
scholar_u["title_prefix"] = scholar_u["title_tokens"].apply(lambda t: make_title_prefix(t, k=3))

def block_title_prefix(d_df, s_df):
    s_index = defaultdict(list)
    for _, r in s_df.iterrows():
        s_index[r["title_prefix"]].append(r["canon_id"])

    pairs = set()
    for _, r in d_df.iterrows():
        key = r["title_prefix"]
        for sid in s_index.get(key, []):
            pairs.add((r["canon_id"], sid))
    return pairs

cand_title = block_title_prefix(dblp_u, scholar_u)

In [35]:
print(cand_title)
print(len(cand_title))

{('conf/sigmod/ShklarSKT95', 'aD3jAQWyvJQJ'), ('journals/sigmod/DoanPK95', 'WrPdVlumZL4J'), ('conf/vldb/WeberSB98', 'PceyDO5iYdUJ'), ('journals/vldb/AbbadiSW01', 'url:http://portal.acm.org/ft_gateway.cfm%3Fid%3D765219%26type%3Dpdf%26dl%3DGUIDE%26dl%3DACM%26CFID%3D11111111%26CFTOKEN%3D2222222'), ('conf/sigmod/BohlenBD99', 'X2odIfnOC88J'), ('conf/sigmod/NestorovAM98', 'kfADlOEkwOkJ'), ('conf/vldb/GarofalakisRS99', 'poso7AKokFMJ'), ('conf/sigmod/GoldmanW00', '9wxSfPaUz6QJ'), ('conf/vldb/SeshadriLR96', '_zyTvDvItXwJ'), ('conf/vldb/Selinger02', '76u3c8hIWqoJ'), ('conf/vldb/Kotidis01', 'lC-F9_1F_vkJ'), ('conf/vldb/Schnase00', 'OdaRU442nm0J'), ('journals/sigmod/ChongFTSYKDJJ03', 'LyyQWcdn5D0J'), ('conf/sigmod/ChenR94', '4Hhh-d__bksJ'), ('conf/sigmod/LiVCAHMWBCHIKSS99', '3EdatsIvdSkJ'), ('conf/sigmod/RafieiM97', 'IlN1Z8UC688J'), ('conf/vldb/LakshmananSS96', 'IKqp8Y-9eN8J'), ('conf/vldb/SistlaYLL95', 'HtTNmhCJY-oJ'), ('conf/vldb/CarusoCGLM00', '1Cd28pD7V_YJ'), ('journals/sigmod/Barbara01', '1Xo

In [36]:
# Blocking rule 2: author last-name overlap (inverted index)
def block_author_overlap(d_df, s_df):
    s_auth = defaultdict(list)
    for _, r in s_df.iterrows():
        for a in r["author_lastnames"]:
            s_auth[a].append(r["canon_id"])

    pairs = set()
    for _, r in d_df.iterrows():
        for a in r["author_lastnames"]:
            for sid in s_auth.get(a, []):
                pairs.add((r["canon_id"], sid))
    return pairs

cand_author = block_author_overlap(dblp_u, scholar_u)

In [37]:
#print(cand_author)
print(len(cand_author))

374247


In [38]:
# Union + report candidate count + % gold retained
candidate_pairs = cand_title | cand_author

retained_gold = gold_pairs & candidate_pairs
gold_retention = 100.0 * len(retained_gold) / len(gold_pairs)

print("\n1.2 BLOCKING REPORT")
print("-"*35)
print("Candidates (title prefix):", len(cand_title))
print("Candidates (author overlap):", len(cand_author))
print("Candidates (union):", len(candidate_pairs))
print("Gold pairs total:", len(gold_pairs))
print("Gold retained:", len(retained_gold))
print(f"% Gold retained: {gold_retention:.2f}%")



1.2 BLOCKING REPORT
-----------------------------------
Candidates (title prefix): 8041
Candidates (author overlap): 374247
Candidates (union): 379552
Gold pairs total: 4805
Gold retained: 4626
% Gold retained: 96.27%


In [39]:
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer

t0 = time.time()

# ---- 1) Build fast lookups (O(n)) ----
dblp_titles = dblp_u.set_index("canon_id")["title_norm"].to_dict()
sch_titles  = scholar_u.set_index("canon_id")["title_norm"].to_dict()

dblp_tokens = dblp_u.set_index("canon_id")["title_tokens"].to_dict()
sch_tokens  = scholar_u.set_index("canon_id")["title_tokens"].to_dict()

dblp_ids = list(dblp_titles.keys())
sch_ids  = list(sch_titles.keys())

d_pos = {cid: i for i, cid in enumerate(dblp_ids)}
s_pos = {cid: i for i, cid in enumerate(sch_ids)}

# ---- 2) TF-IDF fit (usually seconds) ----
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,2))  # you can switch to (1,1) if needed
X = vectorizer.fit_transform([dblp_titles[c] for c in dblp_ids] + [sch_titles[c] for c in sch_ids])

n_d = len(dblp_ids)
Xd = X[:n_d]
Xs = X[n_d:]

print("TF-IDF time (s):", round(time.time() - t0, 2))

# ---- 3) Prepare candidate index arrays (O(#pairs)) ----
pairs_list = list(candidate_pairs)

d_idx = []
s_idx = []
valid_pairs = 0

for d_id, s_id in pairs_list:
    i = d_pos.get(d_id)
    j = s_pos.get(s_id)
    if i is None or j is None:
        continue
    d_idx.append(i)
    s_idx.append(j)
    valid_pairs += 1

d_idx = np.array(d_idx, dtype=int)
s_idx = np.array(s_idx, dtype=int)

print("Valid candidate pairs to score:", valid_pairs)

# ---- 4) Vectorized cosine computation (FAST) ----
t1 = time.time()

# elementwise multiply row-wise then sum => dot product (cosine because TF-IDF rows are L2-normalized)
cosines = Xd[d_idx].multiply(Xs[s_idx]).sum(axis=1).A1

cos_ge_095 = int((cosines >= 0.95).sum())

print("Scoring time (s):", round(time.time() - t1, 2))

print("\n1.3 SIMILARITY REPORT")
print("-"*35)
print("Candidate pairs scored:", len(cosines))
print("Pairs with cosine ≥ 0.95:", cos_ge_095)
print(f"Cosine summary: min={cosines.min():.3f}, mean={cosines.mean():.3f}, max={cosines.max():.3f}")


TF-IDF time (s): 2.24
Valid candidate pairs to score: 379552
Scoring time (s): 0.17

1.3 SIMILARITY REPORT
-----------------------------------
Candidate pairs scored: 379552
Pairs with cosine ≥ 0.95: 2870
Cosine summary: min=0.000, mean=0.022, max=1.000


In [40]:
# 1.4 Matching & Evaluation (P/R/F1 at thresholds 0.70/0.80/0.90/0.95)
# We evaluate against perfect mapping:
# Predictions are made only on candidate pairs (blocked set)
# Any gold pair not in candidates is counted as FN (honest evaluation)

# recompute cosine for all candidate pairs into a dict
def prf(pred_pairs, gold_pairs):
    tp = len(pred_pairs & gold_pairs)
    fp = len(pred_pairs - gold_pairs)
    fn = len(gold_pairs - pred_pairs)

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall    = tp / (tp + fn) if (tp + fn) else 0.0
    f1        = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    return precision, recall, f1, tp, fp, fn

thresholds = [0.70, 0.80, 0.90, 0.95]

print("\n1.4 MATCHING & EVALUATION")
print("-" * 70)
print("thr   precision  recall     f1       tp     fp     fn")

for thr in thresholds:
    pred_pairs = {pairs_list[i] for i, c in enumerate(cosines) if c >= thr}
    p, r, f1, tp, fp, fn = prf(pred_pairs, gold_pairs)
    print(f"{thr:0.2f}  {p:0.4f}     {r:0.4f}    {f1:0.4f}   {tp:5d}  {fp:5d}  {fn:5d}")




1.4 MATCHING & EVALUATION
----------------------------------------------------------------------
thr   precision  recall     f1       tp     fp     fn
0.70  0.8658     0.7680    0.8139    3690    572   1115
0.80  0.8605     0.6483    0.7395    3115    505   1690
0.90  0.8478     0.5369    0.6575    2580    463   2225
0.95  0.8408     0.5022    0.6288    2413    457   2392


In [41]:
import pandas as pd

pairs_scored = pd.DataFrame({
    "dblp_canon_id": [p[0] for p in pairs_list],
    "scholar_canon_id": [p[1] for p in pairs_list],
    "cosine": cosines
})

pairs_scored.to_csv("pairs_scored.csv", index=False)

print("Saved pairs_scored.csv with", len(pairs_scored), "candidate pairs")


Saved pairs_scored.csv with 379552 candidate pairs


Part 2

In [42]:
pairs = pd.read_csv("pairs_scored.csv")
print("pairs_scored shape:", pairs.shape)
pairs.head()

pairs_scored shape: (379552, 3)


Unnamed: 0,dblp_canon_id,scholar_canon_id,cosine
0,conf/sigmod/LivnyRBCDLMW97,TtV6gGLQ9NMJ,0.005939
1,conf/vldb/LuSL95,08rAivBM66sJ,0.0
2,journals/vldb/LiR99,ml1M1DTzYqwJ,0.0
3,conf/sigmod/BabcockCD03,SGmfFHnOf4IJ,0.0
4,conf/sigmod/SarawagiTA98,kAi6HssDySgJ,0.032553


In [43]:
# Lookups
d_title_norm = dblp_u.set_index("canon_id")["title_norm"].to_dict()
s_title_norm = scholar_u.set_index("canon_id")["title_norm"].to_dict()

d_title_tok  = dblp_u.set_index("canon_id")["title_tokens"].to_dict()
s_title_tok  = scholar_u.set_index("canon_id")["title_tokens"].to_dict()

d_auth = dblp_u.set_index("canon_id")["author_lastnames"].to_dict()
s_auth = scholar_u.set_index("canon_id")["author_lastnames"].to_dict()

d_year = dblp_u.set_index("canon_id")["year_parsed"].to_dict()
s_year = scholar_u.set_index("canon_id")["year_parsed"].to_dict()

# venue optional
if "venue" in dblp_u.columns and "venue" in scholar_u.columns:
    d_venue = dblp_u.set_index("canon_id")["venue"].fillna("").map(normalize_text).to_dict()
    s_venue = scholar_u.set_index("canon_id")["venue"].fillna("").map(normalize_text).to_dict()
else:
    d_venue, s_venue = {}, {}

In [44]:
# feature helpers 
def dice(a, b):
    if not a or not b:
        return 0.0
    inter = len(a & b)
    return 2 * inter / (len(a) + len(b))

def shared_token_count(a, b):
    if not a or not b:
        return 0
    return len(a & b)

def char_ngrams(s, n=3):
    s = normalize_text(s)
    if len(s) < n:
        return set()
    return {s[i:i+n] for i in range(len(s) - n + 1)}

def jaccard_sets(a, b):
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)


In [45]:
feature_rows = []

for _, r in pairs.iterrows():
    d_id = r["dblp_canon_id"]
    s_id = r["scholar_canon_id"]

    td = d_title_tok.get(d_id, set())
    ts = s_title_tok.get(s_id, set())

    ad = d_auth.get(d_id, set())
    aS = s_auth.get(s_id, set())

    yd = d_year.get(d_id)
    ys = s_year.get(s_id)

    # ---- existing similarity scores ----
    title_cos  = float(r.get("cosine_tfidf", 0.0))
    title_jacc = float(r.get("jaccard_title", jaccard_sets(td, ts)))

    # ---- required / common engineered features ----
    auth_olap = 1 if (ad & aS) else 0

    year_ok = 0
    year_diff = -1
    if yd is not None and ys is not None:
        year_diff = abs(yd - ys)
        year_ok = 1 if year_diff <= 1 else 0

    # venue exact + venue similarities (if available)
    v1 = d_venue.get(d_id, "")
    v2 = s_venue.get(s_id, "")
    venue_exact = 1 if (v1 and v2 and v1 == v2) else 0
    venue_jacc = jaccard_sets(set(v1.split()), set(v2.split())) if (v1 and v2) else 0.0
    venue_dice = dice(set(v1.split()), set(v2.split())) if (v1 and v2) else 0.0

    # ---- extra helpful features ----
    title_len_diff = abs(len(td) - len(ts))
    title_shared_tokens = shared_token_count(td, ts)

    auth_jacc = jaccard_sets(ad, aS)
    auth_dice = dice(ad, aS)

    # char 3-gram overlap on title
    c1 = char_ngrams(d_title_norm.get(d_id, ""), n=3)
    c2 = char_ngrams(s_title_norm.get(s_id, ""), n=3)
    title_char_jacc = jaccard_sets(c1, c2)

    feature_rows.append({
        "dblp_canon_id": d_id,
        "scholar_canon_id": s_id,

        # --- at least 6 core features ---
        "title_cos": title_cos,
        "title_jacc": title_jacc,
        "auth_olap": auth_olap,
        "year_ok": year_ok,
        "venue_exact": venue_exact,
        "year_diff": year_diff,

        # --- extra features ---
        "title_len_diff": title_len_diff,
        "title_shared_tokens": title_shared_tokens,
        "auth_jacc": auth_jacc,
        "auth_dice": auth_dice,
        "title_char_jacc": title_char_jacc,
        "venue_jacc": venue_jacc,
        "venue_dice": venue_dice,
    })

pairs_features = pd.DataFrame(feature_rows)
pairs_features.to_csv("pairs_features.csv", index=False)

print("Saved pairs_features.csv:", pairs_features.shape)
pairs_features.head()


Saved pairs_features.csv: (379552, 15)


Unnamed: 0,dblp_canon_id,scholar_canon_id,title_cos,title_jacc,auth_olap,year_ok,venue_exact,year_diff,title_len_diff,title_shared_tokens,auth_jacc,auth_dice,title_char_jacc,venue_jacc,venue_dice
0,conf/sigmod/LivnyRBCDLMW97,TtV6gGLQ9NMJ,0.0,0.071429,1,0,1,3.0,1,1,0.111111,0.2,0.018692,1.0,1.0
1,conf/vldb/LuSL95,08rAivBM66sJ,0.0,0.0,1,0,0,,6,0,0.166667,0.285714,0.0,0.0,0.0
2,journals/vldb/LiR99,ml1M1DTzYqwJ,0.0,0.0,1,0,0,,3,0,0.333333,0.5,0.04,0.0,0.0
3,conf/sigmod/BabcockCD03,SGmfFHnOf4IJ,0.0,0.0,1,0,0,,6,0,0.2,0.333333,0.044444,0.0,0.0
4,conf/sigmod/SarawagiTA98,kAi6HssDySgJ,0.0,0.071429,1,0,0,,3,1,0.25,0.4,0.102041,0.0,0.0


In [46]:
# gold_pairs in canonical space (as before)
gold_keys = set(str(d) + "#" + str(s) for (d, s) in gold_pairs)

pairs_features["key"] = pairs_features["dblp_canon_id"].astype(str) + "#" + pairs_features["scholar_canon_id"].astype(str)
pairs_features["label"] = pairs_features["key"].isin(gold_keys).astype(int)
pairs_features = pairs_features.drop(columns=["key"])

pairs_features.to_csv("training_features_labeled.csv", index=False)
print("Saved training_features_labeled.csv:", pairs_features.shape)
print(pairs_features["label"].value_counts())


Saved training_features_labeled.csv: (379552, 16)
label
0    374926
1      4626
Name: count, dtype: int64


In [47]:
import pandas as pd
import numpy as np

df = pd.read_csv("training_features_labeled.csv")

print("Before:", df.shape)

df = df.drop_duplicates(subset=["dblp_canon_id", "scholar_canon_id"], keep="first")

print("After dedup pairs:", df.shape)
print(df["label"].value_counts())


Before: (379552, 16)
After dedup pairs: (379552, 16)
label
0    374926
1      4626
Name: count, dtype: int64


In [48]:
# ----- Missing indicators -----
df["year_missing"] = (df["year_diff"] == -1).astype(int)

# Replace sentinel with NaN then impute
df["year_diff"] = df["year_diff"].replace(-1, np.nan)

# Impute with median (robust)
year_median = df["year_diff"].median()
df["year_diff"] = df["year_diff"].fillna(year_median)

print("Year median used for imputation:", year_median)


Year median used for imputation: 4.0


In [49]:
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("label")  # label not a feature

for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].median())


In [50]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Feature columns
id_cols = ["dblp_canon_id", "scholar_canon_id"]
target = "label"

feature_cols = [c for c in df.columns if c not in id_cols + [target]]

# Identify binary vs continuous (simple heuristic)
binary_cols = [c for c in feature_cols if set(df[c].dropna().unique()).issubset({0, 1})]
cont_cols = [c for c in feature_cols if c not in binary_cols]

print("Binary features:", binary_cols)
print("Continuous features:", cont_cols)

X = df[feature_cols]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

preprocess = ColumnTransformer(
    transformers=[
        ("cont", StandardScaler(), cont_cols),
        ("bin", "passthrough", binary_cols)
    ]
)


Binary features: ['title_cos', 'auth_olap', 'year_ok', 'venue_exact', 'year_missing']
Continuous features: ['title_jacc', 'year_diff', 'title_len_diff', 'title_shared_tokens', 'auth_jacc', 'auth_dice', 'title_char_jacc', 'venue_jacc', 'venue_dice']


In [51]:
from sklearn.linear_model import LogisticRegression

clf_weighted = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=400, class_weight="balanced"))
])

clf_weighted.fit(X_train, y_train)
print("Trained LogisticRegression with class_weight='balanced'")


Trained LogisticRegression with class_weight='balanced'


In [52]:
# Undersample negatives to, e.g., 1:5 ratio
pos = df[df["label"] == 1]
neg = df[df["label"] == 0].sample(n=min(len(df[df["label"]==0]), 5*len(pos)), random_state=42)

df_under = pd.concat([pos, neg]).sample(frac=1, random_state=42)  # shuffle

print("Undersampled dataset:", df_under.shape)
print(df_under["label"].value_counts())


Undersampled dataset: (27756, 17)
label
0    23130
1     4626
Name: count, dtype: int64


In [53]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

smote_clf = ImbPipeline([
    ("prep", preprocess),
    ("smote", SMOTE(random_state=42)),
    ("clf", LogisticRegression(max_iter=400))
])

smote_clf.fit(X_train, y_train)
print("Trained LogisticRegression with SMOTE")


Trained LogisticRegression with SMOTE


In [54]:
df.to_csv("training_features_labeled_preprocessed.csv", index=False)
print("Saved training_features_labeled_preprocessed.csv")


Saved training_features_labeled_preprocessed.csv


Train


In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# -----------------------------
# 1) Load + de-duplicate pairs
# -----------------------------
df = pd.read_csv("training_features_labeled.csv")
df = df.drop_duplicates(subset=["dblp_canon_id", "scholar_canon_id"], keep="first")

In [56]:
# -----------------------------
# 2) Missing values handling
#    (year_diff == -1 means missing)
# -----------------------------
df["year_missing"] = (df["year_diff"] == -1).astype(int)
df["year_diff"] = df["year_diff"].replace(-1, np.nan)
df["year_diff"] = df["year_diff"].fillna(df["year_diff"].median())

# Fill any remaining NaNs in numeric cols with median
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].median())

In [57]:
# -----------------------------
# 3) Features / target
# -----------------------------
id_cols = ["dblp_canon_id", "scholar_canon_id"]
target = "label"
feature_cols = [c for c in df.columns if c not in id_cols + [target]]

X = df[feature_cols]
y = df[target].astype(int)

# Identify binary vs continuous features (simple heuristic)
binary_cols = [c for c in feature_cols if set(df[c].dropna().unique()).issubset({0, 1})]
cont_cols = [c for c in feature_cols if c not in binary_cols]

# Scale only continuous (SVM needs scaling; RF doesn't, but it won't break)
preprocess = ColumnTransformer(
    transformers=[
        ("cont", StandardScaler(), cont_cols),
        ("bin", "passthrough", binary_cols),
    ],
    remainder="drop"
)

In [58]:
# -----------------------------
# 4) Models (handle imbalance)
# -----------------------------
rf = Pipeline([
    ("prep", preprocess),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample"
    ))
])

svm = Pipeline([
    ("prep", preprocess),
    ("clf", SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        class_weight="balanced"
    ))
])

In [59]:
# -----------------------------
# 5) 3-fold CV + metrics
# -----------------------------
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

scoring = {
    "precision": make_scorer(precision_score, pos_label=1, average="binary", zero_division=0),
    "recall":    make_scorer(recall_score, pos_label=1, average="binary", zero_division=0),
    "f1":        make_scorer(f1_score, pos_label=1, average="binary", zero_division=0),
}

In [60]:
def run_cv(name, model):
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)
    p_mean, p_std = scores["test_precision"].mean(), scores["test_precision"].std()
    r_mean, r_std = scores["test_recall"].mean(), scores["test_recall"].std()
    f_mean, f_std = scores["test_f1"].mean(), scores["test_f1"].std()

    print(f"\n{name} — 3-Fold CV")
    print("-" * 50)
    print(f"Precision: {p_mean:.4f} ± {p_std:.4f}")
    print(f"Recall:    {r_mean:.4f} ± {r_std:.4f}")
    print(f"F1-score:  {f_mean:.4f} ± {f_std:.4f}")

In [61]:
run_cv("Random Forest", rf)


Random Forest — 3-Fold CV
--------------------------------------------------
Precision: 0.9129 ± 0.0100
Recall:    0.9414 ± 0.0064
F1-score:  0.9269 ± 0.0056


In [62]:
run_cv("SVM (RBF)", svm)


SVM (RBF) — 3-Fold CV
--------------------------------------------------
Precision: 0.7817 ± 0.0157
Recall:    0.9946 ± 0.0003
F1-score:  0.8753 ± 0.0097
