### 1. Replace each abbreviation entity with its fullname in every paper, assuming that each abbreviation corresponds to only one fullname in the paper.

In [None]:
import numpy as np
from flashtext import KeywordProcessor
import pandas as pd
from fasttext import load_model
import re
from collections import defaultdict

# Abbreviation -> list of candidate fullnames, read from a tab-separated file
# (first field: abbreviation, remaining fields: fullnames). Lines are lower-cased first.
abb2full = {}
with open("./data/mapping-list.txt", encoding='utf8') as f:
    for line in f:
        abb, *candidates = line.strip().lower().split('\t')
        fulls = [name.strip() for name in candidates]
        # Index the abbreviation as written and with its spaces removed; both keys share one list.
        for key in (abb, abb.replace(' ', '')):
            abb2full[key] = fulls
# Longest abbreviations first.
abbs = sorted(abb2full, key=len, reverse=True)
# Matcher used to find abbreviations inside entity strings.
kp = KeywordProcessor()
kp.add_keywords_from_list(abbs)

'''
Load pre-trained FastText model.
We trained a FastText model based on the collected research paper data to generate entity representation vectors.
Due to the large file size, it was not uploaded. If necessary, you can train the model using your own data or 
obtain word vectors through other means.
'''
ftt = load_model("ftt.model")

def cosine_similarity(x,y):
    """Return the cosine similarity of two NumPy vectors.

    Args:
        x: NumPy vector.
        y: NumPy vector with the same number of components as ``x``.

    Returns:
        ``x . y / (||x|| * ||y||)``, or ``0.0`` when either vector has zero
        norm, where the similarity is undefined.
    """
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    # 0/0 would produce NaN (plus a RuntimeWarning); NaN compares false with
    # everything, so sorting candidates by similarity would become unreliable.
    if denom == 0:
        return 0.0
    return x.dot(y.T) / denom

def replace_abb2full(ents):
    """Expand the abbreviations occurring in a collection of entity strings.

    Every abbreviation that the module-level ``kp`` matcher finds in the
    normalised entities is mapped to one fullname: the only candidate in
    ``abb2full`` when there is just one, otherwise the candidate whose
    ``ftt`` word vector is most cosine-similar to the ``ftt`` sentence vector
    of all entities joined together.

    Args:
        ents: Collection of entity strings. It is iterated twice, so it must
            not be a one-shot iterator.

    Returns:
        dict mapping each original entity string to its normalised form with
        the matched abbreviations replaced by the chosen fullnames.
    """
    def _normalise(text):
        # Punctuation -> space, a space in front of every digit run
        # (separates digits from characters), then collapse repeated spaces.
        text = re.sub('[!"#$%&\'()*,-./:;<=>?[\\]^_`{|}~（）—_–]', ' ', text)
        text = re.sub(r'(\d+)', r' \1', text)
        return re.sub(' +', ' ', text)

    joined = '  '.join(ents)
    # The context vector is taken from the raw text, before normalisation.
    context_vec = ftt.get_sentence_vector(joined)
    # Match abbreviation entities.
    found_abbs = kp.extract_keywords(_normalise(joined))

    pairs = []
    for abb in found_abbs:
        candidates = abb2full[abb]
        if len(candidates) == 1:
            chosen = candidates[0]
        else:
            scored = [(name, cosine_similarity(ftt.get_word_vector(name), context_vec)) for name in candidates]
            # Stable ascending sort: the last item has the highest similarity score.
            scored.sort(key=lambda item: item[1])
            chosen = scored[-1][0]
        pairs.append([abb, chosen])

    # Register the abbreviations longest-first in a per-call replacer.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
    replacer = KeywordProcessor()
    for abb, fullname in pairs:
        replacer.add_keyword(abb, fullname)

    # Replace the abbreviated entity with its fullname.
    return {ent: replacer.replace_keywords(_normalise(ent)) for ent in ents}

# Expand abbreviations paper by paper and group the original entity strings
# by the expanded form they end up with.
df_ents = pd.read_parquet("./data/paper-ents.parquet")
d = defaultdict(set)
for row in df_ents.itertuples():
    # row[2] holds the paper's entity records; element [1] of a record is the entity text.
    surfaces = [record[1] for record in row[2]]
    ents = {text.lower() for text in surfaces}
    new_ents = replace_abb2full(ents)
    # Second pass over the already-replaced strings
    # (presumably to expand abbreviations left over after the first pass - TODO confirm).
    new_ents_2 = replace_abb2full(set(new_ents.values()))
    for text in surfaces:
        d[new_ents_2[new_ents[text.lower()]]].add(text)
    print(f'\r{row[0]+1}/{len(df_ents)}', end='')
# One row per expanded entity: (expanded form, set of original strings).
data = list(d.items())
df = pd.DataFrame(data)
# df.to_csv('./data/replaced-ents.csv')
df.to_parquet('./data/replaced-ents.parquet')

### 2. Lemmatize the replaced entity words and remove some words that do not contribute much to the semantics.

In [None]:
import pandas as pd
from collections import defaultdict, Counter
from flashtext import KeywordProcessor
from nltk.stem import WordNetLemmatizer
import re

# List of words that do not contribute much to the semantics (one word per line in the file).
with open("./data/remove-words.txt", encoding='utf8') as f:
    kws = f.read().strip().split('\n')
# Every listed word is mapped to a single space, so kp.replace_keywords() blanks it out.
kp = KeywordProcessor()
kp.add_keywords_from_dict({' ':kws})

# Shared WordNet lemmatizer instance used by word_lemma().
wnl = WordNetLemmatizer()
def word_lemma(w):
    """Return the WordNet lemma of ``w``.

    The word is lemmatized as a noun, then as a verb, then as an adjective;
    the first result that differs from ``w`` is returned. If none of them
    changes the word, ``w`` itself is returned.
    """
    for pos in ('n', 'v', 'a'):
        lemma = wnl.lemmatize(w, pos)
        if lemma != w:
            return lemma
    return w

df = pd.read_parquet('./data/replaced-ents.parquet')
# Vocabulary: every space-separated word of every replaced entity.
words = set()
for row in df.itertuples():
    words.update(row[1].split(' '))

# Space-free abbreviation -> canonical form of its fullname (unambiguous abbreviations only).
abb2lemma_full = {}
with open("./data/mapping-list.txt", encoding='utf8') as f:
    for line in f:
        abb, *candidates = line.strip().lower().split('\t')
        fulls = [name.strip() for name in candidates]
        # For an abbreviation with only one fullname: lemmatize the fullname's words, blank out the
        # insignificant words, then sort the words and join them without spaces. The abbreviation key
        # also has its spaces removed. Used to replace an entity that is still an abbreviation after processing.
        if len(fulls) == 1:
            lemmatized = ' '.join(word_lemma(token) for token in fulls[0].split(' '))
            kept = kp.replace_keywords(lemmatized).split(' ')
            abb2lemma_full[abb.replace(' ', '')] = ''.join(sorted(kept))
        # Add the words of every fullname in the mapping list to the vocabulary.
        for full in fulls:
            words.update(full.split(' '))
# Lemma lookup, computed once per distinct word.
word2lemma = {w: word_lemma(w) for w in words}

# Canonicalise every replaced entity: lemmatize its words, drop repeated and
# low-information words, glue numbers onto the preceding word, then sort and
# concatenate the words. Original entity strings whose canonical keys coincide
# are collected together in `d`.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
# Compiled once because it is applied to every row.
token_splitter = re.compile(r' +|(\d+)')
d = defaultdict(list)
for row in df.itertuples():
    words = row[1].split(' ')
    ws_lemma = [word2lemma[w] for w in words]
    words = [ws_lemma[0]]
    for w in ws_lemma[1:]:
        # Exclude consecutive identical words, as this may be caused by two consecutive replacements.
        if w!=words[-1]:
            words.append(w)
    cou = Counter(words)
    # Remove duplicate words when there are at least two repeated words, as this may occur when an abbreviation and its fullname appear in the same entity.
    # (Condition: more than two distinct words and the second most frequent word occurs more than once.)
    if len(cou)>2 and cou.most_common(2)[-1][-1]>1:
        ws, se = [], set()
        for w in words:
            if w not in se:
                ws.append(w)
                se.add(w)
        ent = ' '.join(ws)
    else:
        ent = ' '.join(words)
    # Blank out the words that do not contribute much to the semantics.
    ent = kp.replace_keywords(ent)
    # Skip entities (longer than one character) that start with a digit.
    if len(ent)>1 and ent[0].isdigit(): continue
    # Split on runs of spaces and around digit runs; the capture group keeps the digits as tokens,
    # and the filter below drops the None/empty pieces that re.split produces.
    ws = token_splitter.split(ent)
    ws = [w for w in ws if w]
    if not ws: continue
    ws_ = [ws[0]]
    for w in ws[1:]:
        # Merge the number with the preceding word.
        if w.isdigit():
            ws_[-1] = ws_[-1]+w
        else:
            ws_.append(w)
    ent = ''.join(ws_)
    if ent in abb2lemma_full:
        # If, after the previous processing step, the entity is still an abbreviation without spaces, then replace it with the corresponding fullname.
        ws_ = abb2lemma_full[ent].split(' ')
    # Sorting the processed entity words can eliminate the impact of word order on calculating the edit distance.
    ws = sorted(ws_)
    ent = ''.join(ws)
    # If the processed entity contains only one word, such as "score," it may lack meaning. If there are similar meaningless words discovered later on,
    # they can also be filtered here.
    if ent in {'score'}:
        continue
    # Retain only entities with at least two characters.
    if len(ent)>1:
        d[ent].extend(row[2])
    if (row[0]+1)%10000==0:
        print(f'\r{row[0]+1}/{len(df)}', end='')
print(f'\r{row[0]+1}/{len(df)}', end='')
# One row per canonical entity: (canonical key, list of original entity strings).
data = [(k, v) for k, v in d.items()]
df_ = pd.DataFrame(data)
# df_.to_csv('./data/replaced-ents-lemma.csv')
df_.to_parquet('./data/replaced-ents-lemma.parquet')

### 3. Similarity calculation.

In [6]:
from collections import defaultdict
import pandas as pd

df = pd.read_parquet('./data/replaced-ents-lemma.parquet')
# Original entity string -> canonical (lemma) entity. If a string is listed
# under several canonical entities, the last row read wins.
ent2lemma = {}
for row in df.itertuples():
    lemma_ent = row[1]
    for original in row[2]:
        ent2lemma[original] = lemma_ent
df_ent = pd.read_parquet("./data/paper-ents.parquet")
d = defaultdict(int)
# Calculate the frequency of entities after the previous processing.
for row in df_ent.itertuples():
    for record in row[2]:
        text = record[1]
        if text in ent2lemma:
            d[ent2lemma[text]] += 1
# Most frequent first; the sort is stable, so ties keep first-seen order.
data = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
df_ = pd.DataFrame(data)
# df_.to_csv('./data/replaced-ents-lemma-count.csv')
df_.to_parquet('./data/replaced-ents-lemma-count.parquet')

In [None]:
from collections import defaultdict
from thefuzz import fuzz
import pandas as pd

df = pd.read_parquet('./data/replaced-ents-lemma-count.parquet')
# Construct a trigram index: 3-character substring -> ids of the entities containing it.
# `ents` keeps (id, entity) pairs in row order.
ents = []
trigram_index = defaultdict(set)
for row in df.itertuples():
    ent_id, ent = row[0], row[1]
    ents.append((ent_id, ent))
    for start in range(len(ent) - 2):
        trigram_index[ent[start:start + 3]].add(ent_id)
        
def data_gen(ent_list):
    """Yield the candidate entity pairs that are worth scoring.

    For each ``(id, entity)`` in ``ent_list`` the candidates are the ids found
    in the module-level ``trigram_index`` under any trigram of the entity.
    A pair is yielded only when the candidate id is strictly greater than the
    entity's id (so each pair appears once) and neither string is more than
    1.5 times as long as the other. Candidate strings are looked up as
    ``ents[candidate_id][1]``.

    Args:
        ent_list: Iterable of ``(id, entity string)`` pairs.

    Yields:
        ``(id, candidate id, entity string, candidate string)`` tuples.
    """
    for ent_id, ent in ent_list:
        size = len(ent)
        # Do not calculate similarity for entities with a length of less than three characters.
        if size < 3:
            continue
        # Ids of the entities that share at least one trigram with this entity.
        candidate_ids = set()
        for start in range(size - 2):
            candidate_ids |= trigram_index[ent[start:start + 3]]
        for other_id in candidate_ids:
            # Only pair an entity with entities whose id is greater than its own.
            if other_id <= ent_id:
                continue
            other = ents[other_id][1]
            other_size = len(other)
            # Filter out the entities whose length differs by more than 50%.
            if size / other_size > 1.5 or other_size / size > 1.5:
                continue
            yield ent_id, other_id, ent, other

def fuzz_sim(item):
    """Score one candidate pair with ``fuzz.ratio`` from ``thefuzz``.

    Args:
        item: Sequence ``(id_a, id_b, text_a, text_b)``.

    Returns:
        ``(id_a, id_b, score)`` where ``score`` is ``fuzz.ratio(text_a, text_b)``
        (a Levenshtein-based similarity; higher means more similar).
    """
    left, right = item[2], item[3]
    score = fuzz.ratio(left, right)
    return item[0], item[1], score

# Score the candidate pairs in batches of 500 entities and append the kept rows
# (id, id, score) to ./data/sims.csv.
n = len(ents)
print(n, ents[:5])
batch_size = 500
for i in range(0, n, batch_size):
    print(f'\r{i}/{n},  {(i)/n*100:.2f}%', end='')
    batch = ents[i:i + batch_size]
    # Retain only the pairs with a similarity score strictly greater than 80.
    data = [scored for scored in map(fuzz_sim, data_gen(batch)) if scored[2] > 80]
    df_ = pd.DataFrame(data)
    # mode='a': every batch is appended to the same file.
    df_.to_csv('./data/sims.csv', index=False, header=False, mode='a')
print(f'\r{n}/{n},  {(n)/n*100:.2f}%', end='')

### 4. Entity clustering.

In [None]:
import pandas as pd
from collections import defaultdict

df = pd.read_parquet("./data/replaced-ents-lemma-count.parquet")
# Entity id -> length of the lemma entity string.
d_len = {row[0]: len(row[1]) for row in df.itertuples()}

df = pd.read_csv("./data/sims.csv", header=None)
d_sim = {}                  # (id, id) -> similarity score
id2ids = defaultdict(set)   # id -> ids it is linked to with a score above 85
clusters = []               # initial clusters (sets of ids)
clu_ids = set()             # ids already placed in a seed cluster
for row in df.itertuples():
    first, second, score = row[1], row[2], row[3]
    d_sim[(first, second)] = score
    # Initialize a cluster with the entities that have a similarity score greater than 95,
    # provided neither of them already belongs to a seed cluster.
    if score > 95 and first not in clu_ids and second not in clu_ids:
        clusters.append({first, second})
        clu_ids.update((first, second))
    # Record the id of every entity that has a similarity score greater than 85 with another entity. The clustering threshold will be set at 85
    # such that only clusters with at least one entity having a similarity score greater than the threshold will have an average similarity score
    # greater than the threshold.
    if score > 85:
        id2ids[first].add(second)
        id2ids[second].add(first)

# Every linked entity that is not part of a seed cluster starts as a cluster of its own.
ids_ = set(id2ids) - clu_ids
clusters.extend({i} for i in ids_)
print(f'all: {len(id2ids)}  clu_ids: {len(clu_ids)}  ids-clu_ids: {len(ids_)}')
print(f'Initialized cluster: {len(clusters)}')
print(clusters[:5])

In [None]:
from itertools import product

def avg_sim(c1, c2):
    """Return the average similarity over all pairs drawn from two clusters.

    For each pair ``(a, b)`` with ``a`` in ``c1`` and ``b`` in ``c2`` the score
    is looked up in the module-level ``d_sim`` under both ``(a, b)`` and
    ``(b, a)`` (missing keys count as 0) and the two lookups are added.

    Returns:
        0 as soon as any pair has a combined score of 0, otherwise the mean
        of the pair scores.
    """
    total = 0
    n_pairs = 0
    for a, b in product(c1, c2):
        score = d_sim.get((a, b), 0) + d_sim.get((b, a), 0)
        # A single unscored pair rules the merge out.
        if score == 0:
            return 0
        total += score
        n_pairs += 1
    return total / n_pairs

# Iterative greedy merging of the initial clusters (at most 15 epochs).
# Each epoch scans `clusters` in order and rebuilds `new_clusters`: a cluster is merged into the first
# already-placed cluster whose average similarity with it exceeds the threshold; otherwise it is placed
# as a new cluster. The threshold starts at 93 and is lowered by 2 after each epoch until it reaches 85.
# NOTE(review): the "number of clusters unchanged" stop check at the bottom runs in every epoch, including
# those where the threshold is still above 85, so an epoch without any merge ends the loop before the lower
# thresholds are tried - confirm this is intended.
threshold = 93
for i in range(15):
    print(len(clusters))
    new_clusters = []
    # Entity id -> positions in new_clusters of the clusters placed so far in this epoch that contain it.
    entid2cluid = defaultdict(set)
    for m, cl in enumerate(clusters):
        # Length of the shortest entity in the cluster.
        l = min([d_len[idx] for idx in cl])
        # The threshold must be no less than 90 when the length of entities is no more than 5.
        t = 90 if l<6 and threshold<90 else threshold
        # Entity ids with similarity greater than 85 to the entities within the cluster.
        filter_ids = set()
        for j in cl:
            filter_ids |= id2ids.get(j, set())
        # Retrieve the clusters that contain these entities with similarity greater than 85, as only the clusters that include these entities
        # can possibly have an average similarity greater than the threshold of the given cluster.
        cluid_set = set()
        for idx in filter_ids:
            cluid_set |= entid2cluid.get(idx, set())
        # If the set of cluid_set is empty, then add the cluster as a new cluster.
        if not cluid_set:
            for j in cl:
                entid2cluid[j].add(len(new_clusters))
            new_clusters.append(cl)
            continue
        # Candidates are tried in set iteration order; the first one above the threshold wins.
        for cluid in cluid_set:
            # Calculate the average similarity between two clusters.
            avg_s = avg_sim(cl, new_clusters[cluid])
            # If the similarity exceeds a certain threshold, merge the clusters. Otherwise, create a new cluster.
            if avg_s>t:
                for j in cl:
                    entid2cluid[j].add(cluid)
                # The union builds a new set, so the sets still referenced by `clusters` are not mutated.
                new_clusters[cluid] = new_clusters[cluid] | cl
                break
        else:
            # No candidate cluster was similar enough: keep the cluster as its own entry.
            for j in cl:
                entid2cluid[j].add(len(new_clusters))
            new_clusters.append(cl)
        if (m+1)%10000==0:
            print(f'\repoch:{i+1}, {m+1}/{len(clusters)}', end='')
    print(f'\repoch:{i+1}, {m+1}/{len(clusters)}, threshold:{threshold}')
    # Decrease the threshold by 2 after each epoch until it reaches 85 and no longer decreases.
    if threshold>85:
        threshold -= 2
    # Terminate the loop when the number of clusters no longer changes.
    # `new_clusters` holds the result of the last epoch either way.
    if len(new_clusters)==len(clusters):
        break
    else:
        clusters = new_clusters

### 5. Find the original entities corresponding to each cluster.

In [None]:
# Map every cluster of lemma-entity ids back to the original entity strings.
# Both inputs are read from ./data/, which is where the earlier steps write them.
df_ents_lemma = pd.read_parquet("./data/replaced-ents-lemma.parquet")
# Lemma entities and their corresponding original entities.
d = {row[1]:list(row[2]) for row in df_ents_lemma.itertuples()}
df_count = pd.read_parquet("./data/replaced-ents-lemma-count.parquet")
# Lemma-entity id (row index of the count table) -> lemma entity.
id2lemma_ent = {row[0]:row[1] for row in df_count.itertuples()}
clu_ents, clustered_ids = [], set()
for i, ncl in enumerate(new_clusters):
    ents = []
    for j in ncl:
        # Find the original entity corresponding to the lemma_ent id.
        ents.extend(d[id2lemma_ent[j]])
        # Record the lemma_ent ids that were clustered.
        clustered_ids.add(j)
    clu_ents.append(ents)
    if (i+1)%10000==0:
        print(f'\r{i+1}/{len(new_clusters)}', end='')
# Final progress line; uses the length so it is also correct when there are no clusters.
print(f'\r{len(new_clusters)}/{len(new_clusters)}')

# Take the lemma_ent that was not included in clustering and form it into a cluster by itself. 
# Then, find the corresponding original entity for this lemma_ent.
ids_ = set(d_len.keys())-clustered_ids
for i, j in enumerate(ids_):
    clu_ents.append(d[id2lemma_ent[j]])
    if (i+1)%10000==0:
        print(f'\r{i+1}/{len(ids_)}', end='')
print(f'\r{len(ids_)}/{len(ids_)}')
    
from collections import defaultdict
# Rank the clusters by the total frequency of their original entities and save the result.
# The entity table is read from ./data/, the same file the earlier steps use.
df = pd.read_parquet("./data/paper-ents.parquet")
# Compute the frequency for each original entity.
ent_count = defaultdict(int)
for row in df.itertuples():
    for e in row[2]:
        ent_count[e[1]] += 1
data = []
for cluster in clu_ents:
    # Sort the original entities in the cluster by their frequency.
    ents = sorted(cluster, key=lambda x:ent_count[x], reverse=True)
    # Cluster size = summed frequency of its entities.
    n = sum(ent_count[x] for x in ents)
    data.append([n, ents])
# Largest clusters first; the position in this order becomes the entity id.
data = sorted(data, key=lambda x:x[0], reverse=True)
data = [[ent_id, item[0], item[1]] for ent_id, item in enumerate(data)]
df_ = pd.DataFrame(data, columns=['ent_id', 'num', 'ents'])
# df_.to_csv("./data/normalized-ents.csv")
df_.to_parquet('./data/normalized-ents.parquet')