### 1. Obtain the year of first occurrence for each entity, its category, and the corresponding entity ids for each paper.

In [None]:
import re
import json
import pandas as pd
from collections import defaultdict
from pathlib import Path


# Build a lookup from every surface form of a normalized entity to the value in
# the first data column of its row (presumably the entity id - confirm against
# the parquet schema).
df = pd.read_parquet("./data/normalized-ents.parquet")
# Keep only the rows whose `num` value is at least 5.
df = df[df.num >= 5]
print(len(df))
ent2id = {}
for row in df.itertuples():
    # Every string in `row.ents` points at row[1]; when the same string occurs
    # in several rows, the last row wins.
    ent2id.update(dict.fromkeys(row.ents, row[1]))
print(len(ent2id))

df_paper_ents = pd.read_parquet("./data/paper-ents.parquet")
# ent_year_count[ent_id][year] -> number of mentions of that entity in papers of that year.
ent_year_count = defaultdict(dict)
for row in df_paper_ents.itertuples():
    # Obtain the year through the paper id: the first run of digits found before the first '-'.
    # NOTE(review): a two-digit year is always expanded to 20xx; confirm that the data
    # contains no ids whose two-digit year belongs to the 1900s.
    y = re.findall('[0-9]+', row[1].split('-')[0])[0]
    year = y if len(y)==4 else f'20{y}'
    year = int(year)
    for e in row[2]:
        # Entities not found in the ent2id are filtered out.
        if e[1] not in ent2id: continue
        ent_id = ent2id[e[1]]
        # Count the frequency of entity occurrence each year.
        ent_year_count[ent_id][year] = ent_year_count[ent_id].get(year, 0) + 1
    print(f'\r{row[0]+1}/{len(df_paper_ents)}', end='')
print(f'\r{row[0]+1}/{len(df_paper_ents)}')

# Determine the first-appearance year of every entity.
# If a new technology entity appears for the first time in NLP, it is likely to be mentioned
# multiple times in papers. If its frequency is too low in a certain year, it could be due to
# an error in entity recognition, so only years with at least 5 mentions are considered.
ent2year = {}
for ent_id, d in ent_year_count.items():
    frequent_years = [k for k, v in d.items() if v>=5]
    if not frequent_years:
        # No year reaches 5 mentions: the entity gets no first-appearance year.
        # (Previously this case raised ValueError from min() for entities with more
        # than 100 mentions in total.)
        continue
    if sum(d.values())>100:
        # For entities with a total frequency greater than 100, prefer the earliest year
        # that has at least 5 mentions and is directly followed by another such year.
        consecutive_years = [k for k in frequent_years if d.get(k+1, 0)>=5]
        if consecutive_years:
            ent2year[ent_id] = min(consecutive_years)
            continue
    # Otherwise (total frequency of at most 100, or no two consecutive frequent years),
    # use the earliest year with at least 5 mentions.
    ent2year[ent_id] = min(frequent_years)

print(len(ent2year))
with open(r"./data/ent-first-year.txt", 'w', encoding='utf8') as f:
    json.dump(ent2year, f, indent=4)
    
# ent_type[ent_id][type] -> how often the entity was predicted with that type.
# pid_ents[paper_id]     -> set of entity ids kept for that paper.
ent_type, pid_ents = defaultdict(dict), {}
for row in df_paper_ents.itertuples():
    # Obtain the year through the paper id
    y = re.findall('[0-9]+', row[1].split('-')[0])[0]
    year = y if len(y)==4 else f'20{y}'
    year = int(year)
    ents = set()
    for e in row[2]:
        # Skip entities that are unknown to ent2id or that have no first-appearance year.
        if ent2id.get(e[1], 'None') not in ent2year: continue
        ent_id = ent2id[e[1]]
        # If the publication date of a paper is earlier than the determined first appearance year of an entity, which is based on its frequency
        # of occurrence, then the entity should be filtered out.
        if year<ent2year[ent_id]: continue
        ents.add(ent_id)
        # The predicted count of each entity type for an entity.
        ent_type[ent_id][e[0]] = ent_type[ent_id].get(e[0], 0) + 1
    pid_ents[row[1]] = ents
    print(f'\r{row[0]+1}/{len(df_paper_ents)}', end='')
# The entity type is determined by the most frequently predicted type
# (on a tie, the type that was seen first wins).
ent2type = {k: max(v, key=v.get) for k, v in ent_type.items()}
# Written next to the other artefacts in ./data/, which is where the
# high-impact-entity step reads "ent-type.txt" from.
with open("./data/ent-type.txt", 'w', encoding='utf8') as f:
    json.dump(ent2type, f, indent=4)

# Load the paper_id -> conference mapping.
with open("./data/pid2conf.txt", "rb") as f:
    pid2conf = json.load(f)

# One record per paper: id, conference, publication year and the kept entity ids.
data = []
for row in df_paper_ents.itertuples():
    pid = row[1]
    # The year is the first run of digits before the first '-' of the paper id;
    # anything that is not four digits long is prefixed with '20'.
    digits = re.findall('[0-9]+', pid.split('-')[0])[0]
    if len(digits) != 4:
        digits = f'20{digits}'
    data.append([pid, pid2conf[pid], int(digits), pid_ents[pid]])

df_ = pd.DataFrame(data, columns=['id', 'conference', 'year', 'ents'])
df_.to_parquet("./data/paper-ent-ids.parquet")
# df_.to_csv("./data/paper-ent-ids.csv")

### 2. z-score calculation

In [None]:
import json
import pandas as pd
from collections import defaultdict
from itertools import combinations

df = pd.read_parquet(r"./data/paper-ent-ids.parquet")

# d[entity] -> number of papers whose entity list contains it.
d = defaultdict(int)
for paper_ents in df.ents:
    for e in paper_ents:
        d[e] += 1

# Entities that appear in fewer than three papers.
filter_ids = {ent for ent, n_papers in d.items() if n_papers < 3}

# year_net[year]['a-b'] -> number of papers of that year in which a and b co-occur,
# i.e. one co-occurrence network of entities per year.
year_ids, year_net = defaultdict(set), defaultdict(dict)
for row in df.itertuples():
    # Drop the rare entities, then enumerate every unordered pair in sorted order
    # so that a pair always produces the same key.
    kept = sorted(set(row.ents) - filter_ids)
    for left, right in combinations(kept, 2):
        pair_key = f'{left}-{right}'
        yearly = year_net[row.year]
        yearly[pair_key] = yearly.get(pair_key, 0) + 1
    print(f'\rbuild co_occurrence network: {row[0]+1}/{len(df)}', end='')
print(f'\rbuild co_occurrence network: {row[0]+1}/{len(df)}')

# z_dict[entity][year] -> z-score of the entity's accumulated co-occurrence weight in that year.
z_dict = defaultdict(dict)
for year in range(2000, 2023):
    # The co-occurrence network of entities for that year. `.get` is used so that a year
    # without any co-occurrence does not insert an empty network into year_net.
    net = year_net.get(year, {})
    ent_w = defaultdict(int)
    for k, w in net.items():
        e1, e2 = k.split('-')
        # Accumulate the weight between each entity and other entities.
        ent_w[e1] += w
        ent_w[e2] += w
    # A year without entities has no mean/deviation, so it is skipped instead of
    # dividing by zero.
    if ent_w:
        # Accumulated weight list for all entities
        w_list = list(ent_w.values())
        # average value
        avg_w = sum(w_list)/len(w_list)
        # Population variance as E[w^2] - E[w]^2, clamped at 0 so that floating-point
        # rounding can never produce a negative value (and hence a complex root).
        variance = max(sum(w**2 for w in w_list)/len(w_list) - avg_w**2, 0.0)
        # standard deviation
        std = variance**0.5
        # When every entity has the same weight the z-score is undefined (0/0);
        # no score is recorded for that year.
        if std > 0:
            for e, w in ent_w.items():
                # z-score of each entity in that year
                z_dict[e][year] = round((w-avg_w)/std, 4)
    print(f'\rcompute z_score: {year}', end='')
with open("./data/z_dict.txt", 'w', encoding='utf8') as f:
    json.dump(z_dict, f, indent=4)

### 3. High-impact new entities

In [4]:
import json
import pandas as pd

# Load the artefacts written by the previous steps.
with open("./data/ent-type.txt", 'rb') as f:
    id2type = json.load(f)
with open("./data/ent-first-year.txt", 'rb') as f:
    id2year = json.load(f)
df = pd.read_parquet("./data/normalized-ents.parquet")
df = df[df.num >= 5]
# ent_id -> the `ents` value of that row (shown as the entity cluster in the output).
id2ents = {}
for row in df.itertuples():
    id2ents[row.ent_id] = row.ents
with open("./data/z_dict.txt", 'rb') as f:
    z_dict = json.load(f)

# Starting from 2001, any new entity with the largest z-score exceeding 2.5 will be
# considered a high-impact entity; entities whose first year is 2000 are left out.
data = []
for ent_key, yearly_z in z_dict.items():
    first_year = id2year[ent_key]
    if first_year != 2000:
        peak_z = max(yearly_z.values())
        if peak_z > 2.5:
            # The JSON keys are strings, while id2ents is keyed by the parquet's ent_id values.
            cluster = list(id2ents[int(ent_key)])
            data.append([ent_key, id2type[ent_key], peak_z, first_year, yearly_z, cluster])
# Highest peak z-score first.
data.sort(key=lambda record: record[2], reverse=True)
df_ = pd.DataFrame(data, columns=['ent_id', 'ent_type', 'max_z-score', 'first_year', 'z-score_dict', 'ent_cluster'])
df_.to_csv("./data/top-ents.csv")

In [12]:
import pandas as pd
# Reload the exported table of high-impact entities and preview its first rows.
top_ents_csv = r"./data/top-ents.csv"
df = pd.read_csv(top_ents_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,ent_id,ent_type,max_z-score,first_year,z-score_dict,ent_cluster
0,0,1,Method,43.3138,2019,"{'2019': 14.3712, '2020': 34.6266, '2021': 43....","['BERT', 'BERT', 'BERT model', 'BERT-base', 'B..."
1,1,7,Method,34.6696,2018,"{'2018': 3.0913, '2019': 13.3371, '2020': 26.3...","['Transformer', 'transformer', 'Transformers',..."
2,2,6,Method,28.8231,2014,"{'2014': -0.0454, '2015': 3.329, '2016': 15.10...","['LSTM', 'LSTMs', 'LSTM model', 'LSTM models',..."
3,3,11,Method,26.2604,2006,"{'2006': -0.1572, '2007': -0.1271, '2008': -0....","['attention', 'attention mechanism', 'attentio..."
4,4,67,Method,20.3561,2016,"{'2016': 2.886, '2017': 9.4933, '2018': 14.846...","['Adam', 'Adam optimizer', 'ADAM', 'Adam algor..."


In [13]:
# Report how many high-impact entities there are and how they are distributed
# over the values of the third column (the entity type in the table shown above).
from collections import Counter
print(len(df))
Counter(df.iloc[:, 2].tolist())

179


Counter({'Method': 130, 'Dataset': 24, 'Metric': 19, 'Tool': 6})

In [15]:
import ast
from collections import defaultdict

# d[entity type][first name of the entity cluster] -> maximum z-score.
# Columns are addressed by position, following the layout of the table shown above:
# row[3] is ent_type, row[4] is max_z-score and row[7] is ent_cluster.
d = defaultdict(dict)
for row in df.itertuples():
    # ent_cluster comes back from the CSV as the text of a Python list;
    # ast.literal_eval parses that literal without executing arbitrary code
    # (unlike eval, which would run whatever the file contains).
    ents = ast.literal_eval(row[7])
    d[row[3]][ents[0]] = row[4]
# The top 5 high-impact entities for each type.
for k, v in d.items():
    v = sorted(v.items(), key=lambda x:x[1], reverse=True)
    print(k)
    print(v[:5])

Method
[('BERT', 43.3138), ('Transformer', 34.6696), ('LSTM', 28.8231), ('attention', 26.2604), ('Adam', 20.3561)]
Dataset
[('Wikipedia', 17.4187), ('MNLI', 6.7163), ('SQuAD', 5.783), ('Twitter', 5.3056), ('SST-2', 5.2605)]
Metric
[('BLEU', 15.9303), ('cross-entropy', 13.1292), ('ROUGE', 7.8905), ('fluency', 6.9009), ('standard deviation', 6.1762)]
Tool
[('PyTorch', 6.1565), ('Moses', 5.3327), ('GIZA++', 5.2089), ('TensorFlow', 3.563), ('Stanford parser', 3.2967)]
