In [2]:
from pymongo import MongoClient

db = MongoClient('localhost', 27017).get_database("sciso-2024")
pub_refs = db.get_collection("pub_refs")

In [3]:
import json
from tqdm import tqdm


venue_info = {}
cursor = pub_refs.find({"metadata.venue": {"$exists": True, "$ne": None}})

for doc in tqdm(
    cursor,
    total=pub_refs.count_documents({"metadata.venue": {"$exists": True, "$ne": None}}),
):
    venue = doc["metadata"]["venue"]
    if type(venue) == dict:
        if venue["name"] not in venue_info:
            venue_info[venue["name"]] = venue
        elif type(venue_info[venue["name"]]) == str:
            venue_info[venue["name"]] = venue
        for alias in venue.get("alternate_names", []):
            if alias not in venue_info:
                venue_info[alias] = venue
            elif type(venue_info[alias]) == str:
                venue_info[alias] = venue
    elif type(venue) == str:
        if venue not in venue_info:
            venue_info[venue] = venue

print("Found {} venues".format(len(venue_info)))
with open("venue_info.json", "w") as f:
    json.dump(venue_info, f, indent=2)

  0%|          | 0/13281 [00:00<?, ?it/s]

100%|██████████| 13281/13281 [00:00<00:00, 14308.77it/s]

Found 7279 venues





In [4]:
import re
import nltk
import number_parser

nltk.download("stopwords")
nltk.download("punkt")
stop_words = set(nltk.corpus.stopwords.words("english"))
stop_words.update(
    [
        "proceedings",
        "workshop",
        # "conference",
        # "international",
        # "symposium",
        # "journal",
        # "ieee",
        # "usenix",
        # "acm",
        # "association",
        "th",
        "st",
        "nd",
        "rd",
        "volume",
    ]
)
lemmatizer = nltk.stem.WordNetLemmatizer()


def clean_venue_name(venue: str):
    venue = number_parser.parse(venue)
    venue = (
        venue.replace(".", "")
        .replace("&", "and")
        .replace("'", "")
        .replace("/", " ")
        .replace("-", " ")
        .replace(",", "")
        .lower()
    )
    venue = re.sub("\\d+", "", venue)
    venue = re.sub("\\(.+?\\)", "", venue)
    venue_parts = venue.split(":", 1)
    if len(venue_parts) > 1:
        venue = (
            venue_parts[0]
            if len(venue_parts[0]) > len(venue_parts[1])
            else venue_parts[1]
        )
    tokens = nltk.word_tokenize(venue)
    return " ".join([lemmatizer.lemmatize(t) for t in tokens if t not in stop_words])


clean_venue_name("Information Hiding and Multimedia Security Workshop")

[nltk_data] Downloading package stopwords to /home/ruh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ruh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'information hiding multimedia security'

In [5]:
import pandas as pd
valid_venues = {k: v for k, v in venue_info.items() if type(v) == dict}
valid_venues = pd.DataFrame.from_dict(valid_venues, orient="index").reset_index().rename(columns={"index": "raw_name"})
valid_venues["clean_name"] = valid_venues["raw_name"].apply(clean_venue_name)
valid_venues[valid_venues["raw_name"] == "International Conference on Image Processing"]

Unnamed: 0,raw_name,id,name,type,alternate_names,issn,url,alternate_urls,alternate_issns,clean_name
245,International Conference on Image Processing,b6369c33-5d70-463c-8e82-95a54efa3cc8,International Conference on Information Photonics,conference,"[Int Conf Image Process, ICIP, Int Conf Inf Ph...",,,,,international conference image processing


In [6]:
pd.set_option("display.max_colwidth", None)
same_cleaned_same = valid_venues.groupby("clean_name").agg({"id": lambda x: set(x), "raw_name": lambda x: set(x)})
same_cleaned_same[same_cleaned_same["id"].apply(len) > 1]

Unnamed: 0_level_0,id,raw_name
clean_name,Unnamed: 1_level_1,Unnamed: 2_level_1
cc,"{73f7fe95-b68b-468f-b7ba-3013ca879e50, 41c21288-abea-4a71-98f6-5722d33e9edc}","{CcS, CC, CCS}"
ci,"{03e270f3-9983-44e1-9d91-754044085687, bce97c16-ce21-4e56-ab0e-0060085c0d85}","{CIS, CI}"
ecosystem,"{72cc6474-a775-4c22-823f-3684c7ef79bf, 1874d943-7e14-460b-a52f-895ceacad37c}","{One Ecosystem, Ecosystems}"
icit,"{62ece4be-f643-40a5-af47-abedd995a7cf, 6c1a00cb-4b59-44d6-a887-bfd7c5b3768f}","{ICIT, IC2IT}"
international conference data engineering,"{f4d9ff4f-5eeb-4aaa-a916-8246dda89fad, 764e3630-ddac-4c21-af4b-9d32ffef082e}","{Proceedings / International Conference on Data Engineering, International Conference on Data Engineering}"
international conference information technology,"{e4272e2a-186f-4774-aedc-5a095c25990b, 67128bf0-cf99-46bd-8915-1d9a05820d0a}","{International Conference on Information Technology, International Conference on Information Technologies, Proceedings of the International Conference on Information Technology}"
international conference parallel processing,"{29df4b17-9a16-4a4c-94a6-002f52e628b4, 29d9d54d-04bd-4927-ba16-24662f2ac3da}","{Proceedings of the International Conference on Parallel Processing, International Conference on Parallel Processing}"
journal experimental psychology,"{ba388b36-981e-4c1b-8048-464cdaa9c9fc, 227ccdaf-ba32-464f-8d97-064c05b56437}","{Journal of Experimental Psychology: Learning, Memory and Cognition, Journal of Experimental Psychology: General}"
n,"{71826259-dbc2-44dc-9ffa-abf8abd1514e, de542ef7-754d-4bd9-a2dd-47897b7cd339}","{Workshop on ns-3, NSS, Workshop ns-3}"
sc,"{048c4e4e-ad38-42be-8e06-4486715e41e0, 7ed86435-f510-45fe-b582-c212782023aa}","{SC, SCSS}"


In [7]:
cleaned_name_to_info = {}
for row in valid_venues.itertuples():
    if row.clean_name not in cleaned_name_to_info:
        cleaned_name_to_info[row.clean_name] = row._asdict()
cleaned_name_to_info

{'journal functional programming': {'Index': 0,
  'raw_name': 'Journal of functional programming',
  'id': '58f55ec2-1f5a-4e79-af2a-0046ee1dcd2a',
  'name': 'Journal of functional programming',
  'type': 'journal',
  'alternate_names': ['J funct program',
   'Journal of Functional Programming',
   'J Funct Program'],
  'issn': '0956-7968',
  'url': 'https://www.cambridge.org/core/journals/journal-of-functional-programming',
  'alternate_urls': ['http://journals.cambridge.org/jid_JFP'],
  'alternate_issns': nan,
  'clean_name': 'journal functional programming'},
 'j funct program': {'Index': 1,
  'raw_name': 'J funct program',
  'id': '58f55ec2-1f5a-4e79-af2a-0046ee1dcd2a',
  'name': 'Journal of functional programming',
  'type': 'journal',
  'alternate_names': ['J funct program',
   'Journal of Functional Programming',
   'J Funct Program'],
  'issn': '0956-7968',
  'url': 'https://www.cambridge.org/core/journals/journal-of-functional-programming',
  'alternate_urls': ['http://journals

In [8]:
verified_venues = {}
pending_venues = []
for k, v in venue_info.items():
    if type(v) == dict:
        verified_venues[k] = v
        continue
    cleaned_name = clean_venue_name(k)
    if k == "2009 IEEE 31st International Conference on Software Engineering":
        print(cleaned_name, cleaned_name in cleaned_name_to_info)
    if cleaned_name in cleaned_name_to_info:
        verified_venues[k] = cleaned_name_to_info[cleaned_name]
    else:
        pending_venues.append(k)

ieee international conference software engineering False


In [9]:
len(valid_venues), len(verified_venues), len(pending_venues)

(5915, 6011, 1268)

In [10]:
venue_names = list({clean_venue_name(v) for v in venue_info.keys()})
venue_names[2001]

'cardis'

In [10]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(venue_names, show_progress_bar=True)

Batches:   0%|          | 0/213 [00:00<?, ?it/s]

In [11]:
venue_embeddings = {}
for i, venue in enumerate(venue_names):
    venue_embeddings[venue] = embeddings[i]

In [97]:
from sentence_transformers import util


def get_similarity(v1, v2):
    v1 = clean_venue_name(v1)
    print(v1)
    v2 = clean_venue_name(v2)
    print(v2)
    v1 = venue_embeddings[v1]
    v2 = venue_embeddings[v2]
    return util.pytorch_cos_sim(v1, v2).item()


def find_most_similar_venue(venue: str):
    venue = clean_venue_name(venue)
    embed_venue = venue_embeddings[venue]
    cosine_sim = util.pytorch_cos_sim(embed_venue, embeddings)[0]
    input_index = venue_names.index(venue)
    cosine_sim[input_index] = -1
    most_similar_venues = []
    if len(venue.split()) > 5:
        threshold = 0.87
    else:
        threshold = 0.9
    for i, score in enumerate(cosine_sim):
        if score >= threshold:
            most_similar_venues.append((venue_names[i], score.item()))
    most_similar_venues.sort(key=lambda x: x[1], reverse=True)
    return most_similar_venues

In [98]:
aliases = {}
cluster_nb = 0
for venue in tqdm(pending_venues):
    most_similar_venues = find_most_similar_venue(venue)
    if len(most_similar_venues) == 0:
        continue
    cid = None
    if venue in aliases:
        cid = aliases[venue]
    else:
        for sv, _ in most_similar_venues:
            if sv in aliases:
                cid = aliases[sv]
                break
    if cid is None:
        cid = cluster_nb
        cluster_nb += 1
    aliases[venue] = cid
    for sv, _ in most_similar_venues:
        aliases[sv] = cid
print("Found {} clusters".format(cluster_nb))

100%|██████████| 1277/1277 [00:40<00:00, 31.39it/s]

Found 261 clusters





In [99]:
import pandas as pd

alias_df = pd.DataFrame(aliases.items(), columns=["venue", "cluster"])
alias_df = alias_df.groupby("cluster").agg(set)
alias_df.to_json("venue_aliases.json", indent=2, orient="records")
alias_df.head()

Unnamed: 0_level_0,venue
cluster,Unnamed: 1_level_1
0,"{ieee transaction industrial electronics, ieee transaction industrial informatics, ieee transaction circuit system, ieee transaction computer, ieee transaction power system, ieee transaction consumer electronics, IEEE Transactions on Circuits and Systems, IEEE transactions on industrial electronics (1982. Print)}"
1,"{european journal combinatorics, journal combinatorial theory, electronic journal combinatorics, journal combinatorial optimization, journal combinatorial theory series b, European journal of combinatorics (Print), Journal of combinatorial theory, Journal of Combinatorial Designs, Journal of Combinatorial Theory, Series B, journal combinatorial design, journal combinatorial theory series}"
2,"{international journal new computer architecture application, International journal of computer applications, international journal applied information system, international journal advanced computer science application}"
3,"{acm se, ACM '58, ACM '59}"
4,"{Rep4NLP@ACL, replnlp @ acl}"


In [89]:
get_similarity(
    "2009 IEEE 31st International Conference on Software Engineering",
    "International Conference on Software Engineering",
)

ieee international conference software engineering
international conference software engineering


0.8715152740478516

In [88]:
find_most_similar_venue(
    "2009 IEEE 31st International Conference on Software Engineering"
)

[]

In [17]:
test = "Proceedings of the 2001 IEEE Computer Society Conference on Computer Vision and Pattern Recognition. CVPR 2001"
test = clean_venue_name(test)
aliases[test]

7

In [18]:
[v for v, c in aliases.items() if c == 28]

['Journal of the Royal Statistical Society Series B: Statistical Methodology',
 'journal royal statistical society series c']

In [11]:
with open("checked_aliases.json", "r") as f:
    checked_aliases = json.load(f)
checked_aliases

[{'venue': ['ieee annual symposium foundation computer science',
   '30th Annual Symposium on Foundations of Computer Science',
   '40th Annual Symposium on Foundations of Computer Science (Cat. No.99CB37039)',
   '28th Annual Symposium on Foundations of Computer Science (sfcs 1987)',
   '[ ] annual symposium foundation computer science',
   '18th Annual Symposium on Foundations of Computer Science (sfcs 1977)',
   '17th Annual Symposium on Foundations of Computer Science (sfcs 1976)',
   '20th Annual Symposium on Foundations of Computer Science (sfcs 1979)',
   '21st Annual Symposium on Foundations of Computer Science (sfcs 1980)',
   '26th Annual Symposium on Foundations of Computer Science (sfcs 1985)',
   'Proceedings 39th Annual Symposium on Foundations of Computer Science (Cat. No.98CB36280)',
   '27th Annual Symposium on Foundations of Computer Science (sfcs 1986)',
   '23rd Annual Symposium on Foundations of Computer Science (sfcs 1982)']},
 {'venue': ['lics',
   '[1992] Procee

In [12]:
manual_venues = []
success_fixed = {}
for row in checked_aliases:
    venues = row["venue"]
    if "name" in row:
        for v in venues:
            success_fixed[v] = {"name": row["name"]}
    vinfo = None
    for v in venues:
        v = clean_venue_name(v)
        if v in cleaned_name_to_info:
            temp = cleaned_name_to_info[v]
            if type(temp) == dict:
                vinfo = temp
                break
    if vinfo is None:
        print("No verified venue found for {}".format(venues))
        manual_venues.append(venues)
        continue
    for v in venues:
        success_fixed[v] = vinfo

No verified venue found for ['Journal of Applied Econometrics', 'journal applied econometrics']
No verified venue found for ['econometrics journal', 'Journal of Econometrics', 'The Econometrics Journal', 'journal econometrics']
No verified venue found for ['springer mathematics statistic', 'Springer Proceedings in Mathematics &amp; Statistics']
No verified venue found for ['international symposium industrial electronics', '2012 IEEE International Symposium on Industrial Electronics', '2009 IEEE International Symposium on Industrial Electronics']
No verified venue found for ['Journal of Geophysical Research: Solid Earth', 'Journal of Geophysical Research']
No verified venue found for ['Journal of the Royal Statistical Society Series C: Applied Statistics', 'journal royal statistical society series b']
No verified venue found for ['Computer-Aided Design', 'computer aided design application']
No verified venue found for ['european physical journal c', 'european physical journal', 'The Eur

In [13]:
clean_venue_name("International Symposium on Industrial Electronics")

'international symposium industrial electronics'

In [14]:
cleaned_name_to_info["international symposium industrial electronics"]

KeyError: 'international symposium industrial electronics'

In [15]:
success_fixed["2009 IEEE 31st International Conference on Software Engineering"]

{'Index': 103,
 'raw_name': 'International Conference on Software Engineering',
 'id': 'a36dc29e-4ea1-4567-b0fe-1c06daf8bee8',
 'name': 'International Conference on Software Engineering',
 'type': 'conference',
 'alternate_names': ['Int Conf Softw Eng', 'ICSE'],
 'issn': nan,
 'url': 'http://www.icse-conferences.org/',
 'alternate_urls': nan,
 'alternate_issns': nan,
 'clean_name': 'international conference software engineering'}

In [16]:
openalex_venues = [k for k, v in venue_info.items() if type(v) == str]
print("Found {} OpenAlex venues".format(len(openalex_venues)))

Found 1364 OpenAlex venues


In [17]:
fixed_at_alias = {}
for venue in openalex_venues:
    if venue in fixed_at_alias:
        continue
    if "@" in venue:
        parts = venue.split("@")
        for part in parts:
            if part in venue_info:
                fixed_at_alias[venue] = venue_info[part]
                break
openalex_venues = [k for k in openalex_venues if k not in fixed_at_alias]
print(
    "Found {} fixed aliases; {} to go".format(len(fixed_at_alias), len(openalex_venues))
)

Found 37 fixed aliases; 1327 to go


In [18]:
final_venues = {}
i = 0
for venue, info in venue_info.items():
    if type(info) == dict:
        final_venues[venue] = info
        continue
    cleaned_name = clean_venue_name(venue)
    if cleaned_name in cleaned_name_to_info:
        final_venues[venue] = cleaned_name_to_info[cleaned_name]
    elif venue in success_fixed:
        final_venues[venue] = success_fixed[venue]
    elif cleaned_name in success_fixed:
        final_venues[venue] = success_fixed[cleaned_name]
    elif venue in fixed_at_alias:
        final_venues[venue] = fixed_at_alias[venue]
        i += 1
    else:
        final_venues[venue] = venue
len(final_venues), i

(7279, 37)

In [19]:
with open("venues.json", "w") as f:
    json.dump(final_venues, f, indent=2, default=str)

In [20]:
def trim_venue_name(venue: str):
    trim_stop_words = ["th", "st", "nd", "rd", "volume"]
    venue = number_parser.parse(venue)
    venue = (
        venue.replace(".", "")
        .replace("&", "and")
        .replace("'", "")
        .replace("/", " ")
        .replace("-", " ")
        .replace(",", "")
        .lower()
    )
    venue = re.sub("\\d+", "", venue)
    venue = re.sub("\\(.+?\\)", "", venue)
    tokens = nltk.word_tokenize(venue)
    return " ".join([t for t in tokens if t not in trim_stop_words])

In [21]:
with open("venues.json", "r") as f:
    final_venues = json.load(f)

not_info_venues = {}
for venue, info in final_venues.items():
    if type(info) == dict:
        continue
    trimmed_name = trim_venue_name(venue)
    if trimmed_name not in not_info_venues:
        not_info_venues[trimmed_name] = set()
    not_info_venues[trimmed_name].add(venue)
    if "Symposium on Symbolic and Numeric Algorithms for Scientific Computing".lower() in venue.lower():
        print(trimmed_name, not_info_venues[trimmed_name])
not_info_venues = {k: list(v) for k, v in not_info_venues.items() if len(v) > 1}
with open("not_info_venues.json", "w") as f:
    json.dump(not_info_venues, f, indent=2)


In [22]:
with open("venues.json", "r") as f:
    final_venues = json.load(f)

for venue, info in final_venues.items():
    if type(info) == dict:
        continue
    norm_name = trim_venue_name(venue)
    final_venues[venue] = norm_name.capitalize()

with open("venues.json", "w") as f:
    json.dump(final_venues, f, indent=2)

In [23]:
final_venues["2006 IEEE Symposium on Interactive Ray Tracing"]

'Ieee symposium on interactive ray tracing'