In [None]:
import json
from tqdm import tqdm
from pymongo import MongoClient

db = MongoClient("localhost", 27017).get_database("sciso-2024")
pub_refs = db.get_collection("pub_refs")

with open("venues.json", "r") as f:
    venue_info = json.load(f)


def normalize_venue(doc):
    if raw_venue := doc.get("metadata", {}).get("venue", None):
        if type(raw_venue) == dict:
            if raw_venue.get("name", None):
                updated_venue = venue_info.get(raw_venue.get("name"), None)
                if updated_venue:
                    if type(updated_venue) == dict:
                        doc["metadata"]["normalized_venue"] = updated_venue.get("name", None)
                    else:
                        doc["metadata"]["normalized_venue"] = updated_venue
                else:
                    doc["metadata"]["normalized_venue"] = raw_venue.get("name", None)
            else:
                print(f"Missing name for venue {raw_venue}")
        elif type(raw_venue) == str:
            if raw_venue in venue_info:
                norm_venue = venue_info[raw_venue]
                if type(norm_venue) == dict:
                    norm_venue = norm_venue.get("name", None)
                doc["metadata"]["normalized_venue"] = norm_venue
            else:
                print(f"Missing venue {raw_venue}")
                doc["metadata"]["normalized_venue"] = raw_venue
        else:
            print(f"Unknown type for venue {raw_venue}")
    else:
        print(f"Missing venue for {doc['_id']}")
    return doc


def update_venue():
    cursor = pub_refs.find({"metadata.venue": {"$exists": True, "$ne": None}})
    for doc in tqdm(
        cursor,
        total=pub_refs.count_documents(
            {"metadata.venue": {"$exists": True, "$ne": None}}
        ),
    ):
        pub_refs.update_one({"_id": doc["_id"]}, {"$set": normalize_venue(doc)})


# be careful before running this!!!!! remember to run resolve_conflicts.ipynb after running this!!!!!
# update_venue()

100%|██████████| 13281/13281 [00:05<00:00, 2290.25it/s]


In [None]:
def output_pub_refs():
    with open("pub_refs_24_nor.jsonl", "w") as f:
        cursor = pub_refs.find({})
        for doc in tqdm(cursor, total=pub_refs.count_documents({})):
            f.write(json.dumps(doc, default=str, separators=("," ":")) + "\n")
# output_pub_refs()

100%|██████████| 14914/14914 [00:01<00:00, 7756.15it/s]


In [19]:
import pandas as pd
pubrefs = pd.read_json("../../sciso.jsonl", lines=True)
pubrefs = pubrefs[["PostId", "metadata", "Url", "History", "LinkType", "AddedAt", "AddedBy", "RevisionId", "Source"]]
pubrefs.head()

Unnamed: 0,PostId,metadata,Url,History,LinkType,AddedAt,AddedBy,RevisionId,Source
0,37933,"{'title': 'Asynchronous completion token', 'authors': [{'name': 'Irfan Pyarali', 'id': '1840209'...",http://www.cse.wustl.edu/~schmidt/PDF/ACT.pdf,2,pdf,2008-09-01T13:23:18.830,2010,52547,pdf
1,125285,"{'title': 'Impact of high-intensity negotiated-style interruptions on end-user debugging', 'venu...",ftp://ftp.cs.orst.edu/pub/burnett/jvlc-interruptions-2006.pdf,2,pdf,2008-09-24T03:37:32.107,5113,178094,pdf
2,80736,"{'title': 'A note on the genuine Sieve of Eratosthenes', 'authors': [{'name': 'M. Nykänen', 'id'...",http://www.cs.hmc.edu/~oneill/papers/Sieve-JFP.pdf,2,pdf,2008-09-17T07:15:11.747,9632,114305,pdf
3,133356,"{'title': 'Writing Hygienic Macros in Scheme with Syntax-Case', 'authors': [{'name': 'Kent Dybvi...",http://www.cs.uml.edu/~giam/91.531/Textbooks/RKDybvig.pdf,2,pdf,2008-09-25T13:34:28.060,7851,190988,pdf
4,25248,"{'title': 'A consensus glossary of temporal database concepts', 'authors': [{'name': 'C. Dyreson...",http://www.cs.arizona.edu/~rts/pubs/SIGMODRecordMarch94p52.pdf,2,pdf,2008-08-24T18:11:02.603,2010,33333,pdf


In [20]:
metadata = pd.DataFrame(list(pubrefs["metadata"]))
metadata = metadata[["title", "normalized_venue", "year", "citation_count", "authors", "type", "open_access", "external_ids", "concepts"]].join(pubrefs["PostId"])
metadata["year"] = metadata["year"].astype("Int64")
metadata.head()

Unnamed: 0,title,normalized_venue,year,citation_count,authors,type,open_access,external_ids,concepts,PostId
0,Asynchronous completion token,Addison wesley longman publishing co inc ebooks,1997,8.0,"[{'name': 'Irfan Pyarali', 'id': '1840209'}, {'name': 'Timothy H. Harrison', 'id': '2079983'}, {...",[book],False,"{'s2': '44bb7b1bada2a777a131af91a1fbacc7e0bda541', 'mag': '1577953154', 'corpusid': 267916118, '...","[{'id': 'https://openalex.org/C41008148', 'display_name': 'computer science', 'level': 0, 'score...",37933
1,Impact of high-intensity negotiated-style interruptions on end-user debugging,Journal of visual languages and computing,2005,4.0,,[journal-article],False,{'openalex': 'https://openalex.org/W1987666248'},"[{'id': 'https://openalex.org/C41008148', 'display_name': 'computer science', 'level': 0, 'score...",125285
2,A note on the genuine Sieve of Eratosthenes,Journal of functional programming,2008,44.0,"[{'name': 'M. Nykänen', 'id': '1890836'}]","[JournalArticle, journal-article]",True,"{'s2': 'e6e9eb7f5c4f9d935d2b92b9baa5e293995fef81', 'mag': '2168056765', 'dblp': 'journals/jfp/Ny...","[{'id': 'https://openalex.org/C41008148', 'display_name': 'computer science', 'level': 0, 'score...",80736
3,Writing Hygienic Macros in Scheme with Syntax-Case,,1992,35.0,"[{'name': 'Kent Dybvigdyb', 'id': '2262888616'}]",[journal-article],False,"{'s2': '8705776b20156fe235a7dc5a4fd0a6ff0fcb486a', 'mag': '2134717046', 'corpusid': 7608716, 'op...","[{'id': 'https://openalex.org/C41008148', 'display_name': 'computer science', 'level': 0, 'score...",133356
4,A consensus glossary of temporal database concepts,ACM SIGMOD Conference,1994,495.0,"[{'name': 'C. Dyreson', 'id': '1710939'}, {'name': 'F. Grandi', 'id': '143732420'}, {'name': 'W....",[journal-article],True,"{'s2': '28e57aeb8a9e8cd798c1ee59644a22568f0d666e', 'mag': '1971864291', 'doi': '10.1145/181550.1...","[{'id': 'https://openalex.org/C41008148', 'display_name': 'computer science', 'level': 0, 'score...",25248


In [21]:
venue_grouped = metadata.groupby("normalized_venue").size().reset_index(name="count")
venue_grouped = venue_grouped.sort_values(by="count", ascending=False)
venue_grouped.to_json("venue_grouped.json", orient="records")
venue_grouped.head()

Unnamed: 0,normalized_venue,count
3467,arXiv.org,1831
2616,Neural Information Processing Systems,629
583,Computer Vision and Pattern Recognition,581
1659,International Conference on Learning Representations,562
1660,International Conference on Machine Learning,418


In [11]:
metadata_grouped = metadata.groupby("title").agg(
    {
        "normalized_venue": lambda x: set(x),
        "year": "min",
        "citation_count": "max",
        "open_access": "any",
        "PostId": lambda x: set(x),
    }
).reset_index()
metadata_grouped

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId
0,Neural Network Methods for Natural Language Processing,{Computational Linguistics},2017,549.0,True,{48445189}
1,"""Low-Resource"" Text Classification: A Parameter-Free Classification Method with Compressors",{Annual Meeting of the Association for Computational Linguistics},2023,52.0,True,{76724158}
2,"#ifdef Considered Harmful, or Portability Experience with C News",{Usenix summer},1992,246.0,False,"{489434, 8012461, 21097319, 9294487}"
3,$2.00 Gas! Studying the Effects of a Gas Tax Moratorium,{Journal of Public Economics},2008,162.0,True,{75879031}
4,"$\mu $ RNG: A 300–950 mV, 323 Gbps/W All-Digital Full-Entropy True Random Number Generator in 14...",{IEEE Journal of Solid-State Circuits},2016,83.0,False,{61554191}
...,...,...,...,...,...,...
13282,“One Against One” or “One Against All”: Which One is Better for Handwriting Recognition with SVMs?,"{nan, International Conference on Frontiers in Handwriting Recognition}",2006,272.0,False,"{16728386, 16252899, 16793390}"
13283,"“Search, Show Context, Expand on Demand”: Supporting Large Graph Exploration with Degree-of-Inte...",{IEEE Transactions on Visualization and Computer Graphics},2009,257.0,False,{21893401}
13284,“Should This Loan be Approved or Denied?”: A Large Dataset with Class Assignment Guidelines,{Journal of statistics education},2018,33.0,True,{64403934}
13285,“We Are Here to Help”Who Opens the Gate for Surgeries?,{Tsq transgender studies quarterly},2018,10.0,False,"{5017208, 58591979, 14537012, 5151693}"


In [12]:
pd.set_option("display.max_colwidth", 100)
conflicted_venue_idx = metadata_grouped["normalized_venue"].apply(len) > 1
metadata_grouped[conflicted_venue_idx]

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId
77,A Bootstrap Evaluation of the Effect of Data Splitting on Financial Time Series,"{Ieee trans neural networks, IEEE Transactions on Neural Networks}",1998,106.0,True,{25067048}
186,A Decorated Parallel Coordinate Plot for Categorical Longitudinal Data,"{American Statistician, The american statistician}",2014,18.0,False,"{75517617, 28968342}"
655,A Survey of Text Similarity Approaches,"{International journal of computer applications, nan}",2013,851.0,True,"{44595712, 42657333}"
729,A Unified Approach to Interpreting Model Predictions,"{arXiv.org, Neural Information Processing Systems}",2017,17636.0,True,"{62398529, 69905537, 72096547, 66844388, 60987558, 76481990, 75090662, 72957961, 67488811, 58203..."
871,A fast triangle-triangle intersection test,"{Journal of graphics gpu and game tools, Journal of graphics tools}",1997,438.0,False,"{35400896, 1496260, 29563443, 1496345, 36473309}"
...,...,...,...,...,...,...
12057,The normal law under linear restrictions: simulation and estimation via minimax tilting,"{Journal of the royal statistical society series b, Journal of the royal statistical society ser...",2016,221.0,True,"{68814235, 51218340}"
12079,The quickhull algorithm for convex hulls,"{Toms, ACM Transactions on Mathematical Software}",1996,5269.0,True,"{75396912, 43965579, 66151164, 12333877}"
12673,Using Evolutionary Testing to Find Test Scenarios for Hard to Reproduce Faults,"{International Conference on Software Testing, Verification and Validation Workshops, Internatio...",2010,2.0,False,{25322117}
12973,What matters in differences between life trajectories: a comparative review of sequence dissimil...,"{Journal of the royal statistical society series a, Journal of the royal statistical society}",2015,357.0,True,"{75517617, 77837858, 53281018}"


In [13]:
len(metadata_grouped)

13287

In [14]:
metadata_grouped["normalized_venue"] = metadata_grouped["normalized_venue"].apply(
    lambda x: list(x)[0] if len(x) == 1 else x
)
metadata_grouped["reference_count"] = metadata_grouped["PostId"].apply(len)
metadata_grouped.sort_values("reference_count", ascending=False).head(11)

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId,reference_count
2404,Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift,International Conference on Machine Learning,2015,41504.0,True,"{44475009, 69508740, 46181636, 40314502, 41416583, 39258253, 53335053, 58256783, 45731612, 63020...",67
2196,Attention is All you Need,Neural Information Processing Systems,2017,110977.0,True,"{62575360, 59377538, 62177796, 56930821, 63178631, 63298183, 71400202, 78645642, 69129741, 65343...",62
12504,U-Net: Convolutional Networks for Biomedical Image Segmentation,International Conference on Medical Image Computing and Computer-Assisted Intervention,2015,68758.0,True,"{65635714, 68121220, 71277061, 59237001, 65604874, 40640906, 65921292, 64972431, 56764048, 53963...",57
6357,"IEEE Standard for SystemVerilog--Unified Hardware Design, Specification, and Verification Language",,2017,41.0,False,"{77293184, 77502338, 69790851, 64085508, 77344521, 57198731, 62908941, 67739151, 77528852, 76447...",57
2355,BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding,North American Chapter of the Association for Computational Linguistics,2018,84697.0,True,"{64588418, 78201090, 71965576, 56129165, 56828558, 55114128, 55982099, 54773912, 55066010, 60493...",51
4349,Dropout: a simple way to prevent neural networks from overfitting,Journal of machine learning research,2014,38062.0,False,"{41311232, 44131458, 44036610, 37210500, 54170758, 42696199, 44109320, 44224010, 50843787, 57909...",46
9661,Purely Functional Data Structures,,1998,598.0,False,"{20205191, 5573640, 1602574, 8773006, 528528, 12393104, 13236370, 8237847, 2279320, 2223258, 227...",45
3846,Deep Residual Learning for Image Recognition,Computer Vision and Pattern Recognition,2015,177579.0,True,"{62024327, 65604874, 46904460, 68258188, 61841938, 58213909, 50390549, 68845078, 54868504, 43355...",43
12571,Understanding the difficulty of training deep feedforward neural networks,International Conference on Artificial Intelligence and Statistics,2010,16786.0,False,"{45521025, 45380994, 37372682, 42564875, 47987212, 72603152, 34642719, 43489697, 41565985, 46360...",42
1393,Adam: A Method for Stochastic Optimization,International Conference on Learning Representations,2014,142334.0,True,"{60872321, 41450882, 49977348, 52474500, 60667655, 53929352, 56440329, 56115600, 76281366, 39765...",41


In [15]:
metadata_grouped[
    metadata_grouped["title"]
    == "MapReduce: simplified data processing on large clusters"
]

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId,reference_count
7591,MapReduce: simplified data processing on large clusters,"{Cacm, Communications of the ACM}",2004,25616.0,True,"{2818613, 26985557, 12901694, 12896855}",4


In [16]:
metadata_grouped[metadata_grouped["title"] == "A Future-Adaptable Password Scheme"]

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId,reference_count
266,A Future-Adaptable Password Scheme,USENIX Annual Technical Conference,,212.0,False,"{67114272, 30029026, 36080835, 1645190, 53692391, 11101384, 4984044, 39875161, 4772058, 33912028...",11


In [17]:
metadata_grouped.sort_values("citation_count", ascending=False).head(10)

Unnamed: 0,title,normalized_venue,year,citation_count,open_access,PostId,reference_count
9631,Protein measurement with the Folin phenol reagent.,Journal of Biological Chemistry,1951,318671.0,True,"{55592324, 14608444, 58489165}",3
9011,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,Journal of Biological Chemistry,1951,318671.0,True,{24634890},1
3846,Deep Residual Learning for Image Recognition,Computer Vision and Pattern Recognition,2015,177579.0,True,"{62024327, 65604874, 46904460, 68258188, 61841938, 58213909, 50390549, 68845078, 54868504, 43355...",43
1393,Adam: A Method for Stochastic Optimization,International Conference on Learning Representations,2014,142334.0,True,"{60872321, 41450882, 49977348, 52474500, 60667655, 53929352, 56440329, 56115600, 76281366, 39765...",41
6464,ImageNet classification with deep convolutional neural networks,"{Neural Information Processing Systems, Communications of the ACM}",2012,113943.0,True,"{61167744, 65405953, 50193801, 40060949, 41841173, 51015834, 28232235, 48493868, 76746931, 15578...",25
2196,Attention is All you Need,Neural Information Processing Systems,2017,110977.0,True,"{62575360, 59377538, 62177796, 56930821, 63178631, 63298183, 71400202, 78645642, 69129741, 65343...",62
12798,Very Deep Convolutional Networks for Large-Scale Image Recognition,International Conference on Learning Representations,2014,94664.0,True,"{54997632, 61321247, 60812966, 65612711, 51854502, 54295849, 78408109, 62370995, 39095735, 42434...",26
9838,Random Forests,"{Springer ebooks, Machine-mediated learning}",2001,91631.0,True,"{70360996, 11501381, 65530093, 24663120, 58937426}",5
12044,"The moderator-mediator variable distinction in social psychological research: conceptual, strate...",Journal of Personality and Social Psychology,1986,87911.0,True,{1219480},1
2401,Basic local alignment search tool.,Journal of Molecular Biology,1990,87795.0,True,{1432570},1


In [18]:
def describe_stats(x):
    num_list = []
    for i in x:
        try:
            num_list.append(int(i))
        except:
            pass
    return (
        pd.Series(num_list)
        .describe(include=["mean", "std", "min", "25%", "50%", "75%", "max"])
        .to_dict()
    )


venue_citation_year = metadata_grouped.groupby("normalized_venue").agg(
    {"citation_count": list, "year": list}
)
venue_citation_year["count"] = metadata_grouped.groupby("normalized_venue").size()
venue_citation_year = venue_citation_year.sort_values(
    "count", ascending=False
).reset_index()
venue_citation_year["citation_stat"] = venue_citation_year["citation_count"].apply(
    describe_stats
)
venue_citation_year["year_stat"] = venue_citation_year["year"].apply(describe_stats)
venue_citation_year[
    ["normalized_venue", "count", "citation_stat", "year_stat"]
].to_json("venue_citation_year_stat.json", orient="records")
venue_citation_year.head()

TypeError: unhashable type: 'set'

In [15]:
swarm_data = []
for idx, row in venue_citation_year.iterrows():
    swarm_data.append({
        "id": row["normalized_venue"],
        "group": "",
        "num_ref": row["count"],
        "h5i": "",
    })
    if idx == 100:
        break
json.dump(swarm_data, open("venue_swarm.json", "w"), indent=0)