In [2]:
import psycopg2

In [3]:
import os

# Open the database connection shared by the cells below. The defaults match
# the original development setup; each one can be overridden through the
# standard libpq environment variables so the notebook runs elsewhere without
# editing this cell (and without committing credentials).
conn = psycopg2.connect(user=os.environ.get("PGUSER", "lvs215"),
                        password=os.environ.get("PGPASSWORD", ""),
                        host=os.environ.get("PGHOST", "127.0.0.1"),
                        port=os.environ.get("PGPORT", "12777"),
                        database=os.environ.get("PGDATABASE", "aip"))

In [15]:
# This cell prints the percentage of publications that have at least one
# author linked to them through author_paper_pairs.
with conn.cursor() as cursor:
    # First column: total number of publications.
    # Second column: number of distinct publications that appear in
    # author_paper_pairs (joined on paper_id).
    query = """
SELECT count(id), (SELECT COUNT(DISTINCT(id))
    FROM author_paper_pairs, publications 
    WHERE publications.id = author_paper_pairs.paper_id
) AS number_papers_authors
FROM publications
"""
    cursor.execute(query)

    total_papers, papers_with_authors = cursor.fetchone()
    if total_papers:
        print(
            "Amount of papers that have at least one author assigned {:.2f}%".format(
                (papers_with_authors / total_papers) * 100))
    else:
        # An empty publications table would otherwise raise ZeroDivisionError.
        print("No publications found; cannot compute author coverage.")

Amount of papers that have at least one author assigned 71.21810595898887


In [16]:
# This cell prints the percentage of publications for which a citation count
# is available (n_citations >= 0; rows where it is NULL do not match).
with conn.cursor() as cursor:
    # First column: total number of publications.
    # Second column: number of distinct publications with n_citations >= 0.
    query = """
SELECT count(id), (SELECT COUNT(DISTINCT(id))
    FROM publications 
    WHERE publications.n_citations >= 0
) AS number_articles_citation_info
FROM publications
"""
    cursor.execute(query)

    total_papers, papers_with_citations = cursor.fetchone()
    if total_papers:
        print("Amount of papers that have citation information {:.2f}%".format(
            (papers_with_citations / total_papers) * 100))
    else:
        # An empty publications table would otherwise raise ZeroDivisionError.
        print("No publications found; cannot compute citation coverage.")

Amount of papers that have citation information 90.45


In [None]:
# TODO: check the number of article entries in DBLP, Semantic Scholar, and AMiner.
def wccount(filename):
    """Return the number of lines in ``filename`` as reported by ``wc -l``.

    Args:
        filename: Path of the file to count; passed to ``wc`` as a single
            argument (no shell is involved).

    Returns:
        The line count as an ``int``.

    Raises:
        ValueError: If the first token of the ``wc`` output is not an integer.
            Because stderr is merged into stdout, this is also what happens
            when ``wc`` fails (e.g. the file does not exist).
    """
    # Imported locally because none of the visible notebook cells import it.
    import subprocess

    out = subprocess.Popen(['wc', '-l', filename],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT
                           ).communicate()[0]
    # Some wc implementations (BSD/macOS) left-pad the count with spaces, so
    # split on arbitrary whitespace instead of partitioning on the first space.
    fields = out.split()
    if not fields:
        raise ValueError("wc produced no output for {!r}".format(filename))
    return int(fields[0])



In [None]:
import os
import re
from os.path import isfile

from datetime import datetime
from venue_mapper.venue_mapper import VenueMapper
import sys

sys.path.append("..")
from util import iterload_file_lines, iterload_file_lines_gzip
from lxml import etree


def _is_top_level_record(element):
    """Return True when ``element`` is a direct child of the XML document root."""
    parent = element.getparent()
    return parent is not None and parent.getparent() is None


def _ratio(total, matched):
    """Return ``total / matched``, or NaN when ``matched`` is zero."""
    return total / matched if matched else float("nan")


def run(file_locations="/var/scratch/lvs215/aip_data"):
    """Count, per source, how many entries exist and how many map to a known venue.

    Walks ``file_locations`` recursively and inspects three kinds of files:

    * DBLP XML dumps (path matches ``.*dblp[\\w-]+\\.xml``),
    * Semantic Scholar dumps (path contains ``s2-corpus``),
    * AMiner dumps (path contains ``aminer_papers``).

    An entry is counted as a "system article" when
    ``VenueMapper.get_abbreviation`` returns something other than ``None``
    for its venue. The totals are written to a timestamped CSV file in the
    current working directory and also printed.

    Args:
        file_locations: Root directory holding the source dumps. Defaults to
            the location used when this notebook was written.
    """
    total_dblp = 0
    total_aminer = 0
    total_semantic_scholar = 0
    system_articles_dblp = 0
    system_articles_aminer = 0
    system_articles_semantic_scholar = 0
    venue_mapper = VenueMapper()

    # Visit every file below the root. Skip the compressed sources if they are
    # still lingering around.
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            file_path = os.path.join(path, name)
            if not isfile(file_path) or name.endswith(("zip", "tar")):
                continue

            if re.match(r".*dblp[\w-]+\.xml", file_path):
                for event, element in etree.iterparse(file_path,
                                                      load_dtd=True,
                                                      dtd_validation=True):
                    # iterparse emits an "end" event for *every* element
                    # (title, author, year, ...). Only direct children of the
                    # root are publication records, so count just those.
                    if not _is_top_level_record(element):
                        continue

                    total_dblp += 1
                    # Prefer the booktitle; fall back to the journal.
                    venue = element.find('booktitle')
                    if venue is None:
                        venue = element.find('journal')

                    if venue is None or venue.text is None:
                        continue

                    if venue_mapper.get_abbreviation(
                            str(venue.text)) is not None:
                        system_articles_dblp += 1

            elif "s2-corpus" in file_path:
                file_iterator_func = iterload_file_lines_gzip if file_path.endswith(
                    "gz") else iterload_file_lines
                # NOTE(review): publications are presumably dicts parsed from
                # JSON lines, possibly None for unparsable lines - confirm in util.
                for publication in file_iterator_func(file_path):
                    total_semantic_scholar += 1
                    if publication is None or "venue" not in publication:
                        continue

                    venue_string = str(publication['venue'])
                    if len(venue_string) == 0:
                        continue

                    if venue_mapper.get_abbreviation(
                            venue_string) is not None:
                        system_articles_semantic_scholar += 1

            elif "aminer_papers" in file_path:
                file_iterator_func = iterload_file_lines_gzip if file_path.endswith(
                    "gz") else iterload_file_lines
                for publication in file_iterator_func(file_path):
                    total_aminer += 1
                    if publication is None or 'venue' not in publication:
                        continue

                    venue_string = publication['venue']
                    # The venue may be a nested object; use its "raw" field
                    # when present.
                    if isinstance(venue_string,
                                  dict) and "raw" in venue_string:
                        venue_string = venue_string["raw"]

                    if venue_mapper.get_abbreviation(
                            venue_string) is not None:
                        system_articles_aminer += 1

    date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    with open("aip_content_per_source_{}.csv".format(date_time), "w") as file1:
        file1.write("dblp,{},{}\n".format(total_dblp, system_articles_dblp))
        file1.write(
            "aminer,{},{}\n".format(total_aminer, system_articles_aminer))
        file1.write("semantic_scholar,{},{}\n".format(total_semantic_scholar,
                                                      system_articles_semantic_scholar))

    # _ratio guards against sources that were absent or had no mapped
    # articles, which would otherwise raise ZeroDivisionError here.
    print("DBLP: {}/{}: {:.2f}".format(total_dblp, system_articles_dblp,
                                       _ratio(total_dblp,
                                              system_articles_dblp)))
    print("Aminer: {}/{}: {:.2f}".format(total_aminer, system_articles_aminer,
                                         _ratio(total_aminer,
                                                system_articles_aminer)))
    print("Semantic Scholar: {}/{}: {:.2f}".format(
        total_semantic_scholar,
        system_articles_semantic_scholar,
        _ratio(total_semantic_scholar, system_articles_semantic_scholar)))


# Start the scan only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    run()


Exception ignored in: <generator object iterload_file_lines_gzip at 0x2aab1496af90>
Traceback (most recent call last):
  File "../util.py", line 59, in iterload_file_lines_gzip
    json_object = orjson.loads(line)
RuntimeError: generator ignored GeneratorExit
