In [None]:
import json
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa
import numpy as np

Get venues

In [None]:
# Collect the unique venue IDs from 'mag_venues.txt' (one JSON object per line),
# store them sorted in 'mag_venue_id.parquet', and keep them in `venues` as a set
# for membership checks.
# The context manager closes the file handle, and iterating the handle streams
# lines instead of loading the whole file with readlines(). JSON text is UTF-8,
# so the encoding is explicit rather than dependent on the platform locale.
with open('mag_venues.txt', 'r', encoding='utf-8') as f:
    venues = {json.loads(line)['id'] for line in f}
print(f'There are {len(venues)} venues')
venues = sorted(venues)
df = pd.DataFrame({'id': venues})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_venue_id.parquet')
# Guard the type probe so an empty input file does not raise IndexError.
if venues:
    print("venue data type:", type(venues[0]))
venues = set(venues)

Get affiliations

In [None]:
# Read every affiliation record from 'mag_affiliations.txt' (one JSON object per
# line) and write two parquet files: ID + normalized name, and ID only.
affs = []
aff_names = []
# The context manager closes the handle; iterating it streams the file instead
# of loading it with readlines(). JSON text is UTF-8, so the encoding is explicit
# rather than dependent on the platform locale.
with open('mag_affiliations.txt', 'r', encoding='utf-8') as f:
    for line in f:
        aff = json.loads(line)
        affs.append(aff['id'])
        aff_names.append(aff['NormalizedName'])
print(f'There are {len(affs)} affiliations')
df = pd.DataFrame({'id': affs, 'name': aff_names})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_affs.parquet')

df = pd.DataFrame({'id': affs})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_aff_id.parquet')
# Guard the type probe so an empty input file does not raise IndexError.
if affs:
    print('Afflication data type:', type(affs[0]))
affs = set(affs)

Get the affiliation of each author and save the author-affiliation pairs in parquet files (one file per author input file). Not all authors have a known affiliation.

In [None]:
# For each of the 5 author files, write the (author, affiliation) pairs of the
# authors that carry a 'last_known_aff_id' to 'mag_author2aff_{i}.parquet'.
# The IDs of all authors, with or without an affiliation, are accumulated in
# `full_authors` and written to 'mag_author_id.parquet'.
full_authors = []
for i in range(5):
    authors = []
    # NOTE: this rebinds the module-level name `affs` to a per-file list.
    affs = []
    num_authors = 0
    # The context manager closes each shard's handle; iterating it streams the
    # file instead of loading it with readlines(). JSON text is UTF-8.
    with open(f'mag_authors_{i}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            num_authors += 1
            author = json.loads(line)
            full_authors.append(author['id'])
            if 'last_known_aff_id' in author:
                authors.append(author['id'])
                affs.append(int(author['last_known_aff_id']))
    df = pd.DataFrame({'author': authors, 'affiliation': affs})
    table = pa.Table.from_pandas(df)
    pq.write_table(table, f'mag_author2aff_{i}.parquet')
    print(f'There are {num_authors} authors and {len(authors)} of them have affiliations')

df = pd.DataFrame({'id': full_authors})
# Guard the type probe so empty input files do not raise IndexError.
if full_authors:
    print('author ID data type:', type(full_authors[0]))
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_author_id.parquet')

In [None]:
def _write_parquet(columns, path):
    """Write a dict of equal-length columns to `path` as a parquet file."""
    pq.write_table(pa.Table.from_pandas(pd.DataFrame(columns)), path)


def parse_paper_file(i):
    """Parse paper file `i` and write its node and edge tables as parquet files.

    Reads ``mag_papers_{i}.txt``, one JSON object per line. A record is skipped
    when it lacks any of 'id', 'title', 'year' or 'authors', or when its author
    list is empty.

    Files written (all suffixed with ``_{i}.parquet``):
      * ``mag_papers``       -- paper, title, year
      * ``mag_paper2author`` -- paper, author, order (position in the author list)
      * ``mag_paper2venue``  -- paper, venue (only for venues that have an 'id')
      * ``mag_paper2fos``    -- paper, fos name, weight (only weights > 0)
      * ``mag_paper2paper``  -- src_paper, dst_paper (one row per reference)

    Relies on the module-level ``venues`` collection of known venue IDs for a
    sanity check. Later cells of this notebook rebind ``venues``, so this
    function has to run before them.

    Parameters
    ----------
    i : int
        Index of the paper file to parse.

    Returns
    -------
    tuple
        ``(num_paper_venue, venue_ids, fos_set)``: the number of kept papers
        that have a 'venue' entry (with or without a venue ID), the set of
        venue IDs seen, and the set of field-of-study names with weight > 0.

    Raises
    ------
    AssertionError
        If a paper references a venue ID that is not in ``venues``.
    """
    num_paper_venue = 0
    venue_ids = set()
    fos_set = set()

    paper_ids = []
    titles = []
    years = []
    paper2author = ([], [], [])
    paper2venue = ([], [])
    paper2fos = ([], [], [])
    paper2paper = ([], [])

    required_keys = ('id', 'title', 'year', 'authors')
    # The context manager closes the handle; iterating it streams the file
    # instead of loading it with readlines(). JSON text is UTF-8, so the
    # encoding is explicit rather than dependent on the platform locale.
    with open(f'mag_papers_{i}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            paper = json.loads(line)
            if any(key not in paper for key in required_keys) or len(paper['authors']) == 0:
                continue
            paper_id = paper['id']
            paper_ids.append(paper_id)
            titles.append(paper['title'])
            years.append(paper['year'])
            for order, author in enumerate(paper['authors']):
                paper2author[0].append(paper_id)
                paper2author[1].append(author['id'])
                paper2author[2].append(order)
            if 'venue' in paper:
                # Counted even when the venue entry has no ID.
                num_paper_venue += 1
                venue = paper['venue']
                if 'id' in venue:
                    assert venue['id'] in venues, \
                        f"paper {paper_id} references unknown venue {venue['id']}"
                    venue_ids.add(venue['id'])
                    paper2venue[0].append(paper_id)
                    paper2venue[1].append(venue['id'])
            # If a paper has field of study.
            if 'fos' in paper:
                for fos in paper['fos']:
                    # Only fields with a positive weight are kept.
                    if fos['w'] > 0:
                        fos_set.add(fos['name'])
                        paper2fos[0].append(paper_id)
                        paper2fos[1].append(fos['name'])
                        paper2fos[2].append(fos['w'])
            if 'references' in paper:
                for ref in paper['references']:
                    paper2paper[0].append(paper_id)
                    paper2paper[1].append(ref)

    _write_parquet({'paper': paper_ids, 'title': titles, 'year': years},
                   f'mag_papers_{i}.parquet')
    print(f'There are {len(paper_ids)} papers in file {i}', flush=True)

    _write_parquet({'paper': paper2author[0], 'author': paper2author[1], 'order': paper2author[2]},
                   f'mag_paper2author_{i}.parquet')
    print(f'There are {len(paper2author[0])} paper-author pairs', flush=True)

    _write_parquet({'paper': paper2venue[0], 'venue': paper2venue[1]},
                   f'mag_paper2venue_{i}.parquet')
    print(f'There are {len(paper2venue[0])} paper-venue pairs', flush=True)

    _write_parquet({'paper': paper2fos[0], 'fos': paper2fos[1], 'w': paper2fos[2]},
                   f'mag_paper2fos_{i}.parquet')
    print(f'There are {len(paper2fos[0])} paper-fos pairs', flush=True)

    _write_parquet({'src_paper': paper2paper[0], 'dst_paper': paper2paper[1]},
                   f'mag_paper2paper_{i}.parquet')
    print(f'There are {len(paper2paper[0])} paper citations', flush=True)

    return num_paper_venue, venue_ids, fos_set

In [None]:
from graphstorm.gconstruct.utils import multiprocessing_data_read

# Run parse_paper_file over paper file indices 0..50 (num_processes=8).
paper_file_ids = list(range(51))
data = multiprocessing_data_read(paper_file_ids, num_processes=8, user_parser=parse_paper_file)

# Merge the per-file results: sum the venue counts, union the ID and name sets.
num_paper_venue = 0
venue_ids = set()
fos_set = set()
for i in data:
    file_num_paper_venue, file_venue_ids, file_fos = data[i]
    num_paper_venue += file_num_paper_venue
    venue_ids.update(file_venue_ids)
    fos_set.update(file_fos)

In [None]:
# Report the totals aggregated over all paper files.
summary_lines = [
    f'There are {num_paper_venue} papers that have venues.',
    f'There are {len(venue_ids)} venues with IDs',
    f'There are {len(fos_set)} topic fields',
]
print('\n'.join(summary_lines))

In [None]:
# Persist the field-of-study names to 'mag_fos.parquet'.
# Sorting makes the row order reproducible: set iteration order for strings can
# change between interpreter sessions (hash randomization). The venue IDs are
# sorted before writing too. Assumes the names are mutually comparable
# (presumably all str) -- TODO confirm.
fos_names = sorted(fos_set)
df = pd.DataFrame({'id': np.array(fos_names)})
table = pa.Table.from_pandas(df)
# Guard the type probe so an empty set does not raise IndexError.
if fos_names:
    print('fos data type:', type(fos_names[0]))
pq.write_table(table, 'mag_fos.parquet')

The code below creates venues as labels of paper nodes.

In [None]:
# Load the paper-venue pairs of all 51 paper files and concatenate them into two
# aligned arrays: v_papers[k] was published in venues[k].
# NOTE: this rebinds the module-level name `venues`.
v_papers, venues = [], []
for i in range(51):
    table = pd.read_parquet(f'mag_paper2venue_{i}.parquet')
    paper_col = table['paper'].to_numpy()
    venue_col = table['venue'].to_numpy()
    v_papers.append(paper_col)
    venues.append(venue_col)
v_papers, venues = np.concatenate(v_papers), np.concatenate(venues)

Collect all unique venues and keep only those that have a large number of publications (more than 10,000 papers).

In [None]:
# Count the papers per venue, then give every venue with more than 10000 papers
# a consecutive integer label (0, 1, ...) in the sorted order np.unique returns.
uniq_venues, v_cnts = np.unique(venues, return_counts=True)
popular_venues = uniq_venues[v_cnts > 10000].tolist()
common_venue_map = dict(zip(popular_venues, range(len(popular_venues))))

Only collect the papers published in the popular venues.

In [None]:
# Keep only the (paper, venue) pairs whose venue has a label in
# common_venue_map, and replace the venue ID by that integer label.
v_papers1, venues1 = [], []
for pid, v in zip(v_papers, venues):
    label = common_venue_map.get(v)
    # Labels are integers starting at 0, so test against None, not truthiness.
    if label is not None:
        v_papers1.append(pid)
        venues1.append(label)

In [None]:
# Paper ID -> venue label. If a paper appears more than once, the last pair wins.
p2v_map = dict(zip(v_papers1, venues1))

Recreate the paper node files with venues.

In [None]:
import math

# Rewrite each paper file with an extra 'venue' label column. Papers without a
# label in p2v_map get NaN. `num_labels` counts labeled papers across all files.
# NOTE: this rebinds the module-level name `venues` to a per-file list.
num_labels = 0
for i in range(51):
    papers = pd.read_parquet(f'mag_papers_{i}.parquet')
    venues = []
    for paper in papers['paper'].to_numpy():
        label = p2v_map.get(paper)
        if label is None:
            venues.append(math.nan)
        else:
            venues.append(label)
            num_labels += 1
    columns = {
        'paper': papers['paper'],
        'title': papers['title'],
        'year': papers['year'],
        'venue': np.array(venues),
    }
    df = pd.DataFrame(columns)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, f'mag_papers_with_labels_{i}.parquet')
    print(f'There are {len(venues)} papers in file {i}', flush=True)
    print(f'There are {num_labels} labels so far.')