In [1]:
import json
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa
import numpy as np

Get venues

In [2]:
# Collect the unique venue IDs from the line-delimited JSON venue dump.
venues = set()
# The context manager closes the file even if a record fails to parse, and
# iterating over the handle streams the lines instead of loading them all.
with open('mag_venues.txt', 'r', encoding='utf-8') as f:
    for line in f:
        venue = json.loads(line)
        venues.add(venue['id'])
print(f'There are {len(venues)} venues')

# Write the IDs in sorted order so the parquet file has a deterministic layout.
sorted_venues = sorted(venues)
df = pd.DataFrame({'id': sorted_venues})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_venue_id.parquet')
print("venue data type:", type(sorted_venues[0]))
# `venues` is left as a set: parse_paper_file uses it for membership checks.

There are 53422 venues
venue data type: <class 'int'>


Get affiliations

In [3]:
# Read the affiliation IDs and their normalized names from the
# line-delimited JSON affiliation dump.
affs = []
aff_names = []
# The context manager guarantees the handle is closed; iterating over it
# streams the records instead of materialising every line first.
with open('mag_affiliations.txt', 'r', encoding='utf-8') as f:
    for line in f:
        aff = json.loads(line)
        affs.append(aff['id'])
        aff_names.append(aff['NormalizedName'])
print(f'There are {len(affs)} affiliations')

# Table with both the ID and the name of every affiliation.
df = pd.DataFrame({'id': affs, 'name': aff_names})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_affs.parquet')

# ID-only table.
df = pd.DataFrame({'id': affs})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_aff_id.parquet')
print('Afflication data type:', type(affs[0]))
# Keep the IDs as a set for later membership tests.
affs = set(affs)

There are 25776 affiliations
Afflication data type: <class 'int'>


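Get the last known affiliation of each author and save the author-affiliation pairs to parquet files (one file per author shard). Not all authors have a known affiliation; those authors are left out of the pairs, but every author ID is still saved to a separate ID file.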

In [4]:
# IDs of every author across all shards, with or without an affiliation.
full_authors = []
for i in range(5):
    # Author/affiliation pairs of this shard only (aligned lists).
    authors = []
    # Deliberately not called `affs`: that name holds the set of affiliation
    # IDs built when the affiliation file was read and must not be overwritten.
    author_affs = []
    num_authors = 0
    # The context manager closes each shard; iterating over the handle avoids
    # holding all lines of a shard in memory at once.
    with open(f'mag_authors_{i}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            num_authors += 1
            author = json.loads(line)
            full_authors.append(author['id'])
            if 'last_known_aff_id' in author:
                authors.append(author['id'])
                # The affiliation ID is converted with int() before storing.
                # It is not validated against the set of known affiliations.
                author_affs.append(int(author['last_known_aff_id']))
    df = pd.DataFrame({'author': authors, 'affiliation': author_affs})
    table = pa.Table.from_pandas(df)
    pq.write_table(table, f'mag_author2aff_{i}.parquet')
    print(f'There are {num_authors} authors and {len(authors)} of them have affiliations')

df = pd.DataFrame({'id': full_authors})
print('author ID data type:', type(full_authors[0]))
table = pa.Table.from_pandas(df)
pq.write_table(table, 'mag_author_id.parquet')

There are 29941623 authors and 9465011 of them have affiliations
There are 53207043 authors and 12112687 of them have affiliations
There are 59310755 authors and 9551575 of them have affiliations
There are 55625205 authors and 5571860 of them have affiliations
There are 45392524 authors and 6678088 of them have affiliations
author ID data type: <class 'int'>


In [5]:
def parse_paper_file(i):
    """Parse one shard of paper records and write its tables to parquet.

    Reads ``mag_papers_{i}.txt`` (one JSON record per line). Records that lack
    an 'id', 'title', 'year' or 'authors' field, or whose author list is
    empty, are skipped. For the remaining papers these files are written:

    * ``mag_papers_{i}.parquet``       -- columns paper, title, year
    * ``mag_paper2author_{i}.parquet`` -- columns paper, author, order
      (``order`` is the author's position in the paper's author list)
    * ``mag_paper2venue_{i}.parquet``  -- columns paper, venue
    * ``mag_paper2fos_{i}.parquet``    -- columns paper, fos, w
      (only field-of-study entries with ``w > 0``)
    * ``mag_paper2paper_{i}.parquet``  -- columns src_paper, dst_paper

    Parameters
    ----------
    i
        Shard index; it is interpolated into the input and output file names.

    Returns
    -------
    tuple
        ``(num_paper_venue, venue_ids, fos_set)``: the number of kept papers
        that have a 'venue' field (with or without a venue ID), the set of
        venue IDs seen, and the set of field-of-study names with ``w > 0``.

    Raises
    ------
    AssertionError
        If a venue ID is not contained in the module-level ``venues``.
    """
    def _write_parquet(columns, path):
        """Write a dict of equal-length columns to ``path`` as a parquet table."""
        pq.write_table(pa.Table.from_pandas(pd.DataFrame(columns)), path)

    num_paper_venue = 0
    venue_ids = set()
    fos_set = set()

    paper_ids = []
    titles = []
    years = []
    paper2author = ([], [], [])   # (paper, author, order)
    paper2venue = ([], [])        # (paper, venue)
    paper2fos = ([], [], [])      # (paper, fos name, weight)
    paper2paper = ([], [])        # (citing paper, referenced paper)

    # The context manager closes the shard even when parsing fails, and
    # iterating over the handle streams the records instead of loading the
    # whole shard into memory first.
    with open(f'mag_papers_{i}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            paper = json.loads(line)
            has_required = all(key in paper for key in ('id', 'title', 'year', 'authors'))
            if not has_required or len(paper['authors']) == 0:
                continue
            paper_id = paper['id']
            paper_ids.append(paper_id)
            titles.append(paper['title'])
            years.append(paper['year'])
            for order, author in enumerate(paper['authors']):
                paper2author[0].append(paper_id)
                paper2author[1].append(author['id'])
                paper2author[2].append(order)
            if 'venue' in paper:
                # Counted even when the venue record carries no ID.
                num_paper_venue += 1
                venue = paper['venue']
                if 'id' in venue:
                    venue_ids.add(venue['id'])
                    paper2venue[0].append(paper_id)
                    paper2venue[1].append(venue['id'])
                    # Relies on the module-level `venues` collection.
                    assert venue['id'] in venues
            if 'fos' in paper:
                for fos in paper['fos']:
                    if fos['w'] > 0:
                        fos_set.add(fos['name'])
                        paper2fos[0].append(paper_id)
                        paper2fos[1].append(fos['name'])
                        paper2fos[2].append(fos['w'])
            if 'references' in paper:
                for ref in paper['references']:
                    paper2paper[0].append(paper_id)
                    paper2paper[1].append(ref)

    _write_parquet({'paper': paper_ids, 'title': titles, 'year': years},
                   f'mag_papers_{i}.parquet')
    print(f'There are {len(paper_ids)} papers in file {i}', flush=True)

    _write_parquet({'paper': paper2author[0], 'author': paper2author[1], 'order': paper2author[2]},
                   f'mag_paper2author_{i}.parquet')
    print(f'There are {len(paper2author[0])} paper-author pairs', flush=True)

    _write_parquet({'paper': paper2venue[0], 'venue': paper2venue[1]},
                   f'mag_paper2venue_{i}.parquet')
    print(f'There are {len(paper2venue[0])} paper-venue pairs', flush=True)

    _write_parquet({'paper': paper2fos[0], 'fos': paper2fos[1], 'w': paper2fos[2]},
                   f'mag_paper2fos_{i}.parquet')
    print(f'There are {len(paper2fos[0])} paper-fos pairs', flush=True)

    _write_parquet({'src_paper': paper2paper[0], 'dst_paper': paper2paper[1]},
                   f'mag_paper2paper_{i}.parquet')
    print(f'There are {len(paper2paper[0])} paper citations', flush=True)

    return num_paper_venue, venue_ids, fos_set

In [6]:
from graphstorm.gconstruct.utils import multiprocessing_data_read

# Run parse_paper_file over the 51 paper shards, requesting 8 processes.
data = multiprocessing_data_read(list(range(51)), num_processes=8, user_parser=parse_paper_file)

# Fold the per-shard results (count, venue IDs, fos names) into global totals.
num_paper_venue = 0
venue_ids = set()
fos_set = set()
for key in data:
    shard_result = data[key]
    num_paper_venue += shard_result[0]
    venue_ids.update(shard_result[1])
    fos_set.update(shard_result[2])

There are 3662502 papers in file 7
There are 4267728 papers in file 6
There are 5328980 papers in file 0
There are 5222259 papers in file 4
There are 10595719 paper-author pairs
There are 5549891 papers in file 1
There are 1887447 paper-venue pairs
There are 4423942 papers in file 5
There are 10817771 paper-author pairs
There are 1564146 paper-venue pairs
There are 11204836 paper-author pairs
There are 11406033 paper-author pairs
There are 1210308 paper-venue pairs
There are 1197241 paper-venue pairs
There are 11747133 paper-author pairs
There are 10617858 paper-author pairs
There are 1261954 paper-venue pairs
There are 1407684 paper-venue pairs
There are 23221190 paper-fos pairs
There are 7644442 papers in file 3
There are 9056069 papers in file 2
There are 25277429 paper-fos pairs
There are 25070263 paper-fos pairs
There are 22343061 paper-fos pairs
There are 25623871 paper-fos pairs
There are 24939618 paper-fos pairs
There are 14587688 paper-author pairs
There are 750743 paper-venue

There are 21361169 paper-fos pairs
There are 1061145 paper-venue pairs
There are 14291276 paper-author pairs
There are 14631797 paper-author pairs
There are 1268832 paper-venue pairs
There are 24007125 paper-fos pairs
There are 1245854 paper-venue pairs
There are 13946192 paper-fos pairs
There are 23923532 paper-fos pairs
There are 12685296 paper citations
There are 35838787 paper citations
There are 22054291 paper citations
There are 4989878 papers in file 48
There are 23614785 paper-fos pairs
There are 22762436 paper-fos pairs
There are 18540388 paper citations
There are 5579532 papers in file 49
There are 15779380 paper-author pairs
There are 23769289 paper citations
There are 1564036 paper-venue pairs
There are 26625613 paper citations
There are 17508917 paper-author pairs
There are 1547869 paper-venue pairs
There are 23454580 paper-fos pairs
There are 24395338 paper-fos pairs
There are 27172944 paper citations
There are 29466019 paper citations


In [7]:
# Report the totals aggregated over all paper shards, one line per statistic.
summary_lines = (
    f'There are {num_paper_venue} papers that have venues.',
    f'There are {len(venue_ids)} venues with IDs',
    f'There are {len(fos_set)} topic fields',
)
print('\n'.join(summary_lines))

There are 143630713 papers that have venues.
There are 53422 venues with IDs
There are 275021 topic fields


In [8]:
# Sort the field-of-study names before writing them: iteration order of a set
# of strings is not stable across interpreter runs, so an unsorted dump would
# give the ID file a different row order on every run. The venue ID file is
# sorted in the same way.
fos_names = sorted(fos_set)
df = pd.DataFrame({'id': np.array(fos_names)})
table = pa.Table.from_pandas(df)
print('fos data type:', type(fos_names[0]))
pq.write_table(table, 'mag_fos.parquet')

fos data type: <class 'str'>


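The code below uses venues as classification labels for paper nodes. Only venues that appear in more than 10,000 paper-venue pairs are kept; each of them is mapped to a consecutive integer label, and papers without such a venue get a missing label (NaN).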

In [43]:
# Gather the paper-venue pairs of all 51 shards into two aligned arrays.
paper_chunks = []
venue_chunks = []
for i in range(51):
    pairs = pd.read_parquet(f'mag_paper2venue_{i}.parquet')
    paper_chunks.append(pairs['paper'].to_numpy())
    venue_chunks.append(pairs['venue'].to_numpy())
v_papers = np.concatenate(paper_chunks)
# NOTE(review): this rebinds `venues`; from here on it is the per-pair venue
# array aligned with `v_papers`, not the set of venue IDs read from the dump.
venues = np.concatenate(venue_chunks)

In [46]:
# Distinct venue IDs (sorted) and how many paper-venue pairs each occurs in.
uniq_venues, v_cnts = np.unique(venues, return_counts=True)

In [61]:
# Keep the venues that occur in more than 10000 paper-venue pairs and number
# them consecutively, in the order np.unique returned them, to get labels.
frequent_venues = uniq_venues[v_cnts > 10000].tolist()
common_venue_map = dict(zip(frequent_venues, range(len(frequent_venues))))

In [62]:
# Keep only the pairs whose venue is a common venue and replace the venue ID
# by its label. Labels are integers, so None reliably signals "not common".
v_papers1, venues1 = [], []
for paper_id, venue_id in zip(v_papers, venues):
    label = common_venue_map.get(venue_id)
    if label is not None:
        v_papers1.append(paper_id)
        venues1.append(label)

In [69]:
# Paper ID -> venue label lookup; if a paper occurs twice, the last pair wins.
p2v_map = dict(zip(v_papers1, venues1))

In [73]:
import math

# Attach a venue label to every paper of every shard and write the result to
# `mag_papers_with_labels_{i}.parquet`. Papers without a label get NaN, which
# makes the label column floating point whenever a shard has unlabeled papers.
num_labels = 0
for i in range(51):
    papers = pd.read_parquet(f'mag_papers_{i}.parquet')
    # Dedicated name: reusing `venues` here would overwrite the paper-venue
    # array that the venue-counting cells read, corrupting them on a re-run.
    labels = []
    for paper in papers['paper'].to_numpy():
        if paper in p2v_map:
            labels.append(p2v_map[paper])
            num_labels += 1
        else:
            labels.append(math.nan)
    df = pd.DataFrame({'paper': papers['paper'], 'title': papers['title'],
                       'year': papers['year'], 'venue': np.array(labels)})
    table = pa.Table.from_pandas(df)
    pq.write_table(table, f'mag_papers_with_labels_{i}.parquet')
    print(f'There are {len(labels)} papers in file {i}', flush=True)
    print(f'There are {num_labels} labels so far.')

There are 5328980 papers in file 0
There are 358029 labels so far.
There are 5549891 papers in file 1
There are 698300 labels so far.
There are 9056069 papers in file 2
There are 920280 labels so far.
There are 7644442 papers in file 3
There are 1106205 labels so far.
There are 5222259 papers in file 4
There are 1475060 labels so far.
There are 4423942 papers in file 5
There are 1890023 labels so far.
There are 4267728 papers in file 6
There are 2461230 labels so far.
There are 3662502 papers in file 7
There are 3229396 labels so far.
There are 3358693 papers in file 8
There are 4792230 labels so far.
There are 3355092 papers in file 9
There are 6354551 labels so far.
There are 3355440 papers in file 10
There are 7917339 labels so far.
There are 3354045 papers in file 11
There are 9478565 labels so far.
There are 3351987 papers in file 12
There are 11039437 labels so far.
There are 3354689 papers in file 13
There are 12600462 labels so far.
There are 3350715 papers in file 14
There are

In [74]:
# Number of papers that received a venue label.
print(len(p2v_map))

35849264


In [75]:
# Read the first labeled shard back to inspect what was written.
table = pd.read_parquet(f'mag_papers_with_labels_0.parquet')

In [77]:
# Show the dtype of the label column as stored in the parquet file.
table['venue'].dtype

dtype('float64')