In [1]:

import json
import os
import re
import random
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

## Presumably the name of the source-data directory.
## NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
data_path = 'source'


In [2]:
## Load the arXiv metadata used to restrict the corpus to DS / CS papers.
## It is indexed by article id further down, and each entry carries a 'category' field.

metadata_path = os.path.join('output', 'arxiv_metadata.pkl')
with open(metadata_path, 'rb') as metadata_file:
    arxiv_metadata = pickle.load(metadata_file)


In [3]:
## Create a set for the CS paper arxiv_ids for filtering to them on semanticscholar

cs_papers = set()

## Compiled once rather than on every line. Matches a category token that starts
## with stat.ML, math.ST or cs.XX (two capital letters) and is preceded by a space.
cs_category_re = re.compile(r'[ ](stat\.ML|math\.ST|cs\.[A-Z]{2})')

with open(os.path.join('arxiv-dataset',"train.txt"),'r') as scipapers:
    ## One JSON document per line
    for art in tqdm(scipapers): #203037 papers in total
        article = json.loads(art)
        ## Named article_id (not `id`) so the builtin id() stays usable in later cells
        article_id = article['article_id']
        ## Pad with spaces so the first category is also preceded by a space
        ## (presumably 'category' is a space-separated list - TODO confirm)
        cat = ' '+arxiv_metadata[article_id]['category']+' '
        if cs_category_re.search(cat):
            cs_papers.add(article_id)


203037it [00:52, 3837.54it/s]


In [5]:
## Filter semanticscholar dataset: keep the records whose arXiv id is in cs_papers

## Captures everything that follows "arxiv.org/pdf/" or "arxiv.org/abs/"
pattern = r"arxiv\.org/(?:pdf|abs)/(.+)"

## Compile the regexes once, outside the loop over ~20M lines
arxiv_url_re = re.compile(pattern)
pdf_suffix_re = re.compile(r"\.pdf.*?$")        # ".pdf" and anything after it
version_suffix_re = re.compile(r"v[0-9]{1,2}$")  # trailing version tag such as "v2"

## Init papers dictionary (arxiv_id -> full semanticscholar record)
papers = {}

with open('../papers-2017-10-30.json','r') as articles:
    ## Go through each line (article)
    for article in tqdm(articles):
        ## Load values from json
        article_json = json.loads(article)
        ## Go through the list of urls
        for url in article_json['pdfUrls']:
            ## Only arxiv urls can yield an arxiv_id
            if 'arxiv' not in url:
                continue
            arxiv_search = arxiv_url_re.search(url)
            if not arxiv_search:
                continue
            ## Normalise the captured tail: drop ".pdf...", the version tag and any slashes
            arxiv_id = arxiv_search.group(1)
            arxiv_id = pdf_suffix_re.sub("", arxiv_id)
            arxiv_id = version_suffix_re.sub("", arxiv_id)
            arxiv_id = arxiv_id.replace('/','')
            ## If arxiv_id is in the TF CS subset, add the semanticscholar article to the dictionary
            if arxiv_id in cs_papers:
                papers[arxiv_id] = article_json
            ## The first url that yields an arxiv_id settles the paper, whether or not
            ## the id is in cs_papers, so the remaining urls are not inspected
            break


20548890it [07:18, 46836.55it/s]


In [6]:

## Trim every matched record down to the bibliographic fields that are kept
kept_fields = ('title','journalName','journalPages','journalVolume','venue','pdfUrls','id','authors','s2Url')
paperssubset = {
    arxiv_id: {field: record[field] for field in kept_fields}
    for arxiv_id, record in papers.items()
}


In [7]:
## Persist the trimmed CS paper subset as a pickle in the output folder

subset_path = os.path.join('output', 'cs_papers_semanticscholar.pkl')
with open(subset_path, 'wb') as subset_file:
    pickle.dump(paperssubset, subset_file)


In [10]:
## Number of semanticscholar records that were matched to an arxiv_id
## from the TF scientific papers dataset CS subset (cs_papers)

len(papers)


9019

In [11]:
## Hand-picked hints for spotting conference papers: url fragments and venue acronyms.
## NOTE(review): neither set is referenced by the cells visible in this notebook;
## the selection further down relies on conf_names - confirm these are still needed.

conference_url = {'aclweb.org','iclr.cc','nips.cc','aaai.org','ieeecomputersociety','ieeexplore','ieee'}
conference_venue = {'ACL','ICLR','NIPS','CVPR','AAAI','IEEE'} # ICLM not found (NOTE(review): "ICLM" is presumably a typo for ICML)


In [16]:

## Load the previously pickled conference names from the output folder
conf_names_path = os.path.join('output', 'conf_names.pkl')
with open(conf_names_path, 'rb') as conf_names_file:
    conf_names = pickle.load(conf_names_file)


In [17]:

## Turn the loaded names into a set: duplicates collapse and set methods become available
conf_names = set(conf_names)


In [18]:

## Manually curate the conference names: drop four acronyms and add 'ACM'.
## (Presumably the dropped ones caused false matches - TODO confirm.)
## Written as set algebra instead of .remove() so the cell can be re-run safely:
## .remove() raises KeyError once the names are already gone.
conf_names = (set(conf_names) - {'WWW', 'FG', 'MM', 'ALT'}) | {'ACM'}


In [19]:

## Select the papers that look like conference / journal publications.
## A paper is kept when at least one of these holds:
##   1. its venue contains one of conf_names as a space-delimited token,
##   2. the domain found in one of its urls contains a lower-cased conference name,
##   3. its journal name is neither empty nor 'CoRR'.
a = {}

## Domain-like part of a url (after an optional scheme and "www."), ending in one of
## the listed TLDs. The trailing [^a-z] requires a non-letter after the TLD, so a TLD
## cannot match as the prefix of a longer label.
domain_re = re.compile(r"(?:https://|http://)?(?:www\.)?([a-z0-9.\-]+?\.(?:com|org|eu|co\.uk|us|cc|edu\.au|ac\.il|gov|edu|ca|net|ac\.uk|press|info|ie|ch|de|fr))[^a-z]")
## Lower-case the conference names once instead of once per url
conf_names_lower = [name.lower() for name in conf_names]

for key, value in papers.items():
    urls = value['pdfUrls']
    ## Pad with spaces so names at the very start / end of the venue also match as tokens
    venue = ' '+value['venue']+' '
    journalname = value['journalName']
    if any(' '+cv+' ' in venue for cv in conf_names):
        a[key] = value
    for url in urls:
        urlsearch = domain_re.search(url.lower())
        if urlsearch:
            urlfound = urlsearch.group(1)
            if any(name in urlfound for name in conf_names_lower):
                a[key] = value
                ## One matching url is enough; the rest cannot change the outcome
                break
    ## 'CoRR' is excluded (presumably the arXiv-only label - TODO confirm)
    if journalname not in ('', 'CoRR'):
        a[key] = value

print(len(a))



2700


In [20]:

## Persist the selected conference / journal papers as a pickle in the output folder
conference_papers_path = os.path.join('output', 'conference_papers.pkl')
with open(conference_papers_path, 'wb') as conference_papers_file:
    pickle.dump(a, conference_papers_file)
