In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import sys
# Standard imports

In [3]:
import json_tools as jt
# Import the script full of useful tools for reading the JSON data.

In [3]:
# Walk every listing in the arXiv JSON dump once and collect per-article
# features. Each list below gains exactly one entry per listing, so after the
# loop they are all the same length and aligned by position.
coreness = []

article_texts = []    # title and abstract joined by a single space
title_Ngrams = []     # per article: one score per N, for N = 1..maxN
abstract_Ngrams = []  # same layout as title_Ngrams, computed on the abstract

Nrefs = []
f_core = [] # to store the fraction of references which are core
f_noncore = [] # to store the fraction of references which are noncore
f_core_2o = [] # to store SECOND ORDER core.
f_noncore_2o = []

categories = []

# How big should the Ngrams get? This is loop-invariant, so it is set once
# here rather than on every iteration. The DataFrame-building code later in
# this notebook reads score columns 0-3, i.e. it relies on maxN being 4.
maxN = 4

# jt.read_json is a generator function in the JSON_TOOLS script.
listing_generator = jt.read_json("./datfiles/arXiv.json")

t0 = time.time()
for i, listing in enumerate(listing_generator):

    # Get coreness.
    coreness.append(jt.get_coreness(listing))

    # Get the texts.
    title = jt.get_title(listing)
    abstract = jt.get_abstract(listing)
    text = ' '.join((title, abstract))
    article_texts.append(text)

    # Get the category.
    categories.append(jt.get_category(listing))

    # Do the Ngrams. For each N the score is the number of items that
    # jt.ngram_search returns for the N-grams built from that text.
    title_scores = []
    abstract_scores = []
    for N in range(1, maxN + 1):
        all_title_ngrams = jt.make_ngrams(title, N=N)
        title_scores.append(len(jt.ngram_search(all_title_ngrams)))

        all_abstract_ngrams = jt.make_ngrams(abstract, N=N)
        abstract_scores.append(len(jt.ngram_search(all_abstract_ngrams)))

    title_Ngrams.append(title_scores)
    abstract_Ngrams.append(abstract_scores)

    # Look at the references.
    Nrefs.append(jt.get_Nrefs(listing))

    # Element 0 is stored as the core fraction, element 1 as the noncore one.
    ref_fracs = jt.get_reference_fractions(listing)
    f_core.append(ref_fracs[0])
    f_noncore.append(ref_fracs[1])

    # Placeholder zeros: implement 2o references here properly - only works
    # with access to INSPIRE database!
    ref_fracs_2o = [0.0, 0.0]
    f_core_2o.append(ref_fracs_2o[0])
    f_noncore_2o.append(ref_fracs_2o[1])

deltat = time.time() - t0

# Elapsed time as minutes:seconds. The seconds field is zero-padded so that
# e.g. 3 min 5 s is printed as "3:05" rather than the ambiguous "3:5".
print("%d:%02d to extract data from  %d articles" % (int(deltat/60.0), int(deltat%60), len(article_texts)))

{ 'data': { '$schema': 'http://pcrcsis02.cern.ch/schemas/records/hep.json',
            '_collections': ['Literature'],
            '_files': [ { 'bucket': 'a2e88987-d347-491c-8356-b05e126a3462',
                          'checksum': 'md5:a3f1a9971347cd117e10b4c3441ea817',
                          'key': '0705.3928.tar.gz',
                          'size': 10890,
                          'version_id': '19e5d57e-450e-4d58-bc37-1d3a3a75ea4e'},
                        { 'bucket': 'a2e88987-d347-491c-8356-b05e126a3462',
                          'checksum': 'md5:80dbf0ef42c3f22e0634e3df3fdb8668',
                          'key': '0705.3928.pdf',
                          'size': 166854,
                          'version_id': '66dc968c-5b09-4154-b8fd-765fc8cb1f85'}],
            'abstracts': [ { 'source': 'arXiv',
                             'value': 'The dissipative Hofstadter model '
                                      'describes quantum particles moving in '
                      

                                              'approved:None.',
                                       'name': 'mark',
                                       'nicename': 'Mark the workflow object '
                                                   'with approved:None.',
                                       'parameters': [],
                                       'time': '2018-05-17 09:14:41.938564'},
                                     { 'doc': 'Mark the workflow object with '
                                              'unexpected-workflow-path:None.',
                                       'name': 'mark',
                                       'nicename': 'Mark the workflow object '
                                                   'with '
                                                   'unexpected-workflow-path:None.',
                                       'parameters': [],
                                       'time': '2018-05-17 09:14:41.938718'},
                     

# NOTE HERE:

The second-order references cannot be extracted without access to the INSPIRE database. As a 'dirty' workaround, they are read in from the output of a previous run on a machine that does have access to the INSPIRE database.

In [4]:
# Replace the placeholder second-order fractions with values read from a text
# file. Column 4 is used as f_core_2o and column 5 as f_noncore_2o.
# NOTE(review): rows are assumed to be in the same order as the articles read
# into article_texts - nothing here verifies that alignment.
reference_data = np.genfromtxt("./datfiles/reference_coreness.txt")

n_articles = len(article_texts)
if len(reference_data) < n_articles:
    # Too few rows would otherwise only surface later as a length mismatch
    # when the columns are assembled into a DataFrame.
    raise ValueError(
        "reference_coreness.txt has %d rows but %d articles were read; "
        "the second-order fractions cannot be aligned with the other columns"
        % (len(reference_data), n_articles))

# A [:n_articles] slice handles both cases: it truncates a longer reference
# table and is a no-op when the lengths already match.
f_core_2o = reference_data[:n_articles, 4]
f_noncore_2o = reference_data[:n_articles, 5]

In [5]:
# Turn the per-article score lists into 2-D arrays so that a single column
# (one value of N across all articles) can be picked out with [:, k].
T_Ng = np.array(title_Ngrams)
A_Ng = np.array(abstract_Ngrams)

# Labels for N = 1, 2, 3, 4; used to generate both the title and the abstract
# column names and to select the matching score columns.
ngram_labels = ["unigrams", "bigrams", "trigrams", "quadgrams"]

names = (
    ["coreness", "texts"]
    + ["title " + label for label in ngram_labels]
    + ["abstract " + label for label in ngram_labels]
    + ["category", "f_core", "f_noncore", "f_core_2o", "f_noncore_2o", "Nrefs"]
)
columns = (
    [coreness, article_texts]
    + [T_Ng[:, k] for k in range(len(ngram_labels))]
    + [A_Ng[:, k] for k in range(len(ngram_labels))]
    + [categories, f_core, f_noncore, f_core_2o, f_noncore_2o, Nrefs]
)

# Start from an empty frame and attach the columns one at a time, in order.
df = pd.DataFrame()
for label, values in zip(names, columns):
    df[label] = values

In [6]:
# sys.getsizeof measures the DataFrame object held in memory (in bytes), not a
# file on disk, even though the message below says "File".
bytes_per_Mb = 1024.0 ** 2
size_in_Mb = sys.getsizeof(df) / bytes_per_Mb
print("File takes up %.3f Mb" % (size_in_Mb,))
df.head()

File takes up 60.343 Mb


Unnamed: 0,coreness,texts,title unigrams,title bigrams,title trigrams,title quadgrams,abstract unigrams,abstract bigrams,abstract trigrams,abstract quadgrams,category,f_core,f_noncore,f_core_2o,f_noncore_2o,Nrefs
0,0,Comparing Robustness of Pairwise and Multiclas...,0,0,0,0,4,0,0,0,cs.AI,0.0,0.0,0.0,0.0,5.0
1,0,On an Auxiliary Function for Log-Density Estim...,0,0,0,0,2,0,0,0,stat.CO,0.0,0.0,0.0,0.0,3.0
2,0,The two defaults scenario for stressing credit...,1,0,0,0,5,1,0,0,q-fin.RM,0.0,0.0,0.0,0.0,6.0
3,0,Coulomb drag as a measure of trigonal warping ...,3,0,0,0,5,0,0,0,cond-mat.mes-hall,0.051282,0.179487,0.03392,0.02365,39.0
4,2,Dissipative Hofstadter Model at the Magic Poin...,2,1,0,0,9,3,0,0,hep-th,0.647059,0.176471,0.44114,0.12174,34.0


In [7]:
# Shuffle the row order of df and renumber the index from 0.
# A seeded random state is shared by all passes so that a fresh
# Restart & Run All always produces the same row order (and therefore the
# same pickle on disk). Re-running only this cell shuffles the already
# shuffled frame again, so the result depends on how often it has been run.
Nshuffles = 10
SHUFFLE_SEED = 42

shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
for i in range(Nshuffles):
    # frac=1 samples every row exactly once, i.e. a full permutation.
    df = df.sample(frac=1, random_state=shuffle_rng).reset_index(drop=True)

In [25]:
# Preview the leading rows of df (pandas' head() returns 5 rows by default).
# NOTE(review): the output stored with this cell lists 10 rows, so it appears
# to come from a different call or run - re-run the cell to refresh it.
df.head()

Unnamed: 0,coreness,texts,title unigrams,title bigrams,title trigrams,title quadgrams,abstract unigrams,abstract bigrams,abstract trigrams,abstract quadgrams,category,f_core,f_noncore,f_core_2o,f_noncore_2o,Nrefs
0,0,Pre-images of extreme points of the numerical ...,0,0,0,0,3,0,0,0,math.FA,0.000000,0.027027,0.00000,0.00000,37.0
1,0,Positive margins and primary decomposition We ...,2,0,0,0,11,0,0,0,math.AC,0.000000,0.000000,0.00000,0.00000,32.0
2,1,How well can we measure supermassive black hol...,2,1,1,0,14,1,0,0,astro-ph.IM,0.017857,0.642857,0.01498,0.40512,56.0
3,0,On the positive commutator in the radical In t...,1,0,0,0,8,0,0,0,math.FA,0.000000,0.000000,0.00000,0.00000,18.0
4,0,Degeneration of trigonometric dynamical differ...,3,0,0,0,6,1,0,0,math.RT,0.095238,0.047619,0.00724,0.01327,21.0
5,0,A Second Look at Counting Triangles in Graph S...,0,0,0,0,5,0,0,0,cs.DS,0.000000,0.000000,0.00000,0.00000,17.0
6,0,Skill-Based Differences in Spatio-Temporal Tea...,0,0,0,0,2,0,0,0,stat.ML,0.000000,0.000000,0.00000,0.00000,40.0
7,0,Double theta polynomials and equivariant Giamb...,0,0,0,0,3,0,0,0,math.AG,0.000000,0.000000,0.00000,0.00000,34.0
8,0,Global well-posedness for the full compressibl...,1,0,0,0,1,0,0,0,math.AP,0.000000,0.000000,0.00000,0.00000,30.0
9,0,A Supervised Learning Algorithm for Binary Dom...,1,0,0,0,2,0,0,0,cs.IR,0.000000,0.000000,0.00000,0.00000,19.0


In [11]:
### THIS WILL OVERWRITE THE PREVIOUS FILE!
## Writes the current df to ./datfiles/INSPIRE.df as a pickle.
## To produce a file that can be read from Python 2, to_pickle needs the
## argument protocol=2 (it is not passed here).
df.to_pickle("./datfiles/INSPIRE.df")

In [5]:
# Read the pickled DataFrame back from disk and preview its first rows
# (presumably as a round-trip check on the saved file).
# Unpickling can execute arbitrary code, so only load pickle files that you
# created yourself or otherwise trust.
dat = pd.read_pickle("./datfiles/INSPIRE.df")
dat.head()

Unnamed: 0,coreness,texts,title unigrams,title bigrams,title trigrams,title quadgrams,abstract unigrams,abstract bigrams,abstract trigrams,abstract quadgrams,category,f_core,f_noncore,f_core_2o,f_noncore_2o,Nrefs
0,0,Pre-images of extreme points of the numerical ...,0,0,0,0,3,0,0,0,math.FA,0.0,0.027027,0.0,0.0,37.0
1,0,Positive margins and primary decomposition We ...,2,0,0,0,11,0,0,0,math.AC,0.0,0.0,0.0,0.0,32.0
2,1,How well can we measure supermassive black hol...,2,1,1,0,14,1,0,0,astro-ph.IM,0.017857,0.642857,0.01498,0.40512,56.0
3,0,On the positive commutator in the radical In t...,1,0,0,0,8,0,0,0,math.FA,0.0,0.0,0.0,0.0,18.0
4,0,Degeneration of trigonometric dynamical differ...,3,0,0,0,6,1,0,0,math.RT,0.095238,0.047619,0.00724,0.01327,21.0
