In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
from inspire_utils.record import get_value
% matplotlib inline

from keras.models import Sequential

from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def read_json(filename):
    '''filename is a string= /path/to/file'''
    with open(filename) as f:
        for line in f:
            try:
                yield json.loads(line)
            except:
                continue

def removeNonAscii(string):
    return "".join( char for char in string if ord(char)<128 )

In [3]:
def get_title(listing,which_version=0):
    '''Get the title. If there are >1 versions, which_ver selects that version.'''
    title = get_value(listing,"extra_data.source_data.data.titles[%d].title" % which_version)
    title = removeNonAscii(title)
    return title

def get_abstract(listing,which_version=0):
    #                             look at this location in "example_api"
    abstract = get_value(listing,"extra_data.source_data.data.abstracts[%d].value" %which_version)
    abstract = removeNonAscii(abstract)
    return abstract

In [4]:
def make_ngrams(string, N=1):
    tokens = nltk.word_tokenize(string)
    ngram_tuples = list( ngrams(tokens,N) )
    ngram_list = [''.join(words) for words in ngram_tuples]
    return ngram_list

def ngram_search(string,keywords):
    ''' find the intersection of the ngram-ed string and the keyword list.'''
    matches = list( set(string).intersection( set(keywords) ) )
    return matches

def eval_score(string, keywords, min_ngram=1,max_ngram=4):
    scores = []
    for N in range(min_ngram,max_ngram+1):
        words = ngram_search( make_ngrams(string,N=N),keywords )
     
        scores.append(len(words))

    return scores

In [5]:
keysfile = np.loadtxt("KeyWords.csv", dtype="str") # load the keywords.
keywords = [word.lower() for word in keysfile]     # lowercase them all.

In [6]:
core_list = pd.read_csv("./core_arxiv.txt", names=["core"])
noncore_list = pd.read_csv("./noncore_arxiv.txt", names=["noncore"])
core, noncore = [],[]

# read the core arXiv list
for line in core_list['core']:
    name = line.split(":")[2]
    core.append(name)
core_list = core #pd.DataFrame(core, columns=["core"])

# read the noncore arXic list.
for line in noncore_list['noncore']:
    name = line.split(":")[2]
    noncore.append(name)
noncore_list = noncore #pd.DataFrame(noncore)

inspire_core = set(np.genfromtxt("inspire_core.txt").tolist())

def get_coreness(listing,which_listing=0):
    arXiv_id = get_value(listing,"extra_data.source_data.data.arxiv_eprints[%d].value" %which_listing)
    if arXiv_id in core_list:
        return 2
    elif arXiv_id in noncore_list:
        return 1
    else:
        return 0

In [7]:
def get_references(listing):
    if get_value(listing, "data.references"):
        refs = get_value(listing, "data.references")
    else:
        return [0.0,0.0]
    
    core_refs = 0.0
    noncore_refs = 0.0
    N_refs = float(len(refs))
    for ref in refs:
        if get_value(ref, "record.$ref"):
            inspire_id = int(get_value(ref, "record.$ref").split("/")[5])
            if inspire_id in inspire_core:
                core_refs = core_refs + 1.0
            else:
                noncore_refs = noncore_refs + 1.0
    
    f_core = core_refs/N_refs
    f_noncore = noncore_refs/N_refs
    
    return [f_core, f_noncore]

In [8]:
texts, labels = [], []
reference_fractions = []
title_scores = []
abstract_scores = []

for listing in read_json("arXiv.json"):
    title = get_title(listing)
    title_scores.append(eval_score(title, keywords))
    
    abstract = get_abstract(listing)
    abstract_scores.append(eval_score(abstract, keywords) )
    
    text = ' '.join( ( title, abstract ) )
    coreness = get_coreness(listing)
    
    f_refs = get_references(listing)

    reference_fractions.append(f_refs)

    texts.append(text)
    labels.append(coreness)

In [9]:
t_scores = np.asarray(title_scores)
a_scores = np.asarray(abstract_scores)

In [3]:
T = np.asarray(texts)
T = T.reshape(38774,1)

R = np.asarray(reference_fractions)

L = np.asarray(labels)
L = L.reshape(38774,1)

NameError: name 'np' is not defined

In [12]:
REFS = np.concatenate((T, t_scores), axis=1)
REF_ = np.concatenate((REFS, a_scores), axis=1)
REF__ = np.concatenate((REF_, R), axis=1)
REF = np.concatenate((REF__, L), axis=1)

In [13]:
keys = ['Text', 'Title-sing', 'Title-bi', 'Title-tri', 'Title-quad', 'Abs-sing','Abs-bi', 'Abs-tri', 'Abs-quad','core refs', 'non-core refs', 'Result']
REF= pd.DataFrame(REF, columns=keys)
REF = REF.reindex(np.random.permutation(REF.index))

REF['Result'] =REF['Result'].astype(int)

keysPC = ['core refs', 'non-core refs']
REF[keysPC] =REF[keysPC].astype(float)

REF['Text'] =REF['Text'].astype(str)
REF.head()

Unnamed: 0,Text,Title-sing,Title-bi,Title-tri,Title-quad,Abs-sing,Abs-bi,Abs-tri,Abs-quad,core refs,non-core refs,Result
28184,Mott Quantum Criticality in the Anisotropic 2D...,0,0,0,0,9,0,0,0,0.0,0.5,0
6456,Balanced Reed-Solomon Codes We consider the pr...,0,0,0,0,11,1,0,0,0.0,0.0,0
10183,Fermion Dipole Moment and Holography In the ba...,0,0,0,0,9,2,0,0,0.93617,0.0,2
4535,Quantization conditions and functional equatio...,0,0,0,0,9,1,0,0,0.925,0.025,2
36703,"Resonances in the continuum, field induced non...",2,0,0,0,6,0,0,0,0.0,0.04386,0


In [14]:
REF.to_pickle('REFS')