Establish imports and location-specific variables.

In [53]:
import json
import os

In [54]:
# Root folders for each pipeline stage. Every value ends in '/' because the
# paths below are built by plain string concatenation.
input_dir = '/Users/weale/data/covid/raw/'
tmp_dir = '/Users/weale/data/covid/tmp/'
output_dir = '/Users/weale/data/covid/out/'

# Dataset sub-folder handled by this run.
experiment = 'comm_use_subset/pdf_json/'

# One-time setup per experiment directory: uncomment to create the tmp folder.
#os.makedirs(tmp_dir + experiment)

# Per-stage working paths for the selected experiment.
input_path, tmp_path, output_path = (
    root + experiment for root in (input_dir, tmp_dir, output_dir)
)

== Extract ==

Take the large JSON objects and extract the title and abstract of each document (only the first entry of each document's `abstract` list is used).

This should provide a simple representation for the content, without requiring the full text documents.

In [55]:
# Create the experiment's tmp folder on demand so the notebook can run
# top-to-bottom on a fresh machine; exist_ok makes re-running this cell safe.
os.makedirs(tmp_path, exist_ok=True)

# fOUT receives one "paper_id<TAB>title<TAB>abstract" row per paper;
# fCOUNT receives the total number of rows. Both are written and closed by
# the extraction cell that follows.
fOUT = open(tmp_path + "20200420_title_abstract_text.tsv", "w")
fCOUNT = open(tmp_path + "20200420_title_abstract_count.txt", "w")

In [56]:
def _clean_field(text):
    """Return `text` as a single-line TSV-safe string.

    Collapses every run of whitespace (including tabs and newlines) to one
    space so the value cannot break the tab-separated, one-row-per-line layout
    that the later cells rely on. `None` is treated as an empty string.
    """
    return ' '.join((text or '').split())


# Number of rows written to the TSV file.
file_len = 0

try:
    # Sorted so the row order (and therefore the row index used for the
    # vector arrays later on) is the same on every run.
    for filename in sorted(os.listdir(input_path)):
        # Ignore anything that is not a JSON document (e.g. '.DS_Store').
        if not filename.endswith('.json'):
            continue

        # Load data from the file; the handle is closed by the `with` block.
        with open(os.path.join(input_path, filename)) as f:
            parsed_data = json.load(f)

        # Get the ID of the paper
        pID = parsed_data['paper_id']

        # Get the paper title. It is recomputed for every paper and defaults
        # to '' so a missing title can never reuse the previous paper's value.
        metadata = parsed_data.get('metadata') or {}
        title = _clean_field(metadata.get('title'))

        # Get the paper abstract: only the first entry of the list is used,
        # with the same '' default when the list is empty.
        abstract_entries = parsed_data.get('abstract') or []
        if abstract_entries:
            abstract = _clean_field(abstract_entries[0].get('text'))
        else:
            abstract = ''

        # Combine into the output and print to the file
        fOUT.write(pID + '\t' + title + '\t' + abstract + '\n')

        # Increment file length
        file_len += 1

    ## Print the number of elements to another file
    fCOUNT.write(str(file_len))
finally:
    # Close both output files even if a document fails to parse.
    fOUT.close()
    fCOUNT.close()

== Create NLP Token Vectors ==

In [57]:
import scispacy
import spacy

# Load the "en_core_sci_md" pipeline once; the cells below call `nlp(text)` to
# tokenize each title/abstract and read `token.vector` for every token.
# NOTE(review): presumably this model is provided via the scispacy install
# imported above and must be downloaded separately — confirm.
nlp = spacy.load("en_core_sci_md")

In [75]:
# Read every extracted row; each line is "paper_id<TAB>title<TAB>abstract\n".
# The `with` block closes the file even if reading fails.
with open(tmp_path + "20200420_title_abstract_text.tsv", "r") as fIN:
    rows = [line.split('\t') for line in fIN]

# Longest title and longest abstract, measured in tokens. These become the
# padded sequence lengths of the vector arrays allocated later.
# nlp.pipe processes the texts as a batched stream and yields the docs in
# input order. The abstract field is passed exactly as split (including the
# line's trailing newline) so the counts match what the vector-filling cell
# sees for the same field. `default=0` covers an empty input file.
dim_title = max((len(doc) for doc in nlp.pipe(row[1] for row in rows)), default=0)
dim_abstract = max((len(doc) for doc in nlp.pipe(row[2] for row in rows)), default=0)

print(dim_title)
print(dim_abstract)

275
1366


In [76]:
# Read back the number of rows written by the extraction step. The `with`
# block guarantees the file is actually closed afterwards.
with open(tmp_path + "20200420_title_abstract_count.txt", "r") as fIN:
    numlines = int(fIN.readline())

print(numlines)

9557


In [77]:
import numpy as np

# Width of one token vector, read from the loaded model so the arrays always
# match `token.vector` instead of relying on a hard-coded 200.
vector_dim = nlp.vocab.vectors_length

# Zero-padded arrays of shape (paper, token position, vector component).
# np.zeros defaults to float64, so these are large: for the run recorded in
# this notebook (9557 papers, 1366 abstract tokens, 200 components) the
# abstract array alone is roughly 21 GB.
titleArr = np.zeros((numlines, dim_title, vector_dim))
abstractArr = np.zeros((numlines, dim_abstract, vector_dim))

In [80]:
# Read every extracted row; each line is "paper_id<TAB>title<TAB>abstract\n".
# The `with` block closes the file even if reading fails.
with open(tmp_path + "20200420_title_abstract_text.tsv", "r") as fIN:
    rows = [line.split('\t') for line in fIN]

# nlp.pipe processes the texts as a batched stream and yields the docs in
# input order, so zipping the two streams keeps each title paired with the
# abstract from the same row.
title_docs = nlp.pipe(row[1] for row in rows)
abstract_docs = nlp.pipe(row[2] for row in rows)

# `i` is the row (paper) index while filling and the number of processed rows
# once the loop is done.
i = 0
for title_doc, abstract_doc in zip(title_docs, abstract_docs):
    # Copy each token vector into its slot; positions beyond the end of the
    # text keep the zeros the arrays were initialised with.
    for j, token in enumerate(title_doc):
        titleArr[i, j, :] = token.vector

    for j, token in enumerate(abstract_doc):
        abstractArr[i, j, :] = token.vector

    i += 1

print(i)

9557


In [81]:
# Spot check: vector stored for the second token of the title in row 6000.
print(titleArr[6000, 1])

[-2.04027995e-01  3.53692994e-02  2.17341006e-01 -1.41967997e-01
  2.21268997e-01 -1.31978005e-01  1.94425002e-01 -7.84751959e-03
 -8.55292007e-03  6.28777966e-02 -3.96941006e-02 -9.26643983e-02
 -4.14449014e-02 -9.75648984e-02 -2.07541004e-01 -1.99394003e-01
 -1.09490998e-01 -3.92311990e-01  2.31323004e-01  4.59965989e-02
 -6.67273030e-02  1.57240003e-01  1.97433993e-01 -1.19071998e-01
 -1.70100003e-01 -1.33843005e-01  1.01182997e-01 -1.53658003e-01
 -1.69984996e-01  2.69062012e-01 -1.23778000e-01 -1.90770000e-01
  2.26792004e-02 -5.71665987e-02  2.31070995e-01  1.70316994e-01
 -2.01279998e-01  6.42450973e-02 -2.13558003e-01 -3.47654000e-02
 -7.45249018e-02 -9.79702026e-02 -5.53206988e-02 -2.43674009e-03
  1.37740999e-01 -2.28818998e-01 -2.13365003e-01 -1.40552998e-01
  9.11056027e-02 -2.03635007e-01 -2.61175004e-03  2.16067001e-01
  9.76639017e-02  3.51377986e-02  1.04653999e-01  8.44338983e-02
 -4.71705012e-02  1.72894001e-01  6.75649010e-03 -1.06210001e-01
  2.48066001e-02 -5.55003

In [82]:
from numpy import asarray
from numpy import save

# Make sure the output folder exists before writing; nothing earlier in the
# notebook creates it, and exist_ok keeps this cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

# Persist the padded title and abstract vector arrays as .npy files.
save(output_path + "20200420_title_vectors.npy", titleArr)
save(output_path + "20200420_abstract_vectors.npy", abstractArr)