In [1]:
import gzip
import pickle
from os.path import join, expanduser
import pandas as pd
import json

from article_analysis.parse import get_chunk, get_articles, find_doi_chunk_map, load_ngram_dist
import article_analysis.parse_ent as aape
# Enable IPython's autoreload extension so edits to the helper module are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 1: before running a cell, reload only the modules registered via %aimport.
%autoreload 1
# Register the module (imported above as `aape`) for automatic reloading.
%aimport article_analysis.parse_ent

In [2]:
# --- Run configuration ---------------------------------------------------
# head: a positive value switches on truncation of the DOI list in the
#       processing cell; verbose: print progress while processing.
head = 50
verbose = True

# NLP pipeline object handed to aape.get_np_candidates further down.
nlp = aape.init_nlp()

# Data locations. Input and output currently point at the same directory.
input_path = expanduser('~/data/jstor/latest/')
output_path = expanduser('~/data/jstor/latest/')
# File-name prefix passed to get_articles().
prefix = 'ngrams_dict'

# Registry: its values are lists of DOIs (flattened in the processing cell).
# json.load parses directly from the file handle.
with open(join(output_path, 'registry_json.txt')) as file:
    registry_dict = json.load(file)


# Cleaned corpus, indexed by DOI. The path is derived from input_path rather
# than repeating the directory literal, so relocating the data needs one edit.
fname = join(input_path, 'corpus_clean_dict.pgz')
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load corpus files that come from a trusted source.
with gzip.open(fname) as fp:
    articles_ds = pickle.load(fp)

In [3]:
# Unigrams whose recorded positions seed the noun-phrase search below.
keywords = [
    'hypothesis',
    'hypotheses',
    'table',
]

In [5]:
# Flatten the registry's per-key DOI lists into a single list.
all_dois_flat = [v for sublist in registry_dict.values() for v in sublist]

# Optionally restrict the run to the first `head` DOIs.
# (This used to slice with a hard-coded 50, silently ignoring `head`.)
if head > 0:
    all_dois_flat = all_dois_flat[:head]

# Split into consecutive batches. Slicing past the end of a list is safe,
# so the last batch is simply shorter when the total is not a multiple of
# batch_size.
batch_size = 10
dois_batched = [all_dois_flat[i:i + batch_size]
                for i in range(0, len(all_dois_flat), batch_size)]

# Key used to select the entry of each article's n-gram structure that is
# searched for the keywords (loop-invariant, so set once).
ngram_order = 1

# One DataFrame per DOI; concatenated in a later cell.
df_agg = []

for j, dois_batch in enumerate(dois_batched):
    if verbose:
        print('batch number {0}'.format(j))
    # Load the n-gram data for every DOI of this batch in one call.
    dch_dict = get_articles(dois_batch, registry_dict, input_path, prefix)
    for doi in dois_batch:
        article_ngrams = dch_dict[doi]
        ngram_positions_list = article_ngrams[ngram_order]

        # Gather the recorded positions of every keyword present in the article.
        ixs = []
        for keyword in keywords:
            if keyword in ngram_positions_list:
                ixs += ngram_positions_list[keyword]
        # Per-DOI progress now follows `verbose`, like the batch message.
        if verbose:
            print('doi {0}, len ixs {1}'.format(doi, len(ixs)))

        # Extract candidate noun phrases around the keyword positions and
        # tabulate them; only `table` is used from the returned values.
        carticle = articles_ds[doi]
        chunks = aape.get_np_candidates(ixs, carticle, nlp, 1)
        total_counts, total_counts_raw, table, tree_dict = aape.choose_popular_np_phrases(chunks)
        df = pd.DataFrame(table, columns=['root', 'np', 'count'])
        df['doi'] = doi
        df_agg.append(df)

batch number 0
doi 10.1525/sop.2004.47.2.189, len ixs 6
doi 10.2307/1556338, len ixs 20
doi 10.2307/1556386, len ixs 18
doi 10.2307/20141827, len ixs 0
doi 10.2307/20159047, len ixs 7
doi 10.2307/20159077, len ixs 0
doi 10.2307/20159138, len ixs 3
doi 10.2307/20159163, len ixs 0
doi 10.2307/20159328, len ixs 2
doi 10.2307/20159359, len ixs 7
batch number 1
doi 10.2307/20159377, len ixs 2
doi 10.2307/20159399, len ixs 1
doi 10.2307/20159415, len ixs 0
doi 10.2307/20159471, len ixs 0
doi 10.2307/20159591, len ixs 14
doi 10.2307/20159644, len ixs 11
doi 10.2307/20159647, len ixs 25
doi 10.2307/20159684, len ixs 27
doi 10.2307/20159692, len ixs 2
doi 10.2307/20159723, len ixs 0
batch number 2
doi 10.2307/20159762, len ixs 25
doi 10.2307/20159846, len ixs 18
doi 10.2307/20628720, len ixs 7
doi 10.2307/20799474, len ixs 33
doi 10.2307/23070653, len ixs 4
doi 10.2307/2392364, len ixs 2
doi 10.2307/2392454, len ixs 16
doi 10.2307/2392667, len ixs 14
doi 10.2307/2392740, len ixs 23
doi 10.2307/

In [6]:
# Stack the per-DOI frames row-wise; each frame keeps its own row labels.
df0 = pd.concat(objs=df_agg, axis=0)

In [8]:
# Peek at the first five rows of the combined table.
df0.head(5)

Unnamed: 0,root,np,count,doi
0,contribution,"(maintenance, contribution)",4,10.2307/1556338
1,contribution,"(task, contribution)",3,10.2307/1556338
2,input,"(maintenance, input)",2,10.2307/1556338
3,input,"(task, input)",2,10.2307/1556338
4,coefficient,"(beta, coefficient)",2,10.2307/1556338


In [9]:
# Display the combined table. The row labels restart at 0 for each DOI
# because the per-DOI frames were concatenated without renumbering.
df0

Unnamed: 0,root,np,count,doi
0,contribution,"(maintenance, contribution)",4,10.2307/1556338
1,contribution,"(task, contribution)",3,10.2307/1556338
2,input,"(maintenance, input)",2,10.2307/1556338
3,input,"(task, input)",2,10.2307/1556338
4,coefficient,"(beta, coefficient)",2,10.2307/1556338
5,split,"(median, split)",3,10.2307/1556338
6,generosity,"(evaluation, generosity)",3,10.2307/1556338
0,locus,"(external, locus)",3,10.2307/1556386
1,locus,"(internal, locus)",3,10.2307/1556386
2,attitude,"(job, attitude)",6,10.2307/1556386


In [22]:
# Fix the column order, sort by root (ascending) and then by count
# (descending), and renumber the rows 0..n-1.
df0 = (
    df0[['root', 'np', 'count', 'doi']]
    .sort_values(['root', 'count'], ascending=[True, False])
    .reset_index(drop=True)
)
# Build the target path with join() so it is valid whether or not
# output_path ends with a path separator (string formatting silently
# produced '<dir>np_data.csv.gz' when it did not).
df0.to_csv(join(output_path, 'np_data.csv.gz'), compression='gzip')

In [24]:
# Total rows versus rows remaining after keeping one row per distinct 'np' value.
(df0.shape, df0.drop_duplicates(subset='np').shape)

((431, 4), (411, 4))