In [1]:
import gzip
import pickle
from os.path import join, expanduser
import numpy as np
import pandas as pd
import json
import networkx as nx
import spacy

from article_analysis.parse import get_chunk, get_articles, find_doi_chunk_map, load_ngram_dist
import article_analysis.parse_ent as aape

%load_ext autoreload
%autoreload 1
%aimport article_analysis.parse_ent

In [2]:
# Build the NLP object once through the project helper; it is handed to
# aape.get_np_candidates further down.
# NOTE(review): presumably a spaCy pipeline -- confirm in article_analysis.parse_ent.
nlp = aape.init_nlp()

In [3]:
# Data locations and the two sample DOIs explored in this notebook.
input_path = expanduser('~/data/jstor/latest/')
output_path = expanduser('~/data/jstor/latest/')
prefix = 'ngrams_dict'
dois = ['10.2307/3069368', '10.2307/20159507']

# Parse the registry JSON straight from the file handle; its values are
# iterated as collections of DOIs in the cells below.
registry_path = join(output_path, 'registry_json.txt')
with open(registry_path) as registry_file:
    registry_dict = json.load(registry_file)

In [4]:
# One entry per registry value, in the registry's iteration order.
all_dois_per_batch = list(registry_dict.values())
# The same entries collapsed into a single flat list.
all_dois_flat = [doi for doi_group in all_dois_per_batch for doi in doi_group]

In [5]:
# Split the full DOI list into consecutive batches of `batch_size`
# (the final batch may be shorter). The slice is taken from the same list
# whose length drives the range; slicing the two-element `dois` sample here
# would yield one short batch followed by empty lists.
batch_size = 10
dois_batched = [all_dois_flat[start:start + batch_size]
                for start in range(0, len(all_dois_flat), batch_size)]

In [6]:
# Fetch the n-gram data for the sample DOIs through the project helper.
# The result is indexed below as dch_dict[doi][ngram_order].
dch_dict = get_articles(dois, registry_dict, input_path, prefix)
# get ngrams
# ngram_order = 1
# distr_dict = load_ngram_dist(input_path, ngram_order)
# keys = sorted(distr_dict.keys())

In [7]:
# Preview the first two order-1 (unigram) entries of the first sample article.
# Each key is an n-gram and each value a list of integers
# (presumably token positions -- TODO confirm against article_analysis.parse).
# The item list is built once and sliced, instead of rebuilding the key list
# for every item.
dict(list(dch_dict[dois[0]][1].items())[:2])

{'c': [0, 327, 344, 390, 442],
 'academy': [0, 326, 349, 375, 378, 389, 418, 424, 462, 465, 473, 488, 491]}

In [9]:
# Work with a single article: the first sample DOI.
ngram_order = 1
article = dch_dict[dois[0]]
# Despite the "_list" suffix, this object is indexed by n-gram further down
# (e.g. ['hypothesis']), i.e. it is used as a mapping from n-gram to a list.
ngram_positions_list = article[ngram_order]
# ngram_counts = {k: len(v) for k, v in ngram_positions_list.items()}
# ngram_number = sum(ngram_counts.values())
# ngram_freqs = {k: v/ngram_number for k, v in ngram_counts.items() if k in distr_dict.keys()}
# len(ngram_freqs)

# ngram_freqs_outstanding = {k: v/distr_dict[k][0] for k, v in ngram_freqs.items()} 
# len(ngram_counts), len(ngram_freqs), len(ngram_freqs_outstanding)

# keys = sorted(ngram_freqs_outstanding.keys())
# vals = np.array([(ngram_freqs_outstanding[k], ngram_counts[k], distr_dict[k][-1], 1./distr_dict[k][-1]) 
#                  for k in keys])
# print(len(keys), vals.shape)
# df = pd.DataFrame(vals, keys, columns=['tf_idf', 'n_article', 
#                                        'n_corpus', 'inv_n_corpus']).sort_values(['n_article', 'inv_n_corpus'])
# df.tail(20)

In [10]:
# Order-1 (unigram) entries of the selected article.
# NOTE(review): repeats the assignments made in the previous cell; redundant
# on a top-to-bottom run.
ngram_order = 1
ngram_positions_list = article[ngram_order]

In [11]:
# Order-2 (bigram) entries of the same article; the membership check in the
# next cell looks keys up as tuples of words.
# Note that `ngram_order` stays at 2 for any later cell that reads it.
ngram_order = 2
ngram_positions_list2 = article[ngram_order]

In [12]:
# Does the bigram ('variable', 'name') occur among this article's order-2 keys?
('variable', 'name') in ngram_positions_list2

False

In [13]:
# Collect, in order, the integers stored for each anchor unigram; a missing
# word raises KeyError.
anchor_words = ('hypothesis', 'table')
ixs = [ix for word in anchor_words for ix in ngram_positions_list[word]]

In [14]:
# Load the gzip-compressed pickle; the result is indexed by DOI in the next cell.
# NOTE: unpickling executes arbitrary code -- only load files you produced or trust.
fname = expanduser('~/data/jstor/latest/corpus_clean_dict.pgz')
with gzip.open(fname, 'rb') as pickled_corpus:
    articles_ds = pickle.load(pickled_corpus)

In [15]:
# Entry of the loaded corpus for the first sample DOI (the same article as
# `article` above).
carticle = articles_ds[dois[0]]

In [16]:
# Extract noun-phrase candidates for the collected indices from the article.
# NOTE(review): the trailing literal 1 is passed positionally and its meaning
# is not visible here -- confirm against aape.get_np_candidates.
chunks = aape.get_np_candidates(ixs, carticle, nlp, 1)

In [17]:
# Aggregate the candidates. Judging from the outputs displayed below:
#   total_counts     -- (root word, count) pairs
#   total_counts_raw -- (noun-phrase tuple, count) pairs
#   table            -- rows rendered below as root / np / count
#   tree_dict        -- root word -> {noun-phrase tuple: networkx DiGraph}
total_counts, total_counts_raw, table, tree_dict = aape.choose_popular_np_phrases(chunks)

In [18]:
# Number of entries in the per-root counts vs. the per-phrase counts.
len(total_counts), len(total_counts_raw)

(68, 141)

In [19]:
# Render the aggregated rows as a DataFrame with one row per (root, noun phrase).
pd.DataFrame(table, columns=['root', 'np', 'count'])

Unnamed: 0,root,np,count
0,posture,"(global, strategic, posture)",8
1,posture,"(firm, global, strategic, posture)",12
2,posture,"(expansive, global, strategic, posture)",1
3,relationship,"(positive, relationship)",4
4,relationship,"(linear, relationship)",1
5,relationship,"(u, relationship)",4
6,relationship,"(previously, propose, linear, relationship)",4
7,relationship,"(main, effect, relationship)",1
8,relationship,"(moderate, relationship)",1
9,relationship,"(negative, linear, relationship)",1


In [20]:
# Peek at the first 5 per-root counts and the first 25 per-phrase counts.
total_counts[:5], total_counts_raw[:25]

([('heterogeneity', 30),
  ('posture', 21),
  ('relationship', 17),
  ('characteristic', 8),
  ('uncertainty', 8)],
 [(('firm', 'global', 'strategic', 'posture'), 12),
  (('global', 'strategic', 'posture'), 8),
  (('environmental', 'uncertainty'), 6),
  (('positive', 'relationship'), 4),
  (('tmt', 'characteristic'), 4),
  (('u', 'relationship'), 4),
  (('previously', 'propose', 'linear', 'relationship'), 4),
  (('educational', 'heterogeneity'), 4),
  (('functional', 'heterogeneity'), 4),
  (('management', 'team', 'international', 'work', 'experience'), 3),
  (('management', 'team', 'educational', 'heterogeneity'), 3),
  (('management', 'team', 'functional', 'heterogeneity'), 3),
  (('management', 'team', 'firm', 'tenure', 'heterogeneity'), 3),
  (('management', 'team'), 3),
  (('follow', 'hypothesis'), 2),
  (('tmt', 'demographic'), 2),
  (('fix', 'model'), 2),
  (('upper', 'echelon', 'study'), 2),
  (('independent', 'variable'), 2),
  (('tmt', 'turnover'), 2),
  (('bivariate', 'corre

In [21]:
# Inspect the entries stored under the root 'heterogeneity'.
tree_dict['heterogeneity']

{('management',
  'team',
  'educational',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1b0d2cd9e8>,
 ('management',
  'team',
  'functional',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1b0d2cdef0>,
 ('management',
  'team',
  'firm',
  'tenure',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1b0d2ec2b0>,
 ('team',
  'member',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1b0d2ec9b0>,
 ('excessive',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1b0d2ecdd8>,
 ('tmt', 'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1ad31bc438>,
 ('educational',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1a36cc2d30>,
 ('firm',
  'tenure',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1a36cc2da0>,
 ('functional',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1a36cc2e48>,
 ('18',
  'tmt',
  'educational',
  'heterogeneity'): <networkx.classes.digraph.DiGraph at 0x1a36ce0518>,
 ('tmt',
  'fu

In [34]:
# Pull one sample candidate for inspection: c0 is item 1 of chunks[5],
# c is the first element of c0.
c0 = chunks[5][1]
c = c0[0]

In [32]:
# Check whether the sampled element consists only of numeric characters.
c.isnumeric()

False

In [33]:
# Confirm the type of the sampled element.
type(c)

str

In [35]:
# Confirm the type of the container the element was taken from.
type(c0)

tuple