In [1]:
import gzip
import pickle
from os.path import join, expanduser
import numpy as np
import pandas as pd
import json

In [2]:
input_path = expanduser('~/data/jstor/latest/')
output_path = expanduser('~/data/jstor/latest/')
prefix = 'ngrams_dict'
dois = ['10.2307/3069368', '10.2307/20159507']

with open(join(output_path, 'registry_json.txt')) as file:
    registry_dict = json.loads(file.read())

In [3]:
def get_chunk(fpath, prefix, index):
    """
    return chunk number index
    """
    fname = join(fpath, '{0}_{1}.pgz'.format(prefix, index))
    with gzip.open(fname) as fp:
        item = pickle.load(fp)
        return item
    return None

def get_articles(dois, registry_dict, fpath, prefix):
    """
    get articles by dois
    """
    doi_chunk_map = find_doi_chunk_map(dois, registry_dict)
    chunks_to_load = list(set(doi_chunk_map.values()))
    chunks_dict = {ii: get_chunk(fpath, prefix, ii) for ii in chunks_to_load}
    doi_ngram_dict = {doi:chunks_dict[doi_chunk_map[doi]][doi] for doi in dois}
    return doi_ngram_dict

def find_doi_chunk_map(dois, registry_dict):
    return {k : [kk for kk, val in registry_dict.items() if k in val][0] for k in dois}

def load_ngram_dist(fpath, order, thr=2):
    fname = join(fpath, 'corpus_ngram_dist_n_{0}_thr_{1}.pgz'.format(order, thr))
    with gzip.open(fname) as fp:
        distr_dict = pickle.load(fp)
    if order == 1:
        distr_dict = {k[0]: v for k, v in distr_dict.items()}
    return distr_dict

In [4]:
dch_dict = get_articles(dois, registry_dict, input_path, prefix)

In [5]:
# dch_dict[dois[0]][3]
ngram_order = 2

In [6]:
distr_dict = load_ngram_dist(input_path, ngram_order)

In [7]:
keys = sorted(distr_dict.keys())
# vals = np.array([distr_dict[k] for k in keys])
# df = pd.DataFrame(vals, keys, columns=['f', 'f-', 'f+', 'n']).sort_values('n')
# df.tail(20)

In [8]:
# a single article
article = dch_dict[dois[0]]
ngram_positions_list = article[ngram_order]
ngram_counts = {k: len(v) for k, v in ngram_positions_list.items()}
ngram_number = sum(ngram_counts.values())
ngram_freqs = {k: v/ngram_number for k, v in ngram_counts.items() if k in distr_dict.keys()}
len(ngram_freqs)

2669

In [9]:
ngram_freqs_outstanding = {k: v/distr_dict[k][0] for k, v in ngram_freqs.items()} 
len(ngram_counts), len(ngram_freqs), len(ngram_freqs_outstanding)

(3476, 2669, 2669)

In [12]:
keys = sorted(ngram_freqs_outstanding.keys())
vals = np.array([(ngram_freqs_outstanding[k], ngram_counts[k], distr_dict[k][-1], 1./distr_dict[k][-1]) 
                 for k in keys])
print(len(keys), vals.shape)
df = pd.DataFrame(vals, keys, columns=['tf_idf', 'n_article', 
                                       'n_corpus', 'inv_n_corpus']).sort_values(['n_article', 'inv_n_corpus'])
df.tail(20)

2669 (2669, 4)


Unnamed: 0,tf_idf,n_article,n_corpus,inv_n_corpus
"(finkelstein, hambrick)",113.68034,13.0,558.0,0.001792
"(management, team's)",773.58085,13.0,82.0,0.012195
"(expansiveness, firm's)",4879.509979,13.0,13.0,0.076923
"(demographic, effect)",481.53059,15.0,152.0,0.006579
"(tmt, characteristic)",1045.609281,15.0,70.0,0.014286
"(firm's, global)",2614.023203,15.0,28.0,0.035714
"(firm, tenure)",523.974226,16.0,149.0,0.006711
"(relationship, top)",772.99168,16.0,101.0,0.009901
"(tmt, demographic)",3318.066786,17.0,25.0,0.04
"(educational, heterogeneity)",3606.594332,17.0,23.0,0.043478


In [13]:
ngram_order = 1
ngram_positions_list = article[ngram_order]

In [30]:
ixs = ngram_positions_list['hypothesis']

In [19]:
fname = expanduser('~/data/jstor/latest/corpus_clean_dict.pgz')
with gzip.open(fname) as fp:
    articles_ds = pickle.load(fp)

In [20]:
carticle = articles_ds[dois[0]]

In [31]:
list(range(-1, 2))

[-1, 0, 1]

In [32]:
for i in ixs:
    js = range(i-1, i + 2)
    for j in js:
        phrase = ' '.join(carticle[j])
        doc = nlp(phrase)
        for chunk in doc.noun_chunks:
            print(chunk.text, '|', chunk.root.text, chunk.root.dep_,
                  chunk.root.head.text)

we | we nsubj hypothesized
such breadth | breadth nsubjpass related
diversity | diversity conj breadth
global strategic posture | posture pobj to
a positive relationship | relationship attr be
a top management team's international work experience | experience pobj between
the expansiveness | expansiveness conj experience
its firm's global strategic posture | posture pobj of
a positive relationship | relationship attr be
a top management team's international work experience | experience pobj between
the expansiveness | expansiveness conj experience
its firm's global strategic posture | posture pobj of
Hypothesis | Hypothesis nsubj lb
a positive relationship | relationship attr be
a top management team's educational heterogeneity | heterogeneity pobj between
the expansiveness | expansiveness conj heterogeneity
its firm's global strategic posture | posture pobj of
a positive relationship | relationship attr be
a top management team's educational heterogeneity | heterogeneity pobj between


the complexity | complexity pobj Given
characterizing firms | firms pobj Given
expansive global positions | positions pobj with
hence the need | need conj firms
both diversity | diversity pobj for
high levels | levels conj diversity
teamwork | teamwork pobj of
their top management teams | teams pobj in
Kim Mauborgne | Mauborgne appos teams
Weick Van Orden | Orden conj Mauborgne
this evidence | evidence nsubj suggests
the relationship | relationship nsubjpass curvilinear
TMT characteristics | characteristics pobj between
global strategic posture | posture conj characteristics
an inverted U | U dobj curvilinear
the complexity | complexity pobj Given
characterizing firms | firms pobj Given
expansive global positions | positions pobj with
hence the need | need conj firms
both diversity | diversity pobj for
high levels | levels conj diversity
teamwork | teamwork pobj of
their top management teams | teams pobj in
Kim Mauborgne | Mauborgne appos teams
Weick Van Orden | Orden conj Mauborgne
th

these hypotheses | hypotheses dobj test
we | we nsubj added
a squared term | term dobj added
model | model pobj in
none | none dobj added
the squared - coef - 2 Summary | Summary pobj of
Results | Results pobj of
OLS Fixed - Regression Analyses | Analyses pobj of
Global Strategic Posturea Variable | Variable pobj for
2 Model | Model appos Model
3 Model | Model appos Model
18 TMT educational heterogeneity | heterogeneity nsubj lb
TMT functional heterogeneity | heterogeneity ROOT heterogeneity
- 07t TMT firm tenure heterogeneity | heterogeneity ROOT heterogeneity
i | i nsubj d
12 Curvilinear effects | effects nsubj squared
TMT international experience | experience nsubj squared
educational heterogeneity | heterogeneity nsubj squared
TMT functional heterogeneity squared 2c - 05 TMT firm tenure heterogeneity | heterogeneity nsubj squared
29 TMT functional heterogeneity | heterogeneity ROOT heterogeneity
Controls Organizational size | size appos heterogeneity
07t | t ROOT t
07t | t ROOT t
0

In [11]:
import spacy

nlp = spacy.load('en_core_web_sm')

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [27]:
doc = nlp(phrase)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Nevertheless nevertheless ADV RB advmod Xxxxx True False
, , PUNCT , punct , False False
we -PRON- PRON PRP nsubj xx True True
found find VERB VBD ROOT xxxx True False
no no DET DT det xx True True
support support NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
the the DET DT det xxx True True
curvilinear curvilinear NOUN NN compound xxxx True False
predictions prediction NOUN NNS pobj xxxx True False
the the DET DT det xxx True True
second second ADJ JJ amod xxxx True False
set set NOUN NN oprd xxx True False
of of ADP IN prep xx True True
hypotheses hypothesis NOUN NNS pobj xxxx True False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
our -PRON- ADJ PRP$ poss xxx True True
results result NOUN NNS nsubj xxxx True False
suggest suggest VERB VBP conj xxxx True False
instead instead ADV RB advmod xxxx True False
a a DET DT det x True True
refinement refinement NOUN NN dobj xxxx True False
to to ADP IN prep xx True True
such such ADJ JJ amod xxxx True Tr

In [29]:
for chunk in doc.noun_chunks:
    print(chunk.text, '|', chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

we | we nsubj found
no support | support dobj found
the curvilinear predictions | predictions pobj for
hypotheses | hypotheses pobj of
our results | results nsubj suggest
a refinement | refinement dobj suggest
