In [20]:
import gzip
import pickle
from os.path import join, expanduser
import numpy as np
import pandas as pd
import json
import networkx as nx
from article_analysis.parse import get_chunk, get_articles, find_doi_chunk_map, load_ngram_dist

In [2]:
import spacy

nlp = spacy.load('en')

In [3]:
not_stop_words = ['top', 'bottom']
stop_words = ["'s"]

for w in not_stop_words:
#     nlp.Defaults.stop_words.remove(w)
    nlp.vocab[w].is_stop = False

for w in stop_words:
    nlp.vocab[w].is_stop = True

# nlp.Defaults.stop_words -= set(not_stop_words)
sentence = nlp("the bottom is the new top's word")
print([(s, s.is_stop) for s in sentence])
# sentence[1].is_stop, sentence[5].is_stop

[(the, True), (bottom, False), (is, True), (the, True), (new, False), (top, False), ('s, True), (word, False)]


In [29]:
nlp.vocab['The'].is_stop

False

In [4]:
input_path = expanduser('~/data/jstor/latest/')
output_path = expanduser('~/data/jstor/latest/')
prefix = 'ngrams_dict'
dois = ['10.2307/3069368', '10.2307/20159507']

with open(join(output_path, 'registry_json.txt')) as file:
    registry_dict = json.loads(file.read())

In [5]:
# def get_chunk(fpath, prefix, index):
#     """
#     return chunk number index
#     """
#     fname = join(fpath, '{0}_{1}.pgz'.format(prefix, index))
#     with gzip.open(fname) as fp:
#         item = pickle.load(fp)
#         return item
#     return None

# def get_articles(dois, registry_dict, fpath, prefix):
#     """
#     get articles by dois
#     """
#     doi_chunk_map = find_doi_chunk_map(dois, registry_dict)
#     chunks_to_load = list(set(doi_chunk_map.values()))
#     chunks_dict = {ii: get_chunk(fpath, prefix, ii) for ii in chunks_to_load}
#     doi_ngram_dict = {doi:chunks_dict[doi_chunk_map[doi]][doi] for doi in dois}
#     return doi_ngram_dict

# def find_doi_chunk_map(dois, registry_dict):
#     return {k : [kk for kk, val in registry_dict.items() if k in val][0] for k in dois}

# def load_ngram_dist(fpath, order, thr=2):
#     fname = join(fpath, 'corpus_ngram_dist_n_{0}_thr_{1}.pgz'.format(order, thr))
#     with gzip.open(fname) as fp:
#         distr_dict = pickle.load(fp)
#     if order == 1:
#         distr_dict = {k[0]: v for k, v in distr_dict.items()}
#     return distr_dict

In [6]:
# get articles
dch_dict = get_articles(dois, registry_dict, input_path, prefix)

ngram_order = 2
# get ngrams
distr_dict = load_ngram_dist(input_path, ngram_order)

In [7]:
# three grams
{k : v for k, v in dch_dict[dois[0]][3].items() if k in list(dch_dict[dois[0]][3].keys())[:2]}

{('c', 'academy', 'management'): [0],
 ('academy', 'management', 'journal'): [0,
  326,
  349,
  378,
  418,
  465,
  473,
  488,
  491]}

In [8]:
keys = sorted(distr_dict.keys())
# vals = np.array([distr_dict[k] for k in keys])
# df = pd.DataFrame(vals, keys, columns=['f', 'f-', 'f+', 'n']).sort_values('n')
# df.tail(20)

In [9]:
# a single article
article = dch_dict[dois[0]]
ngram_positions_list = article[ngram_order]
ngram_counts = {k: len(v) for k, v in ngram_positions_list.items()}
ngram_number = sum(ngram_counts.values())
ngram_freqs = {k: v/ngram_number for k, v in ngram_counts.items() if k in distr_dict.keys()}
len(ngram_freqs)

ngram_freqs_outstanding = {k: v/distr_dict[k][0] for k, v in ngram_freqs.items()} 
len(ngram_counts), len(ngram_freqs), len(ngram_freqs_outstanding)

keys = sorted(ngram_freqs_outstanding.keys())
vals = np.array([(ngram_freqs_outstanding[k], ngram_counts[k], distr_dict[k][-1], 1./distr_dict[k][-1]) 
                 for k in keys])
print(len(keys), vals.shape)
df = pd.DataFrame(vals, keys, columns=['tf_idf', 'n_article', 
                                       'n_corpus', 'inv_n_corpus']).sort_values(['n_article', 'inv_n_corpus'])
df.tail(20)

2669 (2669, 4)


Unnamed: 0,tf_idf,n_article,n_corpus,inv_n_corpus
"(finkelstein, hambrick)",113.68034,13.0,558.0,0.001792
"(management, team's)",773.58085,13.0,82.0,0.012195
"(expansiveness, firm's)",4879.509979,13.0,13.0,0.076923
"(demographic, effect)",481.53059,15.0,152.0,0.006579
"(tmt, characteristic)",1045.609281,15.0,70.0,0.014286
"(firm's, global)",2614.023203,15.0,28.0,0.035714
"(firm, tenure)",523.974226,16.0,149.0,0.006711
"(relationship, top)",772.99168,16.0,101.0,0.009901
"(tmt, demographic)",3318.066786,17.0,25.0,0.04
"(educational, heterogeneity)",3606.594332,17.0,23.0,0.043478


In [10]:
ngram_order = 1
ngram_positions_list = article[ngram_order]

In [11]:
ixs = ngram_positions_list['hypothesis'] + ngram_positions_list['hypothesis']

In [12]:
fname = expanduser('~/data/jstor/latest/corpus_clean_dict.pgz')
with gzip.open(fname) as fp:
    articles_ds = pickle.load(fp)

In [13]:
carticle = articles_ds[dois[0]]

In [67]:
chunks = []
inds = [range(i-1, i+2) for i in ixs]
overlap_indices = sorted(list(set([x for sublist in inds for x in sublist])))

for j in overlap_indices:
    phrase = ' '.join(carticle[j])
    doc = nlp(phrase)
    for chunk in doc.noun_chunks:
        supp_chunk = [c for c in chunk if not c.is_stop and not c.text.lower() in nlp.Defaults.stop_words]
        if supp_chunk and chunk.root.pos_ == 'NOUN':
            edge_slist = [[(c.lemma_, d.lemma_) for d in c.children if d in supp_chunk] for c in supp_chunk]
            edge_list = [e for sublist in edge_slist for e in sublist]
            g = nx.Graph()
            edge_list += [('#', chunk.root.lemma_)]
            g.add_edges_from(edge_list)
            supp_chunk2 = [c.lemma_ for c in supp_chunk]
            chunks.append((chunk.root.lemma_, tuple(supp_chunk2), g))

a positive relationship
a positive relationship
a positive relationship
a positive relationship
the relationship
the linear relationships
U - relationship
U - relationship
U - relationship
U - relationship
the relationship
the previously proposed linear relationships
the previously proposed linear relationships
the previously proposed linear relationships
the previously proposed linear relationships
the main effect relationships
the relationship
the relationship
such relationships
the moderated relationships
a negative linear relationship
the relationship
such relationships
an inverted - or curvilinear relationship


In [68]:
chunks_dict = {}
for r, np, tr in chunks:
    if r in chunks_dict:
        if np in chunks_dict[r].keys():
             chunks_dict[r][np] += 1
        else:
            chunks_dict[r][np] = 1
    else:
        chunks_dict[r] = {np: 1}

In [69]:
len(chunks_dict)

109

In [70]:
{k: chunks_dict[k] for k in list(chunks_dict.keys())[:5]}

{'breadth': {('breadth',): 2},
 'diversity': {('diversity',): 3},
 'posture': {('global', 'strategic', 'posture'): 8,
  ('firm', 'global', 'strategic', 'posture'): 12,
  ('expansive', 'global', 'strategic', 'posture'): 1},
 'relationship': {('positive', 'relationship'): 4,
  ('relationship',): 7,
  ('linear', 'relationship'): 1,
  ('u', '-', 'relationship'): 4,
  ('previously', 'propose', 'linear', 'relationship'): 4,
  ('main', 'effect', 'relationship'): 1,
  ('moderate', 'relationship'): 1,
  ('negative', 'linear', 'relationship'): 1,
  ('invert', '-', 'curvilinear', 'relationship'): 1},
 'experience': {('management',
   'team',
   'international',
   'work',
   'experience'): 3,
  ('international', 'work', 'experience'): 1,
  ('tmt', 'international', 'experience'): 1,
  ('dramatically', 'different', 'functional', 'experience'): 1}}

In [71]:
total_counts = {k: sum(chunks_dict[k].values()) for k in list(chunks_dict.keys())}

In [78]:
pop_keys = sorted(total_counts, key=total_counts.get, reverse=True)
root_candidates = [k for k in pop_keys if total_counts[k] > 5 and len(k) > 2]
root_candidates

['heterogeneity',
 'relationship',
 'posture',
 'hypothesis',
 'uncertainty',
 'model',
 'expansiveness',
 'level',
 'set',
 'characteristic',
 'effect',
 'team',
 'variable',
 'result',
 'experience']

In [27]:
chunks_dict2 = {k: v for k, v in chunks_dict.items() if len(k) > 2}
print(len(chunks_dict2))

125


In [24]:
chunks_dict3 = {k: {q: w for q, w in v.items() if w > 1 and len(q) > 2} for k, v in chunks_dict2.items() if len(k) > 2}
chunks_dict4 = {k: v for k, v in chunks_dict3.items() if v}
print(len(chunks_dict4))

7


In [25]:
chunks_dict4

{'posture': {('global', 'strategic', 'posture'): 8,
  ('firm', "'s", 'global', 'strategic', 'posture'): 12},
 'relationship': {('U', '-', 'relationship'): 4},
 'experience': {('top',
   'management',
   'team',
   "'s",
   'international',
   'work',
   'experience'): 3},
 'heterogeneity': {('top',
   'management',
   'team',
   "'s",
   'educational',
   'heterogeneity'): 3,
  ('top', 'management', 'team', "'s", 'functional', 'heterogeneity'): 3,
  ('top', 'management', 'team', "'s", 'firm', 'tenure', 'heterogeneity'): 3},
 'studies': {('upper', 'echelons', 'studies'): 2},
 'teams': {('top', 'management', 'teams'): 2},
 'relationships': {('previously', 'proposed', 'linear', 'relationships'): 4}}

In [129]:
unis[(cnts > 2) & np.array([len(x) > 1 for x in unis]) & chunk.root.pos_]

array([('07', 't'), ('Environmental', 'uncertainty'),
       ('TMT', 'characteristics'), ('U', '-', 'relationship'),
       ('educational', 'heterogeneity'),
       ('firm', "'s", 'global', 'strategic', 'posture'),
       ('functional', 'heterogeneity'),
       ('global', 'strategic', 'posture'), ('positive', 'relationship'),
       ('previously', 'proposed', 'linear', 'relationships'),
       ('top', 'management', 'team', "'s", 'educational', 'heterogeneity'),
       ('top', 'management', 'team', "'s", 'firm', 'tenure', 'heterogeneity'),
       ('top', 'management', 'team', "'s", 'functional', 'heterogeneity'),
       ('top', 'management', 'team', "'s", 'international', 'work', 'experience'),
       ('top', 'team')], dtype=object)

In [139]:
chunk.root.pos_

'NOUN'

In [56]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS | {"'s'"}
spacy_stopwords = spacy_stopwords - {'top'}
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:100])
# stop_words.append

Number of stop words: 305
First ten stop words: ['himself', 'each', 'if', 'will', 'no', 'others', 'sixty', 'he', 'did', 'we', 'could', 'the', 'using', 'say', 'whither', 'nowhere', 'ours', 'once', 'third', 'thereby', 'least', 'part', 'further', 'seem', 'whose', 'another', 'some', 'one', 'back', 'by', 'forty', 'has', 'last', 'upon', 'except', 'various', 'too', 'though', 'often', 'yourself', 'both', 'any', 'eight', 'during', 'former', 'next', 'had', 'almost', 'than', 'either', 'noone', 'from', 'make', 'thru', 'else', 'of', 'onto', 'see', 'hers', 'they', 'may', 'been', 'per', 'mostly', 'here', 'sometime', 'you', 'might', 'sometimes', 'hundred', 'wherever', 'into', 'off', 'yet', 'seemed', 'yourselves', 'these', 'otherwise', 'via', 'anyone', 'although', 'whoever', "'s'", 'thereafter', 'across', 'around', 'several', 'show', 'whereupon', 'everything', 'myself', 'do', 'along', 'then', 'everyone', 'empty', 'but', 'against', 'side', 'whereas']


In [27]:
# doc = nlp(phrase)
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#           token.shape_, token.is_alpha, token.is_stop)

Nevertheless nevertheless ADV RB advmod Xxxxx True False
, , PUNCT , punct , False False
we -PRON- PRON PRP nsubj xx True True
found find VERB VBD ROOT xxxx True False
no no DET DT det xx True True
support support NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
the the DET DT det xxx True True
curvilinear curvilinear NOUN NN compound xxxx True False
predictions prediction NOUN NNS pobj xxxx True False
the the DET DT det xxx True True
second second ADJ JJ amod xxxx True False
set set NOUN NN oprd xxx True False
of of ADP IN prep xx True True
hypotheses hypothesis NOUN NNS pobj xxxx True False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
our -PRON- ADJ PRP$ poss xxx True True
results result NOUN NNS nsubj xxxx True False
suggest suggest VERB VBP conj xxxx True False
instead instead ADV RB advmod xxxx True False
a a DET DT det x True True
refinement refinement NOUN NN dobj xxxx True False
to to ADP IN prep xx True True
such such ADJ JJ amod xxxx True Tr

In [20]:
for chunk in doc.noun_chunks:
    print(chunk.text, '|', chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

arguments | arguments nsubj hold
the nonmonotonic effects | effects pobj about
heterogeneity | heterogeneity pobj of
some demographic characteristics | characteristics pobj for
one | one nsubj take
account | account pobj into
the level | level dobj take
uncertainty | uncertainty pobj of
a top team | team dobj facing


In [None]:
chunks = []
inds = [range(i-1, i+2) for i in ixs]
overlap_indices = sorted(list(set([x for sublist in inds for x in sublist])))

for j in overlap_indices:
    phrase = ' '.join(carticle[j])
    doc = nlp(phrase)
    for chunk in doc.noun_chunks:
        supp_chunk = [c for c in chunk if not c.is_stop]
        if supp_chunk and chunk.root.pos_ == 'NOUN':
            edge_slist = [[(c,d) for d in c.children if d in supp_chunk] for c in supp_chunk]
            edge_list = [e for sublist in edge_slist for e in sublist]
            g = nx.Graph()
            edge_list += [('#', chunk.root)]
            g.add_edges_from(edge_list)
            supp_chunk2 = [c.text for c in supp_chunk]
            chunks.append((chunk.root, tuple(supp_chunk2), g))
#         print(chunk.text, '|', chunk.root.text, chunk.root.dep_,
#               chunk.root.head.text)

In [83]:
c = chunk[1]

In [86]:
c.dep_

'amod'