In [115]:
import gzip
import pickle
from os.path import join, expanduser
import numpy as np
import pandas as pd
import json

In [129]:
# Directory the pickled chunk / distribution files are read from.
input_path = expanduser('~/data/jstor/latest/')
# Directory holding the registry file (currently the same location).
output_path = expanduser('~/data/jstor/latest/')
# File-name stem of the chunk files.
prefix = 'ngrams_dict'
# DOIs of the articles inspected in this notebook.
dois = ['10.2307/3069368', '10.2307/20159507']

# The registry is a JSON document stored in a plain-text file.
with open(join(output_path, 'registry_json.txt')) as file:
    registry_dict = json.load(file)

In [135]:
def get_chunk(fpath, prefix, index):
    """Load one gzip-compressed, pickled chunk from disk.

    Parameters
    ----------
    fpath : str
        Directory that contains the chunk files.
    prefix : str
        File-name stem shared by the chunk files.
    index : int or str
        Chunk identifier; the file read is ``<prefix>_<index>.pgz``.

    Returns
    -------
    object
        The object that was pickled into the chunk file.

    Raises
    ------
    OSError
        If the file does not exist or cannot be read as gzip.
    """
    fname = join(fpath, '{0}_{1}.pgz'.format(prefix, index))
    # NOTE: unpickling can execute arbitrary code; only load chunk files
    # that come from a trusted source.
    with gzip.open(fname, 'rb') as fp:
        # Either this returns the payload or an exception propagates, so no
        # fallback return is needed after the ``with`` block.
        return pickle.load(fp)

def get_articles(dois, registry_dict, fpath, prefix):
    """Collect the stored entry of every requested DOI.

    Each DOI is first resolved to the chunk that holds it
    (``find_doi_chunk_map``). Every distinct chunk is then loaded once with
    ``get_chunk`` and the entry stored under the DOI is taken from it.

    Parameters
    ----------
    dois : list
        DOIs to fetch.
    registry_dict : dict
        Registry handed to ``find_doi_chunk_map``.
    fpath : str
        Directory containing the chunk files.
    prefix : str
        File-name stem of the chunk files.

    Returns
    -------
    dict
        Maps each DOI to the value stored under that DOI in its chunk.
    """
    chunk_of = find_doi_chunk_map(dois, registry_dict)

    # Load each distinct chunk only once, even if several DOIs share it.
    loaded = {}
    for chunk_id in set(chunk_of.values()):
        loaded[chunk_id] = get_chunk(fpath, prefix, chunk_id)

    articles = {}
    for doi in dois:
        articles[doi] = loaded[chunk_of[doi]][doi]
    return articles

def find_doi_chunk_map(dois, registry_dict):
    """Map each DOI to the key of the first registry entry containing it.

    Parameters
    ----------
    dois : iterable
        DOIs to look up.
    registry_dict : dict
        Maps a chunk key to a container of DOIs (anything supporting ``in``).

    Returns
    -------
    dict
        ``{doi: chunk_key}`` for every DOI in ``dois``.

    Raises
    ------
    IndexError
        If a DOI is not contained in any registry entry.
    """
    doi_chunk_map = {}
    for doi in dois:
        # Search the registry that was passed in, not a notebook global, so
        # the function also works on a fresh kernel.
        matches = (key for key, members in registry_dict.items() if doi in members)
        try:
            doi_chunk_map[doi] = next(matches)
        except StopIteration:
            raise IndexError(
                'DOI {0!r} not found in any registry chunk'.format(doi)) from None
    return doi_chunk_map

def load_ngram_dist(fpath, order, thr=2):
    """Read the pickled corpus n-gram distribution for one n-gram order.

    The file read is ``corpus_ngram_dist_n_<order>_thr_<thr>.pgz`` inside
    ``fpath`` (a gzip-compressed pickle).

    Parameters
    ----------
    fpath : str
        Directory containing the distribution file.
    order : int
        N-gram order; selects the file.
    thr : int, optional
        Threshold value encoded in the file name (default 2).

    Returns
    -------
    dict
        The unpickled dictionary. For ``order == 1`` every key is replaced
        by its first element (``key[0]``); otherwise keys are left as stored.
    """
    template = 'corpus_ngram_dist_n_{0}_thr_{1}.pgz'
    with gzip.open(join(fpath, template.format(order, thr))) as fp:
        distribution = pickle.load(fp)

    if order != 1:
        return distribution

    # Order 1: unwrap each key to its first element.
    unwrapped = {}
    for key, value in distribution.items():
        unwrapped[key[0]] = value
    return unwrapped

In [127]:
# Fetch the stored entries of the selected DOIs from their chunk files.
dch_dict = get_articles(
    dois=dois,
    registry_dict=registry_dict,
    fpath=input_path,
    prefix=prefix,
)

In [140]:
# Disabled inspection line (presumably a peek at key 3 of the first article's entry):
# dch_dict[dois[0]][3]
# N-gram order passed to load_ngram_dist when loading the corpus distribution.
ngram_order = 2

In [141]:
# Corpus-wide distribution for the chosen n-gram order (default threshold).
distr_dict = load_ngram_dist(fpath=input_path, order=ngram_order)

In [142]:
# Sorted list of every n-gram present in the corpus distribution.
keys = sorted(distr_dict)
# Disabled corpus-wide overview table, kept for reference:
# vals = np.array([distr_dict[k] for k in keys])
# df = pd.DataFrame(vals, keys, columns=['f', 'f-', 'f+', 'n']).sort_values('n')
# df.tail(20)

In [144]:
# a single article: relative frequency of each of its n-grams
article = dch_dict[dois[0]]
# Index by `ngram_order`, the same order the corpus distribution was loaded
# with; a bare `order` is not defined anywhere in this notebook and raises
# NameError on a fresh kernel.
ngram_positions_list = article[ngram_order]
# Number of recorded positions per n-gram = its count in the article.
ngram_counts = {k: len(v) for k, v in ngram_positions_list.items()}
ngram_number = sum(ngram_counts.values())
# Keep only n-grams that are also present in the corpus distribution.
ngram_freqs = {k: v/ngram_number for k, v in ngram_counts.items() if k in distr_dict}
len(ngram_freqs)

2669

In [145]:
# Article frequency of each n-gram divided by the first field of its corpus
# entry (presumably the corpus frequency - confirm against the file format).
ngram_freqs_outstanding = {
    ngram: freq / distr_dict[ngram][0]
    for ngram, freq in ngram_freqs.items()
}
len(ngram_counts), len(ngram_freqs), len(ngram_freqs_outstanding)

(3476, 2669, 2669)

In [151]:
# One row per n-gram: (frequency ratio, count in article, last field of the
# corpus entry, and its reciprocal).
keys = sorted(ngram_freqs_outstanding)
vals = np.array([
    (
        ngram_freqs_outstanding[k],
        ngram_counts[k],
        distr_dict[k][-1],
        1./distr_dict[k][-1],
    )
    for k in keys
])
print(len(keys), vals.shape)
# Sort ascending by article count, then by inverse corpus count, so the tail
# shows the n-grams most frequent in the article.
df = pd.DataFrame(
    vals,
    index=keys,
    columns=['tf_idf', 'n_article', 'n_corpus', 'inv_n_corpus'],
).sort_values(by=['n_article', 'inv_n_corpus'])
df.tail(50)

2669 (2669, 4)


Unnamed: 0,tf_idf,n_article,n_corpus,inv_n_corpus
"(high, uncertainty)",112.172643,6.0,261.0,0.003831
"(linear, relationship)",117.578554,6.0,249.0,0.004016
"(set, hypothesis)",144.93594,6.0,202.0,0.00495
"(skill, set)",232.357618,6.0,126.0,0.007937
"(simple, effect)",301.82536,6.0,97.0,0.010309
"(role, uncertainty)",487.950998,6.0,60.0,0.016667
"(echelon, perspective)",680.861858,6.0,43.0,0.023256
"(positively, moderate)",680.861858,6.0,43.0,0.023256
"(uncertainty, positively)",1084.335551,6.0,27.0,0.037037
"(moderating, role)",94.355165,7.0,362.0,0.002762
