In [1]:
import gc
import gzip
import json
import multiprocessing as mp
import os
import re
import timeit
from collections import defaultdict
from functools import partial
from itertools import product
from os.path import expanduser
from pprint import pprint

import pandas as pd
import yaml
from IPython.core.display import display, HTML

from wos_db_studies.utils_json import apply_mapper, process_document_top, parse_edges
from wos_parser.chunkflusher import ChunkFlusherMono, FPSmart

%load_ext autoreload
%autoreload 2

In [2]:
def strip_extensions(path):
    """Return ``path`` with all extensions removed from its last component.

    Only the final path component is cut at its first dot; dots inside
    directory names (for example a home directory such as ``/home/john.doe``)
    are preserved. Cutting the whole path at its first dot would truncate such
    paths in the middle of a directory name.

    Parameters
    ----------
    path : str
        File path, absolute or relative.

    Returns
    -------
    str
        The directory part of ``path`` (unchanged, including its trailing
        separator) followed by the file name up to its first dot.
    """
    _, file_name = os.path.split(path)
    # os.path.split returns the tail as a suffix of `path`, so slicing keeps
    # the original directory part and separator byte-for-byte.
    parent = path[: len(path) - len(file_name)]
    return parent + file_name.split(".")[0]


# Candidate input files; the last entry is the one actually processed below.
sources = [
    expanduser("~/data/wos/experiment/tmp/1980/WR_1980_20190212023637_DSSHPSH_0001#good#0.json.gz"),
    expanduser("~/data/wos/experiment/tmp/1985/dump_xml_0#good#0.json.gz"),
    expanduser("~/data/wos/experiment/tmp/2010/WR_2010_20190215011716_DSSHPSH_0001#good#0.json.gz"),
    expanduser("~/data/wos/experiment/tmp/2010/dsimple#good#0.json.gz"),
    expanduser("~/data/wos/experiment/tmp/1985/dsimple#good#0.json.gz"),
]

source = sources[-1]
target = None
if target is None:
    # Default target: the source path without its extensions.
    target = strip_extensions(source)

target_prefix = strip_extensions(target)
# Optional pattern handed to FPSmart when loading; None reads the file as-is.
pattern = None

# Choose the opener from the file-name suffix: gzip for "...gz", the builtin
# open for "...xml"; any other suffix is rejected.
if source.endswith("gz"):
    open_foo = gzip.GzipFile
elif source.endswith("xml"):
    open_foo = open
else:
    raise ValueError("Unknown file type")

with open_foo(source, 'rb') as fp:
    # Wrap the raw handle in FPSmart only when a pattern has been configured.
    fps = FPSmart(fp, pattern) if pattern else fp
    data = json.load(fps)

In [3]:
# Load the YAML configuration that describes the JSON layout and the vertex
# collections.
config_path = "../../../misc_lib/wos_db_studies/conf/wos_json_simple.yaml"
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-collection lookup tables: the "index" entry and the "fields" entry of
# every vertex collection.
index_fields_dict = {}
for collection_name, collection_spec in config["vertex_collections"].items():
    index_fields_dict[collection_name] = collection_spec["index"]

all_fields_dict = {}
for collection_name, collection_spec in config["vertex_collections"].items():
    all_fields_dict[collection_name] = collection_spec["fields"]

edge_des, excl_fields = parse_edges(config["json"], [], defaultdict(list))

# Probe a single record (index 12) to check the processing on one document.
r0 = process_document_top(
    data[12],
    config["json"],
    config["vertex_collections"],
    excl_fields,
    ["publication"],
)

In [4]:
def _key_order(key):
    """Sort token for a result key: tuple keys sort by their first two parts joined."""
    if isinstance(key, tuple):
        return key[0] + key[1]
    return key


acc = []    # (document number, number of distinct keys) per document
stats = []  # one single-column frame of per-key item counts per document
rtot = defaultdict(list)  # all items of all documents, grouped by key

for doc_no, document in enumerate(data[:1000]):
    r0 = process_document_top(
        document,
        config["json"],
        config["vertex_collections"],
        excl_fields,
        ["publication"],
    )

    # Merge this document's items into the running totals.
    for key, items in r0.items():
        rtot[key].extend(items)

    sorted_keys = sorted(r0, key=_key_order)
    acc.append((doc_no, len(sorted_keys)))

    # Column 0 holds the key (it becomes the index); column 1 holds the count
    # and is renamed to the document number.
    counts = pd.DataFrame([(key, len(r0[key])) for key in sorted_keys])
    stats.append(counts.rename(columns={1: f"{doc_no}"}).set_index(0))

{'how': 'dict', 'name': 'publication', 'map': {'uid': '_key', 'xref_doi': 'doi', 'page': 'first_page', 'citedTitle': 'title'}}
None
{'how': 'dict', 'name': 'date'}
None
{'how': 'dict', 'name': 'medium_title', 'map': {'citedWork': 'title'}}
None
{'how': 'dict', 'descend_key': 'reference', 'name': 'contributor', 'map': {'citedAuthor': 'wos_standard'}}
None
{'how': 'dict', 'name': 'publication', 'map': {'uid': '_key', 'xref_doi': 'doi', 'page': 'first_page', 'citedTitle': 'title'}}
None
{'how': 'dict', 'name': 'date'}
None
{'how': 'dict', 'name': 'medium_title', 'map': {'citedWork': 'title'}}
None
{'how': 'dict', 'descend_key': 'reference', 'name': 'contributor', 'map': {'citedAuthor': 'wos_standard'}}
None
{'how': 'dict', 'name': 'publication', 'map': {'uid': '_key', 'xref_doi': 'doi', 'page': 'first_page', 'citedTitle': 'title'}}
None
{'how': 'dict', 'name': 'date'}
None
{'how': 'dict', 'name': 'medium_title', 'map': {'citedWork': 'title'}}
None
{'how': 'dict', 'descend_key': 'reference

In [5]:
# Split the accumulated keys by kind. Edge keys are tuples and vertex keys are
# plain names, so test the type directly (as the sort key in the accumulation
# loop does) instead of the key length: a length test would misfile a
# two-character vertex name as an edge and drop a one-character name.
kkey_vertex = sorted(k for k in rtot if not isinstance(k, tuple))
kkey_edge = sorted(k for k in rtot if isinstance(k, tuple))

In [9]:
from wos_db_studies.utils import pick_unique_dict

# Same keys as `rtot`, with each list passed through pick_unique_dict
# (presumably removes duplicate entries; verify against the helper).
rtotu = defaultdict(
    list,
    ((key, pick_unique_dict(records)) for key, records in rtot.items()),
)

In [10]:
# Print up to five samples for every edge whose first endpoint name contains
# "pub" and whose second contains "loc", keeping only entries whose second
# element is non-empty.
for edge_key in kkey_edge:
    if not ("pub" in edge_key[0] and "loc" in edge_key[1]):
        continue
    print(edge_key)
    samples = [entry for entry in rtotu[edge_key] if len(entry[1]) > 0]
    pprint(samples[:5])

('publisher', 'location')
[[{'display_name': 'royal soc chemistry'},
  {'city': 'cambridge'},
  {'publication': 'WOS:000282872600021', 'role': 'publisher', 'seq_no': '1'}],
 [{'display_name': 'begell house inc'},
  {'city': 'redding'},
  {'publication': 'WOS:000282534500001', 'role': 'publisher', 'seq_no': '1'}],
 [{'display_name': 'oxford univ press inc'},
  {'city': 'cary'},
  {'publication': 'WOS:000280297001485', 'role': 'publisher', 'seq_no': '1'}],
 [{'display_name': 'canadian psychological  assoc'},
  {'city': 'ottawa'},
  {'publication': 'WOS:000285987800034', 'role': 'publisher', 'seq_no': '1'}],
 [{'display_name': 'sage publications inc'},
  {'city': 'thousand oaks'},
  {'publication': 'WOS:000275558601232', 'role': 'publisher', 'seq_no': '1'}]]


In [13]:
# Author's modelling notes (kept as written, not verified here):
#   publisher -> seq_no, addr_no
#   one location -> many publishers
#   on addr_no
# Show the last five unique entries of the 16th and 17th vertex collections.
for vertex_key in kkey_vertex[15:17]:
    print(vertex_key)
    pprint(rtotu[vertex_key][-5:])

language
[{'name': 'spanish'},
 {'name': 'chinese'},
 {'name': 'slovenian'},
 {'name': 'ukrainian'},
 {'name': 'italian'}]
location
[{'anchor': 'reprint',
  'city': 'ankara',
  'country': 'turkey',
  'full_address': 'ankara diskapi educ & res hosp, dept gen surg, 1424 cadde '
                  '1435 sokak 4-14, tr-06520 ankara, turkey',
  'street': '1424 cadde 1435 sokak 4-14'},
 {'anchor': True,
  'city': 'puchheim',
  'country': 'germany',
  'full_address': 'fa masimo europe ltd, puchheim, germany'},
 {'anchor': True,
  'city': 'beijing',
  'country': 'peoples r china',
  'full_address': 'beijing univ posts & telecommun, informat secur ctr, '
                  'beijing 100876, peoples r china'},
 {'anchor': True,
  'city': 'izmir',
  'country': 'turkey',
  'full_address': 'ege univ, sch med, dept phys med & rehabil, izmir, turkey'},
 {'anchor': 'reprint',
  'city': 'norwich',
  'country': 'england',
  'full_address': 'univ e anglia, sch environm sci, norwich nr4 7tj, norfolk, '
     

In [14]:
# One column per document, one row per key; keys absent from a document count
# as 0.
dfr = pd.concat(stats, axis=1).fillna(0)
print(dfr.shape)
# Average number of items per document for every key, ascending.
dfr.mean(axis=1).sort_values()

(58, 1000)


(conference, date)                   0.214
(publication, conference)            0.216
conference                           0.216
conference_title                     0.216
conference_info                      0.216
(conference, location)               0.218
(conference, conference_title)       0.218
(conference, conference_info)        0.218
(publication, funding_text)          0.369
funding_text                         0.369
grant_id                             0.523
(publication, grant_id)              0.523
conference_sponsor                   0.550
(conference, conference_sponsor)     0.562
agency                               0.778
(publication, agency)                0.778
(publication, medium)                0.950
medium                               0.950
(publication, bib_id)                1.000
publication_type                     1.000
(publication, publication_type)      1.000
(publication, some_id)               1.000
publisher                            1.000
(publisher,

In [103]:
# Display the full sorted list of tuple-valued (edge) keys held in `kkey_edge`.
kkey_edge

[('agency', 'grant_id'),
 ('conference', 'conference_info'),
 ('conference', 'conference_sponsor'),
 ('conference', 'conference_title'),
 ('conference', 'date'),
 ('conference', 'location'),
 ('contributor', 'location'),
 ('contributor', 'publication'),
 ('contributor', 'role'),
 ('location', 'organization'),
 ('medium', 'medium_title'),
 ('organization', 'suborganization'),
 ('publication', 'abstract'),
 ('publication', 'agency'),
 ('publication', 'bib_id'),
 ('publication', 'conference'),
 ('publication', 'date'),
 ('publication', 'document_type'),
 ('publication', 'edition'),
 ('publication', 'funding_text'),
 ('publication', 'grant_id'),
 ('publication', 'heading'),
 ('publication', 'keyword'),
 ('publication', 'language'),
 ('publication', 'medium'),
 ('publication', 'medium_title'),
 ('publication', 'publication_type'),
 ('publication', 'publisher'),
 ('publication', 'some_id'),
 ('publication', 'subheading'),
 ('publication', 'subject'),
 ('publisher', 'location'),
 ('publisher'

In [None]:
from datetime import datetime

# Parse an ISO-style "YYYY-MM-DD" string and pull out its calendar parts.
date_a = '2010-01-01'
date_a_ = datetime.strptime(date_a, "%Y-%m-%d")
# The first three fields of the time tuple are year, month and day of month.
year, month, day = date_a_.timetuple()[:3]
(year, month, day)

In [None]:
def parse_range_start(date_range):
    """Parse the first day of a range such as ``'NOV 03-04, 2008'``.

    The month/day part is parsed together with the year. Parsing ``"%b %d"``
    on its own makes ``strptime`` fall back to the year 1900, which rejects
    ``FEB 29`` and is deprecated in recent Python versions.

    Parameters
    ----------
    date_range : str
        Text of the form ``"<MON> <DD>[-...], <YYYY>"``.

    Returns
    -------
    datetime.datetime
        Midnight of the first day of the range.

    Raises
    ------
    ValueError
        If the text does not match the expected layout.
    """
    parts = date_range.split(", ")
    # Text before the first "-" is the opening "<MON> <DD>" of the range.
    first_day = parts[0].split("-")[0]
    return datetime.strptime(f"{first_day} {parts[-1]}", "%b %d %Y")


date_b = 'NOV 03-04, 2008'.split(", ")
date_b_ = parse_range_start('NOV 03-04, 2008')
year, month, day = date_b_.year, date_b_.month, date_b_.day
year, month, day

In [None]:
# Display every record accumulated under the 'date' key of `rtot`.
rtot['date']

In [None]:
def foo(n):
    """Sequentially process the first ``n`` records of ``data``.

    Serves as the single-process baseline for the timing cells; the results
    of ``process_document_top`` are discarded. The definition is active
    (not commented out) because a later cell times ``foo(50000)`` and would
    otherwise raise ``NameError`` on a fresh kernel.

    Parameters
    ----------
    n : int
        Number of leading records of ``data`` to process.
    """
    for x in data[:n]:
        process_document_top(x, config["json"],
                             config["vertex_collections"],
                             excl_fields, ["publication"]
                            )
# %timeit -n1 -r3 foo(10000)

In [None]:
# parallelize
# Keyword arguments bound to process_document_top for every worker call; the
# record itself is passed as the remaining positional argument.
kwargs = {"config": config["json"],
          "vertex_config": config["vertex_collections"],
          "edge_fields": excl_fields,
          "merge_collections": ["publication"]
}


def foo_parallel(n, n_proc=4):
    """Process the first ``n`` records of ``data`` in a pool of worker processes.

    Parameters
    ----------
    n : int
        Number of leading records of ``data`` to process.
    n_proc : int, optional
        Number of worker processes; defaults to 4, the value that was
        previously hard-coded.

    Returns
    -------
    list
        One ``process_document_top`` result per record, in input order.
    """
    func = partial(process_document_top, **kwargs)
    with mp.Pool(n_proc) as p:
        # map() blocks until every record is done, so returning from inside
        # the context manager is safe.
        return p.map(func, data[:n])
# Time the parallel run on the first 10,000 records: 1 loop per run, 3 runs.
%timeit -n1 -r3 foo_parallel(10000)

In [None]:
# Time `foo(50000)` once (1 loop, 1 run). NOTE: `foo` must be defined in the
# kernel before this cell runs; its definition lives in an earlier cell.
%timeit -n1 -r1 foo(50000)