In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import ray
import redis
import pickle

import pandas as pd
import pprint as pp
import seaborn as sns
import matplotlib.pyplot as plt

# Start the local Ray runtime used by the PDF-parsing helpers.
# ignore_reinit_error=True keeps this setup cell re-runnable in a live kernel:
# a second bare ray.init() raises RuntimeError because Ray is already running.
ray.init(ignore_reinit_error=True)

2020-06-11 09:58:19,105	INFO resource_spec.py:204 -- Starting Ray with 27.78 GiB memory available for workers and up to 13.9 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-11 09:58:19,430	INFO services.py:1168 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.1.6',
 'raylet_ip_address': '192.168.1.6',
 'redis_address': '192.168.1.6:36080',
 'object_store_address': '/tmp/ray/session_2020-06-11_09-58-19_097211_32160/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-06-11_09-58-19_097211_32160/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-06-11_09-58-19_097211_32160'}

In [2]:
import util

# Every file found under the release-15 spec directory.
files = util.abs_file_paths("./3gpp-lte-rel-15")

# Only the PDF specs are fed to the parser in later cells.
pdfs = list(filter(lambda path: path.endswith(".pdf"), files))

# Sanity check: print the discovered names in sorted order.
pp.pprint(sorted(util.path_bns(files)))

['ts_123002v150000p-abbr.txt',
 'ts_123002v150000p.pdf',
 'ts_123401v151000p-abbr.txt',
 'ts_123401v151000p.pdf',
 'ts_123402v150300p-abbr.txt',
 'ts_123402v150300p.pdf']


### Iterative preparation (PDF too hard!)

In [3]:
import re

# Project-local parser (from util) used for abbreviations and titles.
etsi = util.EtsiParser()

# Map each PDF's base name to the abbreviations read from that file.
# If two paths share a base name, the later one wins.
doc_abbrs = {
    os.path.basename(pdf_path): etsi.get_abbrs_from_file(pdf_path)
    for pdf_path in pdfs
}
pp.pprint(doc_abbrs)

{'ts_123002v150000p.pdf': {'BSC': 'Base Station Controller  area',
                           'CN': 'Core Network  and Access Network (AN)',
                           'CS': 'Circuit Switched  and Packet Switched (PS) '
                                 'Domains',
                           'CS Domain': 'CS Domain',
                           'Cell': 'Cell',
                           'Group call area': 'Group call area',
                           'IMS': 'IP Multimedia subsystem ',
                           'LA': 'Location Area ',
                           'Location register': 'Location register',
                           'MME': 'Are',
                           'MSC': 'area',
                           'PLMN': 'Public Land Mobile Network ',
                           'PS Domain': 'PS Domain',
                           'Pool-are': 'Pool-are',
                           'RA': 'Routing Area ',
                           'RNC': 'Radio Network Controller  area',
                      

### Pdf -> Text

In [4]:
# Convert every PDF to text. Later cells iterate doc_pages.items() as
# (file name, pages) pairs and take len(pages) as the page count.
# NOTE(review): the output shows several worker pids, so parsing presumably
# runs in Ray workers — confirm in util.parse_pdfs.
doc_pages = util.parse_pdfs(pdfs) 
# TODO: cache the parsed pages (e.g. in redis) so re-runs can skip this slow step.

[2m[36m(pid=32176)[0m ts_123002v150000p.pdf: num. pages: 111
[2m[36m(pid=32176)[0m ts_123002v150000p.pdf: parsed 1/111
[2m[36m(pid=32180)[0m ts_123402v150300p.pdf: num. pages: 310
[2m[36m(pid=32180)[0m ts_123402v150300p.pdf: parsed 1/310
[2m[36m(pid=32175)[0m ts_123401v151000p.pdf: num. pages: 419
[2m[36m(pid=32175)[0m ts_123401v151000p.pdf: parsed 1/419
[2m[36m(pid=32176)[0m ts_123002v150000p.pdf: parsed 31/111
[2m[36m(pid=32175)[0m ts_123401v151000p.pdf: parsed 31/419
[2m[36m(pid=32180)[0m ts_123402v150300p.pdf: parsed 31/310
[2m[36m(pid=32176)[0m ts_123002v150000p.pdf: parsed 61/111
[2m[36m(pid=32175)[0m ts_123401v151000p.pdf: parsed 61/419
[2m[36m(pid=32180)[0m ts_123402v150300p.pdf: parsed 61/310
[2m[36m(pid=32175)[0m ts_123401v151000p.pdf: parsed 91/419
[2m[36m(pid=32176)[0m ts_123002v150000p.pdf: parsed 91/111
[2m[36m(pid=32176)[0m ts_123002v150000p.pdf: done, took 4.024340867996216s
[2m[36m(pid=32175)[0m ts_123401v151000p.pdf: par

### Text -> dataframe

##### Per spec analysis

In [11]:
# One summary record per parsed spec: title, page count and abbreviation count.
rows = [
    {
        "file": doc,
        "title": etsi.get_title_from_pages(pages),
        "num_page": len(pages),
        "num_acronym": len(doc_abbrs[doc]),
    }
    for doc, pages in doc_pages.items()
]

display(pd.DataFrame(rows))

Unnamed: 0,file,title,num_page,num_acronym
0,ts_123401v151000p.pdf,Radio Access Network (E-UTRAN) access,419,75
1,ts_123002v150000p.pdf,Network architecture,111,31
2,ts_123402v150300p.pdf,Architecture enhancements for non-3GPP accesses,310,49


##### Aggregate analysis

In [15]:
# Labels for the single aggregate row (it covers all specs, not one real file).
title = "agg"
file = "agg.pdf"

# All abbreviation keys across the parsed specs; duplicates are kept here.
acrs = [abbr for doc in doc_pages for abbr in doc_abbrs[doc]]

# Total page count over every parsed spec.
num_page = sum(len(pages) for pages in doc_pages.values())

# Abbreviations shared by several specs are counted once.
num_acr = len(set(acrs))

display(pd.DataFrame([
    dict(file=file, title=title, num_page=num_page, num_acronym=num_acr)
]))

Unnamed: 0,file,title,num_page,num_acronym
0,agg.pdf,agg,840,145


### TODO: number of interfaces under a release

### TODO: number of pages per interface