In [1]:
# pdf2txt.py -o afi34-201.html -t html afi/afi34-201.pdf

import topycal
import os
import glob
import random

INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [2]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    futures = []
    if isinstance(num_threads, str):
        num_threads = int(num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for elem in iterator:
            futures.append(executor.submit(fn, elem))
    results = []
    for x in as_completed(futures):
        results.append(x.result())
    return results


In [3]:
def get_file_list(limit=500, shuffle=True):
    files = glob.glob("{}/afi*.txt".format(INSTR_PATH))
    if shuffle:
        random.shuffle(files)
    if limit:
        return files[0:limit]
    else:
        return files
    #data = myfile.read()
    
def read_file(fname):
    with open(fname, errors='replace') as fd:
        return fd.read()

In [4]:
file_list = get_file_list(limit=None)


In [5]:
import re
import os

def read_file(fname):
    with open(fname, 'r') as myfile:
        return re.sub("\s+",' ',myfile.read())

def write_file(fpath, string):
    with open(fpath, 'w') as myfile:
        return myfile.write(string)
    
def load_file(fname):       
    with open(fname, 'r') as myfile:
        #contents = re.sub(r'[\t\n\r\x0b\x0c]',' ', myfile.read())
        contents = myfile.read()
        return (os.path.basename(fname),re.sub("\s+",' ', contents))

def load_corpus(file_list):
    return {f[0]:f[1] for f in do_fn_on_iter(load_file, file_list)}    

In [6]:
corpus_dict = load_corpus(file_list)

In [7]:
#corpus_dict['afi10-244.txt']

In [8]:
import spacy
nlp = spacy.load("en_core_web_md")

In [9]:
foo = "the warrior"
foo[0:4] in ['the ','The ']
foo[4:]

'warrior'

In [10]:
from textstat.textstat import textstat

def get_num_words(sent):
    return len([word for word in sent if word.text.isalpha()])

def get_num_syllables(sent):
    return textstat.syllable_count(sent.text)

def get_np_regexcands(string):
    propnounregexp = r"(?:[A-Z][a-z]{2,}\s*(?:and|of)?\s*){3,}"
    p = re.compile(propnounregexp)
    nps = list(set(p.findall(string)))
    return nps

def get_np_cands(string):
    nps = []
    doc = nlp(string)
    for np in doc.noun_chunks:
        nps.append(np)
    return nps

def get_nounphrases(string,blacklist=['AFI','AFPD']):
    new_cands = []
    cands = get_np_cands(string)
    for cand in cands:
        # If starts with the.. strip it off
        if cand.text[0:4] in ['The ','the ']:
            cand = cand[4:]
        elif cand.text[0:2] in ['a ','A ']:
            cand = cand[2:]
        #new_cands.append(cand.text.strip())
        num_words = get_num_words(cand)
        if 3 <= num_words <=9:
            #print(num_words)
            #print(cand.text)
            if not any(blackword in cand.text for blackword in blacklist):
                if get_num_syllables(cand)/num_words > 4:
                    new_cands.append(cand.text.strip())
                
                elif len(cand.text)/num_words > 7:
                    new_cands.append(cand.text.strip())
    spacy_cands = list(set(new_cands))
    regex_cands = get_np_regexcands(string)
    #print(regex_cands)
    return list(set(spacy_cands + regex_cands))
            
    

In [11]:
import re
def find_reference_candidates(string):
    cands = []
    refregexps = [r"AF[A-Z]+\s?\d+[A-Z0-9-]+\d+\-?\d*",r"CJ\w+\s\d+\.\d+[A-Z]?",r"JP\s\d\-?\d",
                  r"(?:DODFMR|DoD|DTR|DODI|DoDI|DoDD|DoD\sDirective)\s\d+\.?\d*",
                 "(?:SF|DD|AF|AFTO)+\s+Form\s\d+"]
    for regexp in refregexps:
                  p = re.compile(regexp)
                  cands += p.findall(string)
    
    return cands

def find_references(string):
    cands = list(set(find_reference_candidates(string)))
    stoprefs = ['AF']
    for ref in stoprefs:
        try:
            cands.remove(ref)
        except ValueError:
            pass      
    cands.sort()
    return cands

In [12]:
import re
# find acronyms
def find_acronym_candidates(string):
    acroregexp = r"\([A-Z]{3}[A-Za-z]*\)"
    p = re.compile(acroregexp)
    uniqued = list(set(p.findall(string)))
    return [cand[1:-1] for cand in uniqued]

stopronyms = ['AFPD','AFI','REQUIRED','Conventional','Hijacking','Name']
knownronyms = ['FOA','DRU','DSN','IAW','ULN','FAM','MAJCOM','ALN']
def find_acronyms(string):
    cands = find_acronym_candidates(string)
    for s in stopronyms:
        try:
            cands.remove(s)
        except ValueError:
            pass
    for known in knownronyms:
        cands.append(known)
    #cands.sort()
    cands.sort(key = len, reverse=True)
    return cands
    
    

In [13]:
def monkeypatch_css(htmlstr):
    css = """
    
    
    <style>
    
/**
 * Tooltip Styles
 */

/* Add this attribute to the element that needs a tooltip */
[data-tooltip] {
  position: relative;
  z-index: 2;
  cursor: pointer;
}

/* Hide the tooltip content by default */
[data-tooltip]:before,
[data-tooltip]:after {
  visibility: hidden;
  -ms-filter: "progid:DXImageTransform.Microsoft.Alpha(Opacity=0)";
  filter: progid: DXImageTransform.Microsoft.Alpha(Opacity=0);
  opacity: 0;
  pointer-events: none;
}

/* Position tooltip above the element */
[data-tooltip]:before {
  position: absolute;
  bottom: 150%;
  left: 50%;
  margin-bottom: 5px;
  margin-left: -80px;
  padding: 7px;
  width: 160px;
  -webkit-border-radius: 3px;
  -moz-border-radius: 3px;
  border-radius: 3px;
  background-color: #000;
  background-color: hsla(0, 0%, 20%, 0.9);
  color: #fff;
  content: attr(data-tooltip);
  text-align: center;
  font-size: 14px;
  line-height: 1.2;
}

/* Triangle hack to make tooltip look like a speech bubble */
[data-tooltip]:after {
  position: absolute;
  bottom: 150%;
  left: 50%;
  margin-left: -5px;
  width: 0;
  border-top: 5px solid #000;
  border-top: 5px solid hsla(0, 0%, 20%, 0.9);
  border-right: 5px solid transparent;
  border-left: 5px solid transparent;
  content: " ";
  font-size: 0;
  line-height: 0;
}

/* Show tooltip content on hover */
[data-tooltip]:hover:before,
[data-tooltip]:hover:after {
  visibility: visible;
  -ms-filter: "progid:DXImageTransform.Microsoft.Alpha(Opacity=100)";
  filter: progid: DXImageTransform.Microsoft.Alpha(Opacity=100);
  opacity: 1;
}
span.red {
  background-color: red;
}
mark.nounphrase {
  background-color: #ffa370;
}
mark.reference {
  background-color: #90d2ff;
}
mark.acronym {
  background-color: #3aff3a; 
}
</style>"""
    return htmlstr.replace("<html><head>","<html><head>{}".format(css))

In [15]:
#afi_html.find('Work timetables need to be adjusted to minimize ')
#afi_html[1168331:1169331]

In [16]:
import pickle
with open('ec130results.pickle','rb') as myfile:
    ec130results = pickle.load(myfile)

In [17]:
HTMLFILE = "afi11-2ec-130hv3.html"
afi_html = read_file(os.path.join(os.getcwd(),HTMLFILE))
afi_html = afi_html.replace('- <br>','-')
afi_html = afi_html.replace(' <br>',' ')

In [18]:
assert list(ec130results.keys())[7] in afi_html

In [19]:
assert 'Work timetables need to be adjusted to minimize thermal stress caused by wearing the ACDE. Aircrews must weigh all factors when performing in-flight and ground duties.' in afi_html

In [20]:
TXTFILE = 'afi11-2ec-130hv3.txt'
acronyms = find_acronyms(corpus_dict[TXTFILE])

nounphrases = get_nounphrases(corpus_dict[TXTFILE])

In [21]:
references = find_references(corpus_dict[TXTFILE])

In [22]:
#references

In [23]:
for k,v in ec130results.items():
    desc = "Duplicated {} times across {}".format(v[1],", ".join([k[:-4] for k in v[0]]))
    new_elem = '<mark class="dupe" data-tooltip="{}">{}</mark>'.format(desc,k)
    afi_html = afi_html.replace(k,new_elem)

In [24]:
for ac in acronyms:
    new_elem = '<mark class="acronym">{}</mark>'.format(ac)
    afi_html = afi_html.replace(ac,new_elem)

In [25]:
for np in nounphrases:
    new_elem = '<mark class="nounphrase">{}</mark>'.format(np)
    afi_html = afi_html.replace(np,new_elem)    

In [26]:
for ref in references:
    #new_elem = '<mark class="reference">{}</mark>'.format(ref)
    new_elem = '<mark class="reference" data-tooltip="Blah blah..">{}</mark>'.format(ref)
    afi_html = afi_html.replace(ref,new_elem)
    
afi_html = monkeypatch_css(afi_html)
#afi_html = monkeypatch_js(afi_html)

In [27]:
write_file('afi11-2ec-130hv3-meta.html',afi_html)

2038184

In [None]:
#nounphrases

In [None]:
# long sentences
# some kind of reference