In [1]:
import topycal
import os
import glob
import random

INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [2]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    """Apply ``fn`` to every element of ``iterator`` using a thread pool.

    Args:
        fn: Callable taking a single element.
        iterator: Iterable of elements to process.
        num_threads: Worker count; a string value is converted with ``int``.

    Returns:
        list: One result per element, collected via ``as_completed`` and
        therefore not guaranteed to follow the input order.

    Raises:
        Any exception raised by ``fn`` is re-raised when its result is read.
    """
    worker_count = int(num_threads) if isinstance(num_threads, str) else num_threads
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(fn, item) for item in iterator]
    # Leaving the ``with`` block waits for every submitted task to finish.
    return [finished.result() for finished in as_completed(pending)]


In [3]:
def get_file_list(limit=500, shuffle=True):
    """List the ``afi*.txt`` files found directly under ``INSTR_PATH``.

    Args:
        limit: Maximum number of paths to return. A falsy value
            (``None`` or ``0``) returns every match.
        shuffle: When true, the matches are shuffled in place with
            ``random.shuffle`` before the limit is applied.

    Returns:
        list: File paths as produced by ``glob.glob``.
    """
    pattern = "{}/afi*.txt".format(INSTR_PATH)
    matches = glob.glob(pattern)
    if shuffle:
        random.shuffle(matches)
    return matches[0:limit] if limit else matches
    
def read_file(fname):
    """Return the entire text content of ``fname``.

    The file is opened in text mode with the default encoding; bytes that
    cannot be decoded are substituted rather than raising
    (``errors='replace'``).
    """
    with open(fname, errors='replace') as source:
        text = source.read()
    return text

In [4]:
file_list = get_file_list(limit=None)


In [5]:
import re
import os

def load_file(fname):
    """Read a document and normalise its whitespace.

    Args:
        fname: Path of the text file to load.

    Returns:
        tuple: ``(basename, text)`` where ``basename`` is the file name
        without its directory and ``text`` is the file content with every
        run of whitespace collapsed to a single space.
    """
    # errors='replace' mirrors read_file: one undecodable byte should not
    # abort loading of the whole corpus.
    with open(fname, 'r', errors='replace') as myfile:
        raw_text = myfile.read()
    # Raw-string pattern avoids the invalid "\s" escape warning. \s already
    # matches \t \n \r \x0b \x0c, so a single pass gives the same result as
    # the previous two-step substitution.
    return (os.path.basename(fname), re.sub(r"\s+", ' ', raw_text))

def load_corpus(file_list):
    """Load every file in ``file_list`` into a ``{basename: text}`` dict.

    Files are read through ``do_fn_on_iter`` with ``load_file``; if two
    paths share a basename, the later result overwrites the earlier one.
    """
    corpus = {}
    for name, text in do_fn_on_iter(load_file, file_list):
        corpus[name] = text
    return corpus

In [6]:
corpus_dict = load_corpus(file_list)

In [7]:
import spacy
nlp = spacy.load("en_core_web_md")

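For each document, we segment the text into sentences and then build baskets keyed by each sentence's 64-bit murmurhash (computed after trimming whitespace, and only for sentences that exceed a minimum length threshold). Each basket holds the set of documents that contain that sentence.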

In [9]:
import mmh3

from tqdm import tqdm
from collections import defaultdict

MIN_SENT_LEN = 15

def do_market_basket_analysis(doc_dict):
    """Group documents by the sentences they contain.

    Args:
        doc_dict: Mapping of document name -> document text.

    Returns:
        tuple: ``(baskets, reverselut)``. ``baskets`` maps a sentence hash
        (the value returned by ``mmh3.hash64``) to the set of document names
        containing that sentence. ``reverselut`` maps the whitespace-stripped
        sentence text to the same hash.
    """
    baskets = defaultdict(set)
    reverselut = {}
    for doc_name, doc_text in tqdm(doc_dict.items()):
        for sent in nlp(doc_text).sents:
            # len(sent) counts the tokens in the spaCy span, not characters.
            if len(sent) > MIN_SENT_LEN:
                # Strip once and use the same string for both the hash and
                # the lookup key, so reverselut[text] always corresponds to
                # the text that was actually hashed.
                sent_text = sent.text.strip()
                this_hash = mmh3.hash64(sent_text)
                baskets[this_hash].add(doc_name)
                reverselut[sent_text] = this_hash
    return (baskets, reverselut)

In [10]:
baskets,reverselut = do_market_basket_analysis(corpus_dict)

100%|██████████| 1141/1141 [18:32<00:00,  1.80it/s] 


Each basket is then filtered by a minimum size: baskets shared by too few documents are dropped. The logic here is to reduce noise.

In [35]:
# Spot check: look up the hash recorded for one known sentence.
this_hash = reverselut['Establish, monitor, and verify supervisory inspections on elements assigned with equipment and CTK’s are completed.']
#baskets[this_hash]
# sent_to_hash is an alias of reverselut (same dict object, not a copy).
sent_to_hash = reverselut
# Inverse table: hash -> sentence text. If several sentence strings share a
# hash, the last one iterated wins.
hash_to_sent = {v:k for k,v in reverselut.items()}

In [59]:
def count_docs(baskets=None):
    """Count, for each document, how many baskets it appears in.

    Args:
        baskets: Mapping of sentence hash -> collection of document names.
            When omitted, the notebook-level ``trimmed_baskets`` is used,
            which must already be defined when the function is called.

    Returns:
        dict: Document name -> number of baskets that contain it.
    """
    if baskets is None:
        baskets = trimmed_baskets
    doc_count = {}
    for docs in baskets.values():
        for doc in docs:
            doc_count[doc] = doc_count.get(doc, 0) + 1
    return doc_count
        

In [60]:
doccount = count_docs()

In [61]:
doccount

{'afi1-404.txt': 4,
 'afi10-1701.txt': 4,
 'afi10-1703v1.txt': 7,
 'afi10-1703v2.txt': 5,
 'afi10-1703v3.txt': 4,
 'afi10-201.txt': 4,
 'afi10-202.txt': 7,
 'afi10-203.txt': 4,
 'afi10-204.txt': 3,
 'afi10-206.txt': 2,
 'afi10-207.txt': 4,
 'afi10-208.txt': 1,
 'afi10-209.txt': 2,
 'afi10-210.txt': 2,
 'afi10-213.txt': 3,
 'afi10-216.txt': 4,
 'afi10-2402.txt': 1,
 'afi10-244.txt': 1,
 'afi10-250.txt': 6,
 'afi10-2501.txt': 3,
 'afi10-251.txt': 3,
 'afi10-2519.txt': 6,
 'afi10-2607.txt': 2,
 'afi10-2909.txt': 3,
 'afi10-3002.txt': 4,
 'afi10-301.txt': 3,
 'afi10-3502v1.txt': 6,
 'afi10-3502v2.txt': 14,
 'afi10-401.txt': 2,
 'afi10-402.txt': 3,
 'afi10-403.txt': 6,
 'afi10-410.txt': 1,
 'afi10-414.txt': 1,
 'afi10-420.txt': 3,
 'afi10-4201v1.txt': 3,
 'afi10-4201v3.txt': 5,
 'afi10-421.txt': 3,
 'afi10-501.txt': 2,
 'afi10-503.txt': 1,
 'afi10-601.txt': 1,
 'afi10-701.txt': 5,
 'afi10-703.txt': 1,
 'afi10-706.txt': 1,
 'afi10-707.txt': 3,
 'afi10-712.txt': 5,
 'afi11-101.txt': 2,
 'afi1

In [66]:
def get_docs_from_sent(sent):
    """Return, as a list, the documents in the basket of the given sentence.

    The sentence text is translated to its hash through ``sent_to_hash``
    (raises ``KeyError`` if unknown) and the hash is looked up in the
    untrimmed ``baskets`` mapping.
    """
    return list(baskets[sent_to_hash[sent]])

In [71]:
def search_for_doc(doc_name):
    """Collect the shared sentences that appear in ``doc_name``.

    Scans ``trimmed_baskets`` for baskets containing ``doc_name``.

    Returns:
        dict: Sentence text -> ``(documents, count)`` where ``documents`` is
        the list returned by ``get_docs_from_sent`` for that sentence and
        ``count`` is its length.
    """
    shared = {}
    for sent_hash, members in trimmed_baskets.items():
        if doc_name not in members:
            continue
        sentence = hash_to_sent[sent_hash]
        doc_list = get_docs_from_sent(sentence)
        shared[sentence] = (doc_list, len(doc_list))
    return shared

In [73]:
ec130results = search_for_doc('afi11-2ec-130hv3.txt')

In [75]:
import pickle
# Persist the search result to 'ec130results.pickle' (relative path, so it is
# written to the current working directory) for reuse outside this notebook.
with open('ec130results.pickle','wb') as myfile:
    pickle.dump(ec130results,myfile)

In [23]:
# trim baskets

def trim_baskets(baskets, min_size=3):
    """Keep only the baskets holding strictly more than ``min_size`` members.

    Note the comparison is ``>``, not ``>=``: with ``min_size=3`` a basket
    needs at least four members to survive.

    Args:
        baskets: Mapping of key -> sized collection.
        min_size: Exclusive lower bound on the collection size.

    Returns:
        dict: New mapping with the surviving entries; values are not copied.
    """
    kept = {}
    for key, members in tqdm(baskets.items()):
        if len(members) > min_size:
            kept[key] = members
    return kept


In [28]:
trimmed_baskets = trim_baskets(baskets, min_size=6)

100%|██████████| 508484/508484 [00:00<00:00, 1421388.42it/s]


In [29]:
len(list(baskets.items()))

508484

In [53]:
hash_to_sent[(-5011513889702821403, -7321025647363460706)]

'To the extent its directions are inconsistent with other Air Force publications, the information herein prevails, in accordance with AFI 33-360, Publications and Forms Management.'

In [None]:
# For a given document,
# Output a list of sentences it shares with others, and for each sentence, what those references are
# For a given reference, determine its cosine similarity.

In [55]:
get_occurrence_count_of_sentence('To the extent its directions are inconsistent with other Air Force publications, the information herein prevails, in accordance with AFI 33-360, Publications and Forms Management.')

(35,
 ['afi10-201.txt',
  'afi32-7062.txt',
  'afi24-301.txt',
  'afi11-202v3.txt',
  'afi90-6001.txt',
  'afi10-1703v1.txt',
  'afi32-1032.txt',
  'afi10-244.txt',
  'afi10-1703v2.txt',
  'afi10-1703v3.txt',
  'afi13-1stan-evalv2.txt',
  'afi36-2201.txt',
  'afi91-204.txt',
  'afi51-110.txt',
  'afi10-2519.txt',
  'afi36-3003.txt',
  'afi51-703.txt',
  'afi36-2254v1.txt',
  'afi90-505.txt',
  'afi33-364.txt',
  'afi32-1023.txt',
  'afi36-202.txt',
  'afi1-404.txt',
  'afi33-360.txt',
  'afi31-105.txt',
  'afi64-201.txt',
  'afi91-202.txt',
  'afi36-2249.txt',
  'afi17-140.txt',
  'afi36-807.txt',
  'afi11-401.txt',
  'afi14-104.txt',
  'afi32-7001.txt',
  'afi10-2501.txt',
  'afi10-403.txt'])

In [54]:
def get_occurrence_count_of_sentence(sentence, is_hash=False):
    """Report which documents share a sentence, according to ``trimmed_baskets``.

    Args:
        sentence: Sentence text, or the sentence hash when ``is_hash`` is true.
        is_hash: Treat ``sentence`` as an already-computed hash and skip the
            ``sent_to_hash`` lookup.

    Returns:
        tuple: ``(count, documents)`` where ``documents`` is a list and
        ``count`` is its length.

    Raises:
        KeyError: If the sentence is unknown or its basket was trimmed away.
    """
    lookup_key = sentence if is_hash else sent_to_hash[sentence]
    docs = list(trimmed_baskets[lookup_key])
    return (len(docs), docs)
    

In [38]:
hash_to_sent[popular_hash]

'U. Did not properly identify aircraft category or exceeded the lateral limits of circling airspace.'

In [30]:
len(list(trimmed_baskets.items()))

1492

Now we build a graph from the baskets: every pair of documents in a basket is linked to each other, and the edge weight between two documents is the number of baskets they share. We also determine each document's series here and attach it to the node, so we can compare how links behave within and across series.

In [None]:
import networkx as nx
from itertools import combinations
# Build a weighted graph from the baskets

def _series_of(doc_name):
    """Return the series number embedded in a document name.

    The series is the run of digits ending the part before the first '-',
    e.g. 'afi10-201.txt' -> '10', 'afi_12-21' -> '12', 'afi1-404.txt' -> '1'.
    If that part does not end in a digit, its last two characters are
    returned instead.
    """
    prefix = doc_name.split('-')[0]
    digits = prefix[len(prefix.rstrip('0123456789')):]
    return digits if digits else prefix[-2:]


def traverse_baskets(baskets):
    """Build a weighted co-occurrence graph of documents from the baskets.

    Every pair of documents found in the same basket is connected by an edge;
    the edge ``weight`` is the number of baskets in which the pair occurs.

    Args:
        baskets: Mapping of basket key -> collection of document names.

    Returns:
        networkx.Graph: Nodes carry a ``series`` attribute; edges carry
        ``weight`` (int), ``label`` ("<doc>-<doc>") and ``sameseries``
        (the string "True" or "False").
    """
    graph = nx.Graph()
    for docs in baskets.values():
        for first, second in combinations(docs, 2):
            series_first = _series_of(first)
            series_second = _series_of(second)

            # ensure nodes exist, without overwriting existing attributes
            for doc, series in ((first, series_first), (second, series_second)):
                if doc not in graph:
                    graph.add_node(doc, series=series)

            # increment the weight of an existing link, otherwise start at 1
            if graph.has_edge(first, second):
                weight = graph.edges[first, second]['weight'] + 1
            else:
                weight = 1
            graph.add_edge(
                first,
                second,
                weight=weight,
                label="{}-{}".format(first, second),
                sameseries="{}".format(series_first == series_second),
            )
    return graph
            
        

In [None]:
graph = traverse_baskets(trimmed_baskets)

In [None]:
len(graph.edges())

More noise reduction: prune edges between documents that share only one basket, then remove any nodes left without edges.

In [None]:
# Trim graphs with only 1 link
def prune_graph(graph):
    """Drop weak edges and the nodes they leave isolated.

    Edges whose ``weight`` attribute is below 2 are removed, then every node
    with no remaining edges is removed. The graph is modified in place and
    also returned.
    """
    weak_edges = [
        (left, right)
        for left, right in graph.edges()
        if graph.edges()[(left, right)]['weight'] < 2
    ]
    graph.remove_edges_from(weak_edges)
    # Materialise the isolates first: the graph is mutated while removing.
    orphaned = list(nx.isolates(graph))
    graph.remove_nodes_from(orphaned)
    return graph

In [11]:
sample_str = """General. The AEF is the Air Force‟s force generation construct used to manage the battle
rhythm of expeditionary forces in order to meet global CCDR requirements while maintaining
the highest possible level of overall readiness. Through the AEF, the Air Force establishes a
predictable, standardized battle rhythm ensuring rotational forces are properly organized, trained,
equipped, and ready to sustain capabilities while rapidly responding to emerging crises. Through
the AEF, the Air Force supports defense strategy requirements using a combination of both
permanently assigned and rotational (allocated) forces.
1.1.1. How to use this AFI. This AFI governs Unit Type Code (UTC) readiness reporting.
This instruction should be followed in the context of AFI 10-401 Air Force Operations
Planning and Execution, which contains planning considerations for these UTCs and a full
description of AEF concepts. In cases where this AFI and AFI 10-401 contradict, AFI 10-401
will take precedence. Chapter 1 contains the basic description of the AEF and AEF UTC
Reporting Tool (ART). Chapter 2 outlines roles and responsibilities in UTC reporting.
Chapter 3 describes assessment and reporting policies.
1.2. Reporting AEF Readiness - General. To view the health of USAF forces postured in the
UTC Availability (UTA), Air Force senior leadership determined the need to collect UTC
readiness data from ART reportable units in sufficient detail to support the following goals:
1.2.1. Provide HQ USAF, AF Component Commanders, AF components to Joint Force
Providers (JFPs), MAJCOM's, and the Directorate of Air and Space Expeditionary Forces
Operations (AFPC/DPW) readiness information to employ, manage, and sustain AEF
operations.
1.2.2. Provide units a mechanism to report a UTC‟s ability or inability to fulfill its Mission
Capability Statement (MISCAP) across the full range of military operations (ROMO), to
include contingency and rotational operations, and highlight associated deficiencies within
the UTC.
1.2.3. Provide information to aid resource allocation and tasking decisions during steady
state and crisis actions.
1.3. System Description. The AEF UTC Reporting Tool (ART) allows units the ability to
report UTC level readiness data. It provides one central location to archive reported data. It
allows immediate updates and ready access to an aggregate UTC status for all levels of
command with sufficient depth of information to make informed decisions on the employment of
forces for AEF operations. It further provides a means for identifying and analyzing actionable
indicators of change.
1.3.1. ART is a web-based, non-intrusive, html-environment tool with associated databases
to support collection, collation, and report generation of unit and aggregate UTC readiness
data. It resides on the SIPRNET (https://aef.afpc.randolph.af.smil.mil) for secure access.
Units that have standard UTCs postured in UTC Availability."""

In [14]:
# Determine readability of a sentence
# Sentences with longer length and higher average syllable size are less readable
# for each sentence in the corpus..
# build a dict of sentences, where we characterize number of words, average length of words, average syllables/word
from textstat import syllable_count

ImportError: cannot import name 'syllable_count'

In [None]:
def analyze_doc(doc_txt, min_sent_len=12):
    """Print the syllable count and text of each sufficiently long sentence.

    Args:
        doc_txt: Document text; segmented into sentences with ``nlp``.
        min_sent_len: A sentence is reported only when it has more than this
            many tokens (``len`` of the sentence span).

    Returns:
        None. Results are printed.
    """
    # `from textstat import syllable_count` raises ImportError on the
    # installed textstat release, so fall back to the method on the
    # module's `textstat` object.
    try:
        from textstat import syllable_count
    except ImportError:
        from textstat.textstat import textstat as _textstat
        syllable_count = _textstat.syllable_count

    for sent in nlp(doc_txt).sents:
        if len(sent) > min_sent_len:
            print("syllables {}".format(syllable_count(sent.text)))
            print(sent)

In [None]:
analyze_doc(sample_str)