In [15]:
import topycal
import os
import glob
import random

# Directory that holds the instruction text files. It is resolved against the
# kernel's current working directory, so the notebook has to be started from
# the folder that contains `afi_txt`.
INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [89]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    """Apply ``fn`` to every element of ``iterator`` using a thread pool.

    Parameters
    ----------
    fn : callable
        One-argument function, invoked once per element.
    iterator : iterable
        Elements to process; it is consumed completely.
    num_threads : int or str, default 6
        Number of worker threads. A string such as ``"4"`` is converted
        with ``int``.

    Returns
    -------
    list
        One result per element, in the same order as ``iterator`` yielded
        them (not in completion order), so repeated runs give the same list.

    Raises
    ------
    Exception
        Any exception raised inside ``fn`` is re-raised while the results
        are being collected.
    """
    if isinstance(num_threads, str):
        num_threads = int(num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(fn, elem) for elem in iterator]
    # Leaving the ``with`` block has already waited for every task, so there
    # is nothing to gain from ``as_completed`` here; walking the futures in
    # submission order keeps the output aligned with the input.
    return [future.result() for future in futures]


In [90]:
def get_file_list(limit=500, shuffle=True):
    """Collect the ``afi*.txt`` paths found directly under ``INSTR_PATH``.

    Parameters
    ----------
    limit : int or None, default 500
        Maximum number of paths to return. Any falsy value (``None`` or
        ``0``) disables the cap and returns every match.
    shuffle : bool, default True
        Shuffle the matches with ``random.shuffle`` before the cap is
        applied, so a capped call yields a random subset.

    Returns
    -------
    list of str
        The matching paths.
    """
    pattern = "{}/afi*.txt".format(INSTR_PATH)
    matches = glob.glob(pattern)
    if shuffle:
        random.shuffle(matches)
    return matches[0:limit] if limit else matches
    
def read_file(fname):
    """Return the entire text content of the file at ``fname``.

    The file is opened in text mode with ``errors='replace'``, so bytes that
    cannot be decoded are substituted instead of raising. No encoding is
    passed, which leaves the choice to ``open``'s default.
    """
    with open(fname, errors='replace') as handle:
        text = handle.read()
    return text

In [91]:
# limit=None disables the cap, so every matching file is kept; the list is
# still shuffled because `shuffle` defaults to True.
file_list = get_file_list(limit=None)


In [414]:
import re
import os

def load_file(fname):
    """Read ``fname`` and return ``(basename, text)`` with whitespace collapsed.

    Every run of whitespace (spaces, tabs, newlines, carriage returns,
    vertical tabs, form feeds, ...) is replaced by a single space. Leading
    and trailing whitespace is collapsed too, not stripped, so the text may
    begin or end with one space.

    Parameters
    ----------
    fname : str
        Path of the text file to read.

    Returns
    -------
    tuple of (str, str)
        The file's base name and its whitespace-normalised content.
    """
    with open(fname, 'r') as myfile:
        raw_text = myfile.read()
    # A raw string avoids the invalid "\s" escape warning. ``\s`` already
    # matches tab, newline, CR, VT and FF, so one pass does the work of the
    # former two-step substitution.
    return (os.path.basename(fname), re.sub(r"\s+", ' ', raw_text))

def load_corpus(file_list):
    """Load every path in ``file_list`` into a ``{basename: text}`` dict.

    Files are read through ``do_fn_on_iter(load_file, ...)``. If two paths
    share a base name, the one processed later overwrites the earlier entry.
    """
    corpus = {}
    for loaded in do_fn_on_iter(load_file, file_list):
        corpus[loaded[0]] = loaded[1]
    return corpus

In [415]:
# Maps each file's base name to its whitespace-normalised text.
corpus_dict = load_corpus(file_list)

In [151]:
import spacy
# Shared spaCy pipeline; the sentence-segmenting functions further down use it
# through the global name `nlp`.
nlp = spacy.load("en_core_web_md")

For each document, we segment the text into sentences and place each sentence in a basket keyed by its 64-bit MurmurHash (computed after trimming whitespace, and only for sentences that pass a minimum length threshold).

In [153]:
import mmh3

from tqdm import tqdm
from collections import defaultdict

# Sentences are kept only when `len(sent)` is strictly greater than this; for
# a spaCy sentence span that length is a token count.
MIN_SENT_LEN = 15

def do_market_basket_analysis(doc_dict):
    """Group document names by the sentences they have in common.

    Each document is run through the global ``nlp`` pipeline and split into
    sentences. Every sentence longer than ``MIN_SENT_LEN`` is stripped,
    hashed with ``mmh3.hash64``, and the document's name is added to the
    basket for that hash.

    Parameters
    ----------
    doc_dict : dict
        Maps document name to document text.

    Returns
    -------
    collections.defaultdict
        Maps sentence hash to the set of document names containing it.
    """
    baskets = defaultdict(set)
    for doc_name, doc_text in tqdm(doc_dict.items()):
        long_sentences = (s for s in nlp(doc_text).sents if len(s) > MIN_SENT_LEN)
        for sentence in long_sentences:
            sentence_key = mmh3.hash64(sentence.text.strip())
            baskets[sentence_key].add(doc_name)
    return baskets

In [96]:
# sentence hash -> set of document names containing that sentence.
# The recorded run below took about 22 minutes for 1141 documents.
baskets = do_market_basket_analysis(corpus_dict)

100%|██████████| 1141/1141 [21:49<00:00,  1.15s/it]


Baskets smaller than a certain minimum size are then discarded; the logic here is to reduce noise.

In [107]:
# trim baskets

def trim_baskets(baskets, min_size=3):
    """Keep only the baskets holding more than ``min_size`` documents.

    The comparison is strict: a basket with exactly ``min_size`` members is
    dropped.

    Parameters
    ----------
    baskets : dict
        Maps sentence hash to a sized collection of document names.
    min_size : int, default 3
        Baskets must be strictly larger than this to survive.

    Returns
    -------
    dict
        A new dict with the surviving entries; ``baskets`` is not modified.
    """
    kept = {}
    for sentence_key, doc_names in tqdm(baskets.items()):
        if len(doc_names) > min_size:
            kept[sentence_key] = doc_names
    return kept


In [108]:
# trim_baskets compares with a strict `>`, so min_size=4 keeps only sentences
# that appear in at least 5 documents.
trimmed_baskets = trim_baskets(baskets, min_size=4)

100%|██████████| 490156/490156 [00:00<00:00, 991812.79it/s]


In [157]:
# Number of distinct sentence hashes before trimming.
len(baskets)

490156

In [109]:
# Number of sentence hashes that survived trimming.
len(trimmed_baskets)

1785

Now we build a graph from the baskets: every pair of documents in a basket is linked, and the edge weight between two documents is the number of baskets they share. We also determine each document's series here and attach it to the node, so we can compare how links work within and across series.

In [154]:
import networkx as nx
from itertools import combinations
# Build a weighted graph from the baskets

def _series_of(doc_name):
    """Return the series tag: the last two characters before the first '-'."""
    # e.g. afi_12-21 -> 12; afi12-22 -> 12
    return doc_name.split('-')[0][-2:]


def traverse_baskets(baskets):
    """Build a weighted co-occurrence graph of documents from sentence baskets.

    Every unordered pair of documents inside a basket gets an edge; the
    edge ``weight`` is the number of baskets the two documents share.

    Node attribute:
        ``series`` - the series tag derived from the document name.
    Edge attributes:
        ``weight``     - number of shared baskets.
        ``label``      - ``"<doc_a>-<doc_b>"``.
        ``sameseries`` - the string ``"True"`` or ``"False"``.

    Parameters
    ----------
    baskets : dict
        Maps sentence hash to a collection of document-name strings.

    Returns
    -------
    networkx.Graph
        The undirected weighted graph.
    """
    graph = nx.Graph()
    for doc_names in baskets.values():
        # Baskets are sets, whose iteration order can change between runs.
        # Sorting fixes the orientation of each pair so edge labels are
        # reproducible.
        for first, second in combinations(sorted(doc_names), 2):
            # ensure nodes exist
            for doc_name in (first, second):
                if doc_name not in graph:
                    graph.add_node(doc_name, series=_series_of(doc_name))

            if graph.has_edge(first, second):
                # already linked: one more shared basket
                graph.edges[first, second]['weight'] += 1
            else:
                series_same = _series_of(first) == _series_of(second)
                graph.add_edge(
                    first,
                    second,
                    weight=1,
                    label="{}-{}".format(first, second),
                    sameseries="{}".format(series_same),
                )
    return graph
            
        

In [155]:
# Document co-occurrence graph built from the trimmed baskets only.
graph = traverse_baskets(trimmed_baskets)

In [158]:
# Edge count before pruning.
graph.number_of_edges()

14948

More noise reduction: prune edges between documents that share only one basket, along with any nodes left without edges.

In [189]:
# Trim graphs with only 1 link
def prune_graph(graph, min_weight=2):
    """Remove weak edges and then any isolated nodes, modifying ``graph`` in place.

    Parameters
    ----------
    graph : networkx.Graph
        Graph whose edges all carry a ``weight`` attribute.
    min_weight : number, default 2
        Edges with ``weight`` strictly below this value are removed. The
        default reproduces the former hard-coded threshold, which drops
        edges of weight 1.

    Returns
    -------
    networkx.Graph
        The same ``graph`` object, after pruning.

    Raises
    ------
    KeyError
        If an edge has no ``weight`` attribute.
    """
    weak_edges = [
        (node_a, node_b)
        for node_a, node_b, attrs in graph.edges(data=True)
        if attrs['weight'] < min_weight
    ]
    # remove low weight edges
    graph.remove_edges_from(weak_edges)
    # Remove every node left without edges. The isolates are materialised
    # into a list first because the graph is mutated by the removal.
    graph.remove_nodes_from(list(nx.isolates(graph)))
    return graph

In [190]:
# prune_graph edits `graph` in place and returns it, so after this cell
# `pruned_graph` and `graph` refer to the same, already pruned, object.
pruned_graph = prune_graph(graph)

In [188]:
# Edge count after pruning.
pruned_graph.number_of_edges()

4124

Save as GML

In [191]:
# Written with a relative path, i.e. into the kernel's current working directory.
nx.write_gml(pruned_graph,'afi_mba_pruned.gml')

We can also use the trimmed basket keys as a starting point for identifying sentences that are very common in the corpus. If we remove those and then perform a conventional cosine-similarity or other affinity analysis, we are effectively excluding the facsimile sentences. Again, we exclude shorter sentences to reduce noise from outlines, etc.

In [193]:
# Hashes of the sentences that survived basket trimming, i.e. the widely
# reused sentences to strip from the corpus.
redundant_sent_hashes = list(trimmed_baskets.keys())

In [263]:
import mmh3

from tqdm import tqdm
from collections import defaultdict

# Same value as in the basket-building cell; keep the two in sync so both
# passes apply the same sentence-length filter.
MIN_SENT_LEN = 15

def prune_corpus_of_reused_sentences(doc_dict, bad_hashes):
    """Rebuild each document from its sentences, dropping the reused ones.

    Each document is segmented with the global ``nlp`` pipeline. A sentence
    is kept only when it is longer than ``MIN_SENT_LEN`` and the
    ``mmh3.hash64`` of its stripped text is not in ``bad_hashes``. Shorter
    sentences are dropped as well. Kept sentences are joined with single
    spaces.

    Parameters
    ----------
    doc_dict : dict
        Maps document name to document text.
    bad_hashes : iterable
        Hashes of sentences to exclude; any iterable of hashable values.

    Returns
    -------
    dict
        Maps document name to the reduced text. A document with no
        surviving sentence maps to the empty string.
    """
    # Membership is tested once per sentence, so a hashed container replaces
    # a linear scan of the caller's list.
    bad_hashes = frozenset(bad_hashes)
    pruned_dict = {}
    for docname, doctxt in tqdm(doc_dict.items()):
        reduced_sents = []
        for sent in nlp(doctxt).sents:
            if len(sent) <= MIN_SENT_LEN:
                continue
            this_sent = sent.text.strip()
            if mmh3.hash64(this_sent) not in bad_hashes:
                reduced_sents.append(this_sent)
        pruned_dict[docname] = " ".join(reduced_sents)
    return pruned_dict

In [264]:
# document name -> text with the widely reused (and the short) sentences removed.
pruned_dict = prune_corpus_of_reused_sentences(corpus_dict,redundant_sent_hashes)

100%|██████████| 1141/1141 [22:25<00:00,  1.18s/it]


In [400]:
# Peek at one of the document names.
list(pruned_dict)[3]

'afi51-302.txt'

In [401]:
# Sanity check: pruning should have changed the length of this document.
len(corpus_dict[list(pruned_dict)[5]]) != len(pruned_dict[list(pruned_dict)[5]])

True

Simple code to compute the dot product of two docs using TF-IDF.

In [306]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compare_docs(doc1,doc2):
    """Return the TF-IDF dot product between two documents.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    IDF weights reflect only this pair. With the vectorizer's default L2
    row normalisation, the dot product is the cosine similarity.

    Parameters
    ----------
    doc1, doc2 : str
        The two document texts.

    Returns
    -------
    float
        Similarity score between ``doc1`` and ``doc2``.

    Raises
    ------
    ValueError
        Propagated from the vectorizer, e.g. when the texts yield an empty
        vocabulary.
    """
    tfidf = TfidfVectorizer().fit_transform([doc1, doc2])
    # ``@`` is the matrix product for every SciPy sparse type, and
    # ``.toarray()`` replaces the ``.A`` shorthand that newer SciPy removed.
    similarity = (tfidf @ tfidf.T).toarray()
    return similarity[0][1]


Build the similarity graph: link each pair of docs, with their similarity as an edge attribute.

In [374]:
import random
import networkx as nx

from itertools import combinations
# Build a weighted graph from the pruned docs based on cosine similarity

# O(n^2)!!
# should do this more efficiently.. fingerprint method or self-organizing map

def make_similarity_graph(doc_dict,sample=None):
    """Build a graph linking document pairs by their ``compare_docs`` score.

    Every key of ``doc_dict`` becomes a node with a ``series`` attribute
    (the last two characters before the first '-'). Each compared pair gets
    an edge carrying ``weight`` (the similarity), ``label``
    (``"<doc_a>$<doc_b>"``) and ``sameseries`` (a bool).

    Parameters
    ----------
    doc_dict : dict
        Maps document name to document text.
    sample : int or None, default None
        If truthy, only this many randomly chosen pairs are compared;
        otherwise all pairs are, which is quadratic in the document count.

    Returns
    -------
    networkx.Graph
        The similarity graph. Pairs whose comparison raised ``ValueError``
        have no edge.
    """
    graph = nx.Graph()
    pairwise_docs = list(combinations(doc_dict.keys(), 2))
    if sample:
        random.shuffle(pairwise_docs)
        pairwise_docs = pairwise_docs[0:sample]

    for doc_key in doc_dict.keys():
        graph.add_node(doc_key, series=doc_key.split('-')[0][-2:])
    for first, second in tqdm(pairwise_docs):
        try:
            similarity = compare_docs(doc_dict[first], doc_dict[second])
        except ValueError as e:
            # Happens when the vectorizer ends up with an empty vocabulary.
            # Report the actual error and skip the pair.
            print("skipping {}${}: {}".format(first, second, e))
            continue
        series_same = first.split('-')[0][-2:] == second.split('-')[0][-2:]
        graph.add_edge(
            first,
            second,
            weight=similarity,
            label="{}${}".format(first, second),
            sameseries=series_same,
        )
    return graph
            
        

In [404]:
# Character counts of one document: pruned text first, original text second.
key = list(pruned_dict.keys())[1]
print(len(pruned_dict[key]))
print(len(corpus_dict[key]))

18166
24854


In [416]:
# Original (whitespace-normalised) text of the sample document.
print(corpus_dict[key])

 BY ORDER OF THE SECRETARY OF THE AIR FORCE AIR FORCE INSTRUCTION 36-2811 28 NOVEMBER 2014 Personnel CHAPLAIN CORPS AWARDS COMPLIANCE WITH THIS PUBLICATION IS MANDATORY ACCESSIBILITY: Publications and forms are available for downloading or ordering on the e-Publishing website at www.e-Publishing.af.mil. RELEASABILITY: There are no releasability restrictions on this publication. OPR: HQ USAF/HCX Supersedes: AFI 36-2811, 1 February 2006 Certified by: HQ USAF/HC (Ch, Major General Howard D. Stendahl) Pages: 13 This instruction implements AFPD 36-28, Awards and Decorations Programs. It describes Chaplain Corps awards presented to individuals and teams in recognition of their outstanding service or contributions to the Chaplain Corps. It explains award eligibility, nomination criteria, and procedures for nomination and winner selection. This instruction applies to Regular Air Force (RegAF), Air Reserve Component (ARC), and civilian personnel. Ensure that all records created as a result of p

In [417]:
# Pruned text of the same document.
# NOTE(review): the recorded output still contains runs of spaces, unlike the
# corpus text printed above. Presumably pruned_dict was built (In [264])
# before load_file started collapsing whitespace (In [414]); re-run the
# pruning cell to confirm.
print(pruned_dict[key])

BY ORDER OF THE  SECRETARY OF THE AIR FORCE AIR FORCE INSTRUCTION 36-2811 28 Publications and forms are available for downloading or ordering on the e-Publishing website at www.e-Publishing.af.mil. HQ USAF/HCX  Supersedes:   AFI 36-2811, 1 February 2006 Certified by: HQ USAF This  instruction  implements  AFPD  36-28,  Awards  and  Decorations  Programs. It  describes Chaplain  Corps  awards  presented  to  individuals  and  teams  in  recognition  of  their  outstanding service or contributions to the Chaplain Corps. It explains award eligibility, nomination criteria, and  procedures  for  nomination  and  winner  selection. This  instruction  applies  to  Regular  Air Force (RegAF), Air Reserve Component (ARC), and civilian personnel. Ensure that all records created as a result of processes prescribed in this publication are maintained in accordance with Air  Force  Manual  (AFMAN)  33-363,  Management  of  Records,  and  disposed  of  in  accordance with  the  Air  Force  Records  D

In [376]:
# NOTE(review): this call is commented out, yet the cells below read
# `similarity_graph` and nothing else visible here assigns it. On a fresh
# kernel those cells need this line re-enabled. The recorded run beneath was
# interrupted at about 3% of the document pairs.
#similarity_graph = make_similarity_graph(pruned_dict,sample=None)

  3%|▎         | 17683/650370 [06:45<4:01:57, 43.58it/s]

KeyboardInterrupt: 

In [368]:
# Spot-check: the weight stored on one edge should equal a fresh comparison
# of the same two documents.
doc_a, doc_b = list(similarity_graph.edges())[3]
similarity_graph.edges[doc_a, doc_b]['weight'] == compare_docs(pruned_dict[doc_a], pruned_dict[doc_b])

True

In [369]:
# Trim graphs lower similarity
def prune_similarity_graph(graph,floor=0.4):
    """Drop edges whose ``weight`` is below ``floor``, then drop isolated nodes.

    The graph is modified in place, and the same object is returned.

    Parameters
    ----------
    graph : networkx.Graph
        Graph whose edges all carry a ``weight`` attribute.
    floor : float, default 0.4
        Edges with ``weight`` strictly below this value are removed.

    Returns
    -------
    networkx.Graph
        The pruned ``graph``.
    """
    edge_view = graph.edges()
    below_floor = [
        (src, dst)
        for (src, dst) in edge_view
        if edge_view[(src, dst)]['weight'] < floor
    ]
    # Weak edges go first; any node left without an edge is removed after.
    graph.remove_edges_from(below_floor)
    orphaned = list(nx.isolates(graph))
    graph.remove_nodes_from(orphaned)
    return graph

In [370]:
# prune_similarity_graph edits its argument in place, so `pruned_similarity`
# and `similarity_graph` are the same object after this cell.
pruned_similarity = prune_similarity_graph(similarity_graph)

In [371]:
# Written with a relative path, i.e. into the kernel's current working directory.
nx.write_gml(pruned_similarity,'afi_cosine_pruned.gml')