Merge pull request #12 from YaleDHLab/banish-matches
Banish matches
duhaime committed Mar 26, 2021
2 parents b83ac27 + b51bd3e commit 58950eb
Showing 5 changed files with 123 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,7 +1,9 @@
*.pyc
*.pem
*.ipynb
*.DS_Store
*.egg-info
build
dist
output
.ipynb_checkpoints
5 changes: 4 additions & 1 deletion README.md
@@ -15,7 +15,10 @@ python setup.py install

```bash
# search for intertextuality in some documents
intertext --infiles "sample_data/texts/*.txt" --metadata "sample_data/metadata.json" --update_client
intertext \
--infiles "sample_data/texts/*.txt" \
--metadata "sample_data/metadata.json" \
--update_client
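
# banish matches that also appear in certain files (flag added in this PR; a
# sketch using the sample file this commit ships, not a prescribed workflow)
intertext \
--infiles "sample_data/texts/*.txt" \
--banish "sample_data/banished_text.txt" \
--metadata "sample_data/metadata.json"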

# serve output
cd output && python -m http.server
120 changes: 100 additions & 20 deletions intertext/intertext.py
@@ -1,12 +1,16 @@
from networkx.algorithms.components.connected import connected_components
from collections import defaultdict, Hashable
from datasketch import MinHash, MinHashLSH
from collections import defaultdict
from difflib import SequenceMatcher
from unidecode import unidecode
from bs4 import BeautifulSoup
from nltk import ngrams
import multiprocessing
import functools
import distutils
import requests
import argparse
import networkx
import zipfile
import codecs
import shutil
@@ -16,10 +20,14 @@
import os

config = {
'infile_glob': [],
'infile_glob': '',
'banish_glob': '',
'banish_distance': 4,
'output': 'output',
'metadata': {},
'encoding': 'utf8',
'xml_base_tag': None,
'xml_remove_tags': tuple(),
'window_length': 14,
'slide_length': 4,
'permutations': 256,
@@ -29,20 +37,18 @@
'max_file_sim': 0.85,
'client': '0.0.1a',
'update_client': False,
'strip_diacritics': False,
}

'''
TODO:
* add --in_memory flag else write data to disk
* add xml parsing (tag to parse, tags to exclude)
* add files from which matches should be blacklisted
* add GPU acceleration for minhashing
* add --in_memory flag else write data to disk
* add flag to indicate if same-author matches are allowed
* add support for CSV metadata
* add removal of diacritics
* add support for xml + txt in same run
* add unique guid for each output set to avoid overwriting
* add GPU acceleration for minhashing
* add sort by similarity
'''

source_location = os.path.dirname(os.path.realpath(__file__))
@@ -53,6 +59,7 @@ def parse():
description = 'Discover and visualize text reuse'
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--infiles', '-i', type=str, default=config['infile_glob'], dest='infile_glob', help='path to a glob of text files to process', required=True)
parser.add_argument('--banish', '-b', type=str, default=config['banish_glob'], dest='banish_glob', help='path to a glob of text files to remove from matches', required=False)
parser.add_argument('--metadata', '-m', type=str, default=config['metadata'], help='path to a JSON metadata file (see README)', required=False)
parser.add_argument('--encoding', '-e', type=str, default=config['encoding'], help='the encoding of infiles', required=False)
parser.add_argument('--window_length', '-w', type=int, default=config['window_length'], help='the length of windows when processing files (see README)', required=False)
@@ -61,9 +68,13 @@ def parse():
parser.add_argument('--threshold', '-t', type=int, default=config['threshold'], help='the minhash threshold value (see README)', required=False)
parser.add_argument('--min_sim', '-s', type=int, default=config['min_sim'], help='the minimum similarity of matches to retain', required=False)
parser.add_argument('--recall', '-r', type=int, default=config['recall'], help='the recall value to aim for when discovering matches', required=False)
parser.add_argument('--banish_distance', '-bd', type=int, default=config['banish_distance'], help='the graph distance to travel when banishing linked matches', required=False)
parser.add_argument('--max_file_sim', type=int, default=config['max_file_sim'], help='the maximum similarity between two files such that matches are retained', required=False)
parser.add_argument('--output', '-o', type=str, default=config['output'], help='the output location', required=False)
parser.add_argument('--client', '-c', type=str, default=config['client'], help='the client version to fetch and display', required=False)
parser.add_argument('--xml_base_tag', type=str, default=config['xml_base_tag'], help='if specified, text within this parent tag will be parsed', required=False)
parser.add_argument('--xml_remove_tags', type=tuple, default=config['xml_remove_tags'], help='if specified, text within these tags will be removed', required=False)
parser.add_argument('--strip_diacritics', default=config['strip_diacritics'], help='if specified, diacritics will be stripped from texts during processing', required=False, action='store_true')
parser.add_argument('--update_client', default=config['update_client'], help='boolean indicating whether to update the stored client', required=False, action='store_true')
config.update(vars(parser.parse_args()))
if config['update_client']: remove_client(**config)
@@ -129,15 +140,27 @@ def process_texts(**kwargs):
for i in ['matches', 'scatterplots', 'indices']:
if not os.path.exists(os.path.join(kwargs['output'], 'api', i)):
os.makedirs(os.path.join(kwargs['output'], 'api', i))
# identify and store infiles
# identify infiles
infiles = sorted(glob.glob(kwargs['infile_glob']))
if len(infiles) == 0:
raise Exception('No infiles could be found!')
# identify banished files
if kwargs['banish_glob']:
banished_files = sorted(glob.glob(kwargs['banish_glob']))
infiles += banished_files
banished_file_set = set(banished_files)
# store the infiles
with open(os.path.join(kwargs['output'], 'api', 'files.json'), 'w') as out:
json.dump(infiles, out)
# if the user provided metadata, store it in the kwargs
if kwargs.get('metadata'):
kwargs['metadata'] = json.load(open(kwargs['metadata']))
# if the user provided banished files, store their file ids
if kwargs['banish_glob']:
banished_file_ids = set()
for file_idx, file in enumerate(infiles):
if file in banished_file_set:
banished_file_ids.add(file_idx)
# create minhashes
print(' * creating minhashes')
id_d = {} # d[window_id] = [file_id, window_index]
@@ -155,6 +178,7 @@
file_id_a, window_index_a = id_d[i]
for j in lsh.query(minhash_d[i]):
file_id_b, window_index_b = id_d[j]
# store the lower file id as the first key in the candidate match dictionary
if file_id_a < file_id_b:
candidate_d[file_id_a][file_id_b].append([window_index_a, window_index_b])
else:
@@ -175,8 +199,33 @@
if sim >= kwargs['min_sim']:
matches_d[file_id_a][file_id_b].append([window_a, window_b, round(sim, 2)])
del candidate_d
# banish matches
if kwargs['banish_glob']:
print(' * banishing matches')
g = networkx.Graph()
for file_id_a in matches_d:
for file_id_b in matches_d[file_id_a]:
for m in matches_d[file_id_a][file_id_b]:
g.add_edge(f'{file_id_a}.{m[0]}', f'{file_id_b}.{m[1]}')
# collect the file_id.window_index segments that should be banished
banished_set = set()
distances = dict(networkx.all_pairs_shortest_path_length(g))
for i in list(connected_components(g)):
banished_ids = [j for j in i if int(j.split('.')[0]) in banished_file_ids]
# banish any node that lies within the banish distance of a node from a banished file
for j in i:
if any([distances[j][k] < kwargs['banish_distance'] for k in banished_ids]):
banished_set.add(j)
# apply the banish filter
for file_id_a in list(matches_d):
for file_id_b in list(matches_d[file_id_a]):
l = []
for window_a, window_b, sim in matches_d[file_id_a][file_id_b]:
if (f'{file_id_a}.{window_a}' not in banished_set) and \
(f'{file_id_b}.{window_b}' not in banished_set):
l.append([window_a, window_b, sim])
matches_d[file_id_a][file_id_b] = l
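
The banishing pass above links matching windows into a graph, computes all-pairs shortest path lengths, and drops every window that sits within `banish_distance` hops of a window drawn from a banished file. A minimal sketch of the idea with invented window labels (the `file_id.window_index` node format mirrors the code above, but the data is made up):

```python
import networkx

g = networkx.Graph()
g.add_edge('0.5', '2.9')  # a window in file 0 matches a window in file 2
g.add_edge('2.9', '1.3')  # that window in file 2 also matches banished file 1
banished_file_ids = {1}   # pretend file 1 came from --banish
banish_distance = 4

distances = dict(networkx.all_pairs_shortest_path_length(g))
banished_nodes = [n for n in g if int(n.split('.')[0]) in banished_file_ids]
banished_set = {n for n in g
                if any(distances[n].get(b, banish_distance) < banish_distance
                       for b in banished_nodes)}
# all three windows land in banished_set: each lies within 4 hops of '1.3'
```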
# cluster the matches
print(' * clustering matches')
formatted = []
for file_id_a in matches_d:
for file_id_b in matches_d[file_id_a]:
@@ -190,6 +239,8 @@
for a, b, sim in matches_d[file_id_a][file_id_b]:
d[a][b] = sim
# find sequences of windows in a and b
if not matches_d[file_id_a][file_id_b]:
continue
window_as, window_bs, sims = zip(*matches_d[file_id_a][file_id_b])
for a in get_sequences(window_as):
for b in get_sequences(window_bs):
Expand Down Expand Up @@ -244,8 +295,8 @@ def format_matches(file_id_a, file_id_b, clusters, infiles, **kwargs):
c['b'] = a_windows
c['a'] = b_windows
# format the matches
a_words = get_words(infiles[file_id_a], **get_cacheable(kwargs))
b_words = get_words(infiles[file_id_b], **get_cacheable(kwargs))
a_words = get_words(infiles[file_id_a], **get_cacheable(kwargs, {'display': True}))
b_words = get_words(infiles[file_id_b], **get_cacheable(kwargs, {'display': True}))
formatted = []
for c in clusters:
a_strings = get_match_strings(a_words, c['a'], **get_cacheable(kwargs))
@@ -302,7 +353,15 @@ def get_sequences(l):
def get_words(path, **kwargs):
'''Given a file path return a list of strings from that file'''
with codecs.open(path, 'r', kwargs['encoding']) as f:
f = f.read()
if kwargs['xml_base_tag']:
soup = BeautifulSoup(f, 'html.parser').find(kwargs['xml_base_tag'].lower())
if kwargs['xml_remove_tags']:
[t.extract() for tag in kwargs['xml_remove_tags'] for t in soup.find_all(tag)]  # drop unwanted tags before extracting text
f = soup.get_text()
else:
f = f.read()
if kwargs['strip_diacritics'] and not kwargs.get('display', False):
f = unidecode(f)
return f.split()
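
With the new `--strip_diacritics` flag, the matching copy of each text is ASCII-folded via `unidecode`, while the copy fetched with `display=True` keeps the original characters. A tiny illustration of the folding (the sample string is invented):

```python
from unidecode import unidecode

print(unidecode("Mélancolie marquait son âme"))
# -> "Melancolie marquait son ame"
```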

@functools.lru_cache(maxsize=128)
@@ -316,16 +375,21 @@ def get_windows(path, **kwargs):
l.append(' '.join(window))
return l

def get_cacheable(kwargs):
def get_cacheable(*args):
'''Given a dictionary of kwargs return a dictionary with cacheable values retained'''
d = {}
for k in kwargs:
if not isinstance(kwargs[k], list) and not isinstance(kwargs[k], dict):
d[k] = kwargs[k]
return d
kwargs = args[0]
if len(args) > 1:
for i in args[1:]:
kwargs.update(i)
return {k: kwargs[k] for k in kwargs if isinstance(kwargs[k], Hashable)}
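
`get_cacheable` is reworked because helpers such as `get_windows` are wrapped in `functools.lru_cache`, whose arguments must all be hashable; any extra dicts passed in (like the `{'display': True}` used above) are merged, and unhashable values such as the `metadata` dict are dropped. A rough usage sketch, assuming the function above is in scope and with invented option values:

```python
opts = {'encoding': 'utf8', 'strip_diacritics': True, 'metadata': {'a.txt': {'title': 'A'}}}
safe = get_cacheable(opts, {'display': True})
# safe == {'encoding': 'utf8', 'strip_diacritics': True, 'display': True}
# the 'metadata' dict is gone, so **safe can be passed to lru_cache-wrapped helpers
```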

def write_outputs(infiles, formatted, **kwargs):
'''Given a 2D list where sublists are matches between two texts, write all outputs'''
'''
@arg: infiles [str]: list of strings that denote input files in the order they were processed
@arg: formatted [arr]: a 2D array where sublists are matching window indices between two texts
Write the match outputs for a pair of files
'''
print(' * writing outputs to "{}"'.format(kwargs['output']))
# write the subdirectories if necessary
for i in range(len(infiles)):
@@ -378,7 +442,6 @@ def write_outputs(infiles, formatted, **kwargs):
ids = ['{}.{}'.format(i[0], i[1]) for i in sorted(l, key=lambda j: j[idx])]
with open(os.path.join(kwargs['output'], 'api', 'indices', 'match-ids-by-{}.json'.format(label)), 'w') as out:
json.dump(ids, out)

# write the scatterplot data
write_scatterplots(formatted, **kwargs)

@@ -422,5 +485,22 @@ def write_scatterplots(formatted, **kwargs):
with open(os.path.join(out_dir, '{}-{}-{}.json'.format(i, j, k)), 'w') as out:
json.dump(scatterplot_data, out)

def to_graph(l):
'''Given a 2D array, return a networkx.Graph object'''
G = networkx.Graph()
# i is a list of nodes that share edges
for i in l:
G.add_nodes_from(i)
G.add_edges_from(to_edges(i))
return G

def to_edges(l):
'''Given a list of elements that share edges in a graph, iterate those edges pairwise'''
iterator = iter(l)
last = next(iterator)
for current in iterator:
yield last, current
last = current
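
`to_graph` and `to_edges` feed the clustering step: each sublist of linked window ids becomes a chain of edges, and `connected_components` (imported at the top of the file) then groups everything that is transitively linked. A small example with invented ids, assuming the two helpers above are in scope:

```python
linked = [['0.1', '1.7'], ['1.7', '2.4'], ['3.0', '4.2']]
g = to_graph(linked)
clusters = [sorted(c) for c in connected_components(g)]
# clusters (in some order) == [['0.1', '1.7', '2.4'], ['3.0', '4.2']]
```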

if __name__ == '__main__':
parse()
15 changes: 15 additions & 0 deletions sample_data/banished_text.txt
@@ -0,0 +1,15 @@
THE EPITAPH.
Here rests his Head upon the Lap of Earth
A Youth to Fortune and to Fame unknown:
Fair Science frown'd not on his humble Birth,
And Melancholy mark'd him for her own.

Large was his Bounty, and his Soul sincere,
Heav'n did a Recompence as largely send:
He gave to Mis'ry all he had, a Tear:
He gain'd from Heav'n ('twas all he wish'd) a Friend.

No farther seek his Merits to disclose,
Or draw his Frailties from their dread Abode,
(There they alike in trembling Hope repose)
The Bosom of his Father and his God.
2 changes: 2 additions & 0 deletions setup.py
@@ -18,9 +18,11 @@
install_requires=[
'beautifulsoup4==4.5.1',
'datasketch==0.2.6',
'networkx==2.5',
'nltk==3.4.5',
'pymongo==3.3.1',
'requests==2.24.0',
'unidecode==1.2.0',
],
entry_points={
'console_scripts': [