Merge pull request #12 from YaleDHLab/banish-matches
Banish matches
duhaime committed Mar 26, 2021
2 parents b83ac27 + b51bd3e commit 58950eb
Showing 5 changed files with 123 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,7 +1,9 @@
*.pyc
*.pem
*.ipynb
*.DS_Store
*.egg-info
build
dist
output
.ipynb_checkpoints
5 changes: 4 additions & 1 deletion README.md
@@ -15,7 +15,10 @@ python setup.py install

```bash
# search for intertextuality in some documents
intertext --infiles "sample_data/texts/*.txt" --metadata "sample_data/metadata.json" --update_client
intertext \
--infiles "sample_data/texts/*.txt" \
--metadata "sample_data/metadata.json" \
--update_client
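
# banish matches that also appear in certain files (flag added in this PR; a
# sketch using the sample file this commit ships, not a prescribed workflow)
intertext \
--infiles "sample_data/texts/*.txt" \
--banish "sample_data/banished_text.txt" \
--metadata "sample_data/metadata.json"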

# serve output
cd output && python -m http.server
120 changes: 100 additions & 20 deletions intertext/intertext.py
@@ -1,12 +1,16 @@
from networkx.algorithms.components.connected import connected_components
from collections import defaultdict, Hashable
from datasketch import MinHash, MinHashLSH
from collections import defaultdict
from difflib import SequenceMatcher
from unidecode import unidecode
from bs4 import BeautifulSoup
from nltk import ngrams
import multiprocessing
import functools
import distutils
import requests
import argparse
import networkx
import zipfile
import codecs
import shutil
@@ -16,10 +20,14 @@
import os

config = {
'infile_glob': [],
'infile_glob': '',
'banish_glob': '',
'banish_distance': 4,
'output': 'output',
'metadata': {},
'encoding': 'utf8',
'xml_base_tag': None,
'xml_remove_tags': tuple(),
'window_length': 14,
'slide_length': 4,
'permutations': 256,
@@ -29,20 +37,18 @@
'max_file_sim': 0.85,
'client': '0.0.1a',
'update_client': False,
'strip_diacritics': False,
}

'''
TODO:
* add --in_memory flag else write data to disk
* add xml parsing (tag to parse, tags to exclude)
* add files from which matches should be blacklisted
* add GPU acceleration for minhashing
* add --in_memory flag else write data to disk
* add flag to indicate if same-author matches are allowed
* add support for CSV metadata
* add removal of diacritics
* add support for xml + txt in same run
* add unique guid for each output set to avoid overwriting
* add GPU acceleration for minhashing
* add sort by similarity
'''

source_location = os.path.dirname(os.path.realpath(__file__))
@@ -53,6 +59,7 @@ def parse():
description = 'Discover and visualize text reuse'
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--infiles', '-i', type=str, default=config['infile_glob'], dest='infile_glob', help='path to a glob of text files to process', required=True)
parser.add_argument('--banish', '-b', type=str, default=config['banish_glob'], dest='banish_glob', help='path to a glob of text files to remove from matches', required=False)
parser.add_argument('--metadata', '-m', type=str, default=config['metadata'], help='path to a JSON metadata file (see README)', required=False)
parser.add_argument('--encoding', '-e', type=str, default=config['encoding'], help='the encoding of infiles', required=False)
parser.add_argument('--window_length', '-w', type=int, default=config['window_length'], help='the length of windows when processing files (see README)', required=False)
@@ -61,9 +68,13 @@ def parse():
parser.add_argument('--threshold', '-t', type=int, default=config['threshold'], help='the minhash threshold value (see README)', required=False)
parser.add_argument('--min_sim', '-s', type=int, default=config['min_sim'], help='the minimum similarity of matches to retain', required=False)
parser.add_argument('--recall', '-r', type=int, default=config['recall'], help='the recall value to aim for when discovering matches', required=False)
parser.add_argument('--banish_distance', '-bd', type=int, default=config['banish_distance'], help='the graph distance to travel when banishing linked matches', required=False)
parser.add_argument('--max_file_sim', type=int, default=config['max_file_sim'], help='the maximum similarity between two files such that matches are retained', required=False)
parser.add_argument('--output', '-o', type=str, default=config['output'], help='the output location', required=False)
parser.add_argument('--client', '-c', type=str, default=config['client'], help='the client version to fetch and display', required=False)
parser.add_argument('--xml_base_tag', type=str, default=config['xml_base_tag'], help='if specified, text within this parent tag will be parsed', required=False)
parser.add_argument('--xml_remove_tags', type=tuple, default=config['xml_remove_tags'], help='if specified, text within these tags will be removed', required=False)
parser.add_argument('--strip_diacritics', default=config['strip_diacritics'], help='if specified, diacritics will be stripped from texts during processing', required=False, action='store_true')
parser.add_argument('--update_client', default=config['update_client'], help='boolean indicating whether to update the stored client', required=False, action='store_true')
config.update(vars(parser.parse_args()))
if config['update_client']: remove_client(**config)
@@ -129,15 +140,27 @@ def process_texts(**kwargs):
for i in ['matches', 'scatterplots', 'indices']:
if not os.path.exists(os.path.join(kwargs['output'], 'api', i)):
os.makedirs(os.path.join(kwargs['output'], 'api', i))
# identify and store infiles
# identify infiles
infiles = sorted(glob.glob(kwargs['infile_glob']))
if len(infiles) == 0:
raise Exception('No infiles could be found!')
# identify banished files
if kwargs['banish_glob']:
banished_files = sorted(glob.glob(kwargs['banish_glob']))
infiles += banished_files
banished_file_set = set(banished_files)
# store the infiles
with open(os.path.join(kwargs['output'], 'api', 'files.json'), 'w') as out:
json.dump(infiles, out)
# if the user provided metadata, store it in the kwargs
if kwargs.get('metadata'):
kwargs['metadata'] = json.load(open(kwargs['metadata']))
# if the user provided banished files, store their file ids
if kwargs['banish_glob']:
banished_file_ids = set()
for file_idx, file in enumerate(infiles):
if file in banished_file_set:
banished_file_ids.add(file_idx)
# create minhashes
print(' * creating minhashes')
id_d = {} # d[window_id] = [file_id, window_index]
@@ -155,6 +178,7 @@
file_id_a, window_index_a = id_d[i]
for j in lsh.query(minhash_d[i]):
file_id_b, window_index_b = id_d[j]
# store the lower file id as the first key in the candidate match dictionary
if file_id_a < file_id_b:
candidate_d[file_id_a][file_id_b].append([window_index_a, window_index_b])
else:
@@ -175,8 +199,33 @@
if sim >= kwargs['min_sim']:
matches_d[file_id_a][file_id_b].append([window_a, window_b, round(sim, 2)])
del candidate_d
# banish matches
if kwargs['banish_glob']:
print(' * banishing matches')
g = networkx.Graph()
for file_id_a in matches_d:
for file_id_b in matches_d[file_id_a]:
for m in matches_d[file_id_a][file_id_b]:
g.add_edge(f'{file_id_a}.{m[0]}', f'{file_id_b}.{m[1]}')
# collect the file_id.window_index segments that should be banished
banished_set = set()
distances = dict(networkx.all_pairs_shortest_path_length(g))
for i in list(connected_components(g)):
banished_ids = [j for j in i if int(j.split('.')[0]) in banished_file_ids]
# banish any node that lies within the banish distance of a node from a banished file
for j in i:
if any([distances[j][k] < kwargs['banish_distance'] for k in banished_ids]):
banished_set.add(j)
# apply the banish filter
for file_id_a in list(matches_d):
for file_id_b in list(matches_d[file_id_a]):
l = []
for window_a, window_b, sim in matches_d[file_id_a][file_id_b]:
if (f'{file_id_a}.{window_a}' not in banished_set) and \
(f'{file_id_b}.{window_b}' not in banished_set):
l.append([window_a, window_b, sim])
matches_d[file_id_a][file_id_b] = l
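
The banishing pass above links matching windows into a graph, computes all-pairs shortest path lengths, and drops every window that sits within `banish_distance` hops of a window drawn from a banished file. A minimal sketch of the idea with invented window labels (the `file_id.window_index` node format mirrors the code above, but the data is made up):

```python
import networkx

g = networkx.Graph()
g.add_edge('0.5', '2.9')  # a window in file 0 matches a window in file 2
g.add_edge('2.9', '1.3')  # that window in file 2 also matches banished file 1
banished_file_ids = {1}   # pretend file 1 came from --banish
banish_distance = 4

distances = dict(networkx.all_pairs_shortest_path_length(g))
banished_nodes = [n for n in g if int(n.split('.')[0]) in banished_file_ids]
banished_set = {n for n in g
                if any(distances[n].get(b, banish_distance) < banish_distance
                       for b in banished_nodes)}
# all three windows land in banished_set: each lies within 4 hops of '1.3'
```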
# cluster the matches
print(' * clustering matches')
formatted = []
for file_id_a in matches_d:
for file_id_b in matches_d[file_id_a]:
@@ -190,6 +239,8 @@
for a, b, sim in matches_d[file_id_a][file_id_b]:
d[a][b] = sim
# find sequences of windows in a and b
if not matches_d[file_id_a][file_id_b]:
continue
window_as, window_bs, sims = zip(*matches_d[file_id_a][file_id_b])
for a in get_sequences(window_as):
for b in get_sequences(window_bs):
Expand Down Expand Up @@ -244,8 +295,8 @@ def format_matches(file_id_a, file_id_b, clusters, infiles, **kwargs):
c['b'] = a_windows
c['a'] = b_windows
# format the matches
a_words = get_words(infiles[file_id_a], **get_cacheable(kwargs))
b_words = get_words(infiles[file_id_b], **get_cacheable(kwargs))
a_words = get_words(infiles[file_id_a], **get_cacheable(kwargs, {'display': True}))
b_words = get_words(infiles[file_id_b], **get_cacheable(kwargs, {'display': True}))
formatted = []
for c in clusters:
a_strings = get_match_strings(a_words, c['a'], **get_cacheable(kwargs))
@@ -302,7 +353,15 @@ def get_sequences(l):
def get_words(path, **kwargs):
'''Given a file path return a list of strings from that file'''
with codecs.open(path, 'r', kwargs['encoding']) as f:
f = f.read()
if kwargs['xml_base_tag']:
soup = BeautifulSoup(f, 'html.parser').find(kwargs['xml_base_tag'].lower())
if kwargs['xml_remove_tags']:
[t.extract() for tag in kwargs['xml_remove_tags'] for t in soup.find_all(tag)]  # drop unwanted tags before extracting text
f = soup.get_text()
else:
f = f.read()
if kwargs['strip_diacritics'] and not kwargs.get('display', False):
f = unidecode(f)
return f.split()
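
With the new `--strip_diacritics` flag, the matching copy of each text is ASCII-folded via `unidecode`, while the copy fetched with `display=True` keeps the original characters. A tiny illustration of the folding (the sample string is invented):

```python
from unidecode import unidecode

print(unidecode("Mélancolie marquait son âme"))
# -> "Melancolie marquait son ame"
```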

@functools.lru_cache(maxsize=128)
@@ -316,16 +375,21 @@ def get_windows(path, **kwargs):
l.append(' '.join(window))
return l

def get_cacheable(kwargs):
def get_cacheable(*args):
'''Given a dictionary of kwargs return a dictionary with cacheable values retained'''
d = {}
for k in kwargs:
if not isinstance(kwargs[k], list) and not isinstance(kwargs[k], dict):
d[k] = kwargs[k]
return d
kwargs = args[0]
if len(args) > 1:
for i in args[1:]:
kwargs.update(i)
return {k: kwargs[k] for k in kwargs if isinstance(kwargs[k], Hashable)}
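
`get_cacheable` is reworked because helpers such as `get_windows` are wrapped in `functools.lru_cache`, whose arguments must all be hashable; any extra dicts passed in (like the `{'display': True}` used above) are merged, and unhashable values such as the `metadata` dict are dropped. A rough usage sketch, assuming the function above is in scope and with invented option values:

```python
opts = {'encoding': 'utf8', 'strip_diacritics': True, 'metadata': {'a.txt': {'title': 'A'}}}
safe = get_cacheable(opts, {'display': True})
# safe == {'encoding': 'utf8', 'strip_diacritics': True, 'display': True}
# the 'metadata' dict is gone, so **safe can be passed to lru_cache-wrapped helpers
```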

def write_outputs(infiles, formatted, **kwargs):
'''Given a 2D list where sublists are matches between two texts, write all outputs'''
'''
@arg: infiles [str]: list of strings that denote input files in the order they were processed
@arg: formatted [arr]: a 2D array where sublists are matching window indices between two texts
Write the match outputs for a pair of files
'''
print(' * writing outputs to "{}"'.format(kwargs['output']))
# write the subdirectories if necessary
for i in range(len(infiles)):
@@ -378,7 +442,6 @@ def write_outputs(infiles, formatted, **kwargs):
ids = ['{}.{}'.format(i[0], i[1]) for i in sorted(l, key=lambda j: j[idx])]
with open(os.path.join(kwargs['output'], 'api', 'indices', 'match-ids-by-{}.json'.format(label)), 'w') as out:
json.dump(ids, out)

# write the scatterplot data
write_scatterplots(formatted, **kwargs)

@@ -422,5 +485,22 @@ def write_scatterplots(formatted, **kwargs):
with open(os.path.join(out_dir, '{}-{}-{}.json'.format(i, j, k)), 'w') as out:
json.dump(scatterplot_data, out)

def to_graph(l):
'''Given a 2D array, return a networkx.Graph object'''
G = networkx.Graph()
# i is a list of nodes that share edges
for i in l:
G.add_nodes_from(i)
G.add_edges_from(to_edges(i))
return G

def to_edges(l):
'''Given a list of elements that share edges in a graph, iterate those edges pairwise'''
iterator = iter(l)
last = next(iterator)
for current in iterator:
yield last, current
last = current
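
`to_graph` and `to_edges` feed the clustering step: each sublist of linked window ids becomes a chain of edges, and `connected_components` (imported at the top of the file) then groups everything that is transitively linked. A small example with invented ids, assuming the two helpers above are in scope:

```python
linked = [['0.1', '1.7'], ['1.7', '2.4'], ['3.0', '4.2']]
g = to_graph(linked)
clusters = [sorted(c) for c in connected_components(g)]
# clusters (in some order) == [['0.1', '1.7', '2.4'], ['3.0', '4.2']]
```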

if __name__ == '__main__':
parse()
15 changes: 15 additions & 0 deletions sample_data/banished_text.txt
@@ -0,0 +1,15 @@
THE EPITAPH.
Here rests his Head upon the Lap of Earth
A Youth to Fortune and to Fame unknown:
Fair Science frown'd not on his humble Birth,
And Melancholy mark'd him for her own.

Large was his Bounty, and his Soul sincere,
Heav'n did a Recompence as largely send:
He gave to Mis'ry all he had, a Tear:
He gain'd from Heav'n ('twas all he wish'd) a Friend.

No farther seek his Merits to disclose,
Or draw his Frailties from their dread Abode,
(There they alike in trembling Hope repose)
The Bosom of his Father and his God.
2 changes: 2 additions & 0 deletions setup.py
@@ -18,9 +18,11 @@
install_requires=[
'beautifulsoup4==4.5.1',
'datasketch==0.2.6',
'networkx==2.5',
'nltk==3.4.5',
'pymongo==3.3.1',
'requests==2.24.0',
'unidecode==1.2.0',
],
entry_points={
'console_scripts': [