More informative error messages and checks for correct number of .story files
abisee committed May 22, 2017
1 parent c064dc0 commit 6cee801
Showing 1 changed file with 34 additions and 11 deletions.
45 changes: 34 additions & 11 deletions make_datafiles.py
@@ -24,23 +24,34 @@
dm_tokenized_stories_dir = "dm_stories_tokenized"
finished_files_dir = "finished_files"

# These are the number of .story files we expect there to be in cnn_stories_dir and dm_stories_dir
num_expected_cnn_stories = 92579
num_expected_dm_stories = 219506

VOCAB_SIZE = 200000

def tokenize_stories(stories_dir, tokenized_stories_dir):
"""Maps a whole directory of .story files to a tokenized version using Stanford CoreNLP Tokenizer"""
print "Preparing to tokenize %s..." % stories_dir
print "Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir)
stories = os.listdir(stories_dir)
# make IO list file
print "Making list of files to tokenize..."
with open("mapping.txt", "w") as f:
for s in stories:
f.write("%s \t %s\n" % (os.path.join(stories_dir, s), os.path.join(tokenized_stories_dir, s)))
command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-ioFileList', '-preserveLines', 'mapping.txt']
print "Tokenizing %s..." % (stories_dir)
print "Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir)
subprocess.call(command)
print "Finished tokenizing!"
print "Stanford CoreNLP Tokenizer has finished."
os.remove("mapping.txt")

# Check that the tokenized stories directory contains the same number of files as the original directory
num_orig = len(os.listdir(stories_dir))
num_tokenized = len(os.listdir(tokenized_stories_dir))
if num_orig != num_tokenized:
raise Exception("The tokenized stories directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during tokenization?" % (tokenized_stories_dir, num_tokenized, stories_dir, num_orig))
print "Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir)


def read_text_file(text_file):
lines = []
@@ -104,7 +115,7 @@ def get_art_abs(story_file):

def write_to_bin(url_file, out_file, makevocab=False):
"""Reads the tokenized .story files corresponding to the urls listed in the url_file and writes them to a out_file."""
print "Making bin file for %s..." % url_file
print "Making bin file for URLs listed in %s..." % url_file
url_list = read_text_file(url_file)
url_hashes = get_url_hashes(url_list)
story_fnames = [s+".story" for s in url_hashes]
@@ -118,13 +129,18 @@ def write_to_bin(url_file, out_file, makevocab=False):
if idx % 1000 == 0:
print "Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories))

# Find the .story file corresponding to this url
# Look in the tokenized story dirs to find the .story file corresponding to this url
if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):
story_file = os.path.join(cnn_tokenized_stories_dir, s)
elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):
story_file = os.path.join(dm_tokenized_stories_dir, s)
else:
raise Exception("Tried to find tokenized story file %s in both directories %s and %s. Couldn't find it." % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
print "Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir)
# Check again if tokenized stories directories contain correct number of files
print "Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir)
check_num_stories(cnn_tokenized_stories_dir, num_expected_cnn_stories)
check_num_stories(dm_tokenized_stories_dir, num_expected_dm_stories)
raise Exception("Tokenized stories directories %s and %s contain correct number of files but story file %s found in neither." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir, s))

# Get the strings to write to .bin file
article, abstract = get_art_abs(story_file)
@@ -148,7 +164,7 @@ def write_to_bin(url_file, out_file, makevocab=False):
tokens = [t for t in tokens if t!=""] # remove empty
vocab_counter.update(tokens)

print "Finished writing file %s" % out_file
print "Finished writing file %s\n" % out_file

# write vocab to file
if makevocab:
@@ -159,6 +175,11 @@ def write_to_bin(url_file, out_file, makevocab=False):
print "Finished writing vocab file"


def check_num_stories(stories_dir, num_expected):
num_stories = len(os.listdir(stories_dir))
if num_stories != num_expected:
raise Exception("stories directory %s contains %i files but should contain %i" % (stories_dir, num_stories, num_expected))


if __name__ == '__main__':
if len(sys.argv) != 3:
@@ -167,14 +188,16 @@ def write_to_bin(url_file, out_file, makevocab=False):
cnn_stories_dir = sys.argv[1]
dm_stories_dir = sys.argv[2]

# Check the stories directories contain the correct number of .story files
check_num_stories(cnn_stories_dir, num_expected_cnn_stories)
check_num_stories(dm_stories_dir, num_expected_dm_stories)

# Create some new directories
if not os.path.exists(cnn_tokenized_stories_dir): os.makedirs(cnn_tokenized_stories_dir)
if not os.path.exists(dm_tokenized_stories_dir): os.makedirs(dm_tokenized_stories_dir)
if not os.path.exists(finished_files_dir): os.makedirs(finished_files_dir)

print "cnn stories dir: ", cnn_stories_dir
print "dm stories dir: ", dm_stories_dir

# Run stanford tokenizer on both stories dirs
# Run stanford tokenizer on both stories dirs, outputting to tokenized stories directories
tokenize_stories(cnn_stories_dir, cnn_tokenized_stories_dir)
tokenize_stories(dm_stories_dir, dm_tokenized_stories_dir)

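For readers who just want the gist of the change, below is a minimal standalone sketch of the two count checks this commit introduces: a pre-tokenization check that the raw stories directories hold the expected number of .story files, and a post-tokenization check that the tokenizer wrote one output file per input. The expected counts and check_num_stories are taken from the diff above; verify_tokenization and the example paths are illustrative only and not part of make_datafiles.py.

import os

# Expected .story counts for the CNN and Daily Mail story dumps (values from the diff).
num_expected_cnn_stories = 92579
num_expected_dm_stories = 219506

def check_num_stories(stories_dir, num_expected):
    """Raise if stories_dir does not contain exactly num_expected files."""
    num_stories = len(os.listdir(stories_dir))
    if num_stories != num_expected:
        raise Exception(
            "stories directory %s contains %i files but should contain %i"
            % (stories_dir, num_stories, num_expected))

def verify_tokenization(stories_dir, tokenized_stories_dir):
    """Raise if the tokenized directory does not mirror the original file count.

    This is the same comparison that tokenize_stories() now performs after
    running the Stanford CoreNLP tokenizer.
    """
    num_orig = len(os.listdir(stories_dir))
    num_tokenized = len(os.listdir(tokenized_stories_dir))
    if num_orig != num_tokenized:
        raise Exception(
            "%s contains %i files, but it should contain the same number as %s "
            "(which has %i files). Was there an error during tokenization?"
            % (tokenized_stories_dir, num_tokenized, stories_dir, num_orig))

if __name__ == "__main__":
    # Illustrative paths; the real script reads the stories directories from sys.argv.
    check_num_stories("cnn/stories", num_expected_cnn_stories)
    check_num_stories("dailymail/stories", num_expected_dm_stories)
    # ... tokenization would run here ...
    verify_tokenization("cnn/stories", "cnn_stories_tokenized")
    verify_tokenization("dailymail/stories", "dm_stories_tokenized")

With these checks in place the script fails fast when either stories directory is incomplete, instead of surfacing the problem later when write_to_bin cannot find a tokenized story.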
