In [10]:
import os
import json
import re
import pysrt # pip install pysrt
import ass # pip install ass

In [18]:
# English: directory of subtitle files and the JSON-lines info file.
DATA_DIR_EN = "/Users/dennybritz/Downloads/daddicts_subs/en"
JSON_PATH_EN = "../../crawlers/daddicts/download_links_en.jsonlines"

# Japanese: directory of subtitle files and the JSON-lines info file.
DATA_DIR_JA = "/Users/dennybritz/Downloads/daddicts_subs/ja"
JSON_PATH_JA = "../../crawlers/daddicts/download_links_ja.jsonlines"

In [5]:
def get_filetype(lines):
  """Returns the filetype based on file contents.

  Only the first line is inspected.

  Args:
    lines: List of text lines of the file (may be empty).

  Returns:
    "SRT" if the first line is just "1" (ignoring surrounding whitespace and
    any byte-order mark), "ASS" if the first line contains "[Script Info]",
    otherwise None. An empty list also yields None.
  """
  if not lines:
    # Empty file: there is no first line to sniff.
    return None
  # Drop the BOM before stripping so whitespace next to it is removed as well.
  first_line = lines[0].replace(u"\ufeff", "").strip()
  if first_line == "1":
    return "SRT"
  if "[Script Info]" in first_line:
    return "ASS"
  return None

In [6]:
def get_filetypes(directory):
  """Returns an iterator of (name, type) for all files in a directory.

  Args:
    directory: Path of the directory to scan (direct children only).

  Yields:
    (full_path, filetype) tuples, where filetype is the result of
    get_filetype on the file's lines. Entries that are not regular files,
    that cannot be read or decoded as text, or that are empty are skipped.
  """
  for filename in os.listdir(directory):
    full_path = os.path.join(directory, filename)
    if not os.path.isfile(full_path):
      # Sub-directories and the like cannot be opened as text files.
      continue
    try:
      with open(full_path) as f:
        lines = f.readlines()
    except (OSError, UnicodeDecodeError):
      # Best effort: unreadable or non-text files are left out of the result.
      continue
    if not lines:
      # Empty files carry no type information.
      continue
    # Yield outside the try block so that exceptions thrown into the generator
    # (GeneratorExit on close(), KeyboardInterrupt) are never swallowed.
    yield (full_path, get_filetype(lines))

In [7]:
# Map every scanned file path to its detected subtitle type, per language.
ja_filetypes_dict = dict(get_filetypes(DATA_DIR_JA))
en_filetypes_dict = dict(get_filetypes(DATA_DIR_EN))

In [8]:
def _paths_with_type(filetypes, wanted):
  """Returns the paths in a {path: filetype} dict whose filetype is `wanted`."""
  return [path for path, kind in filetypes.items() if kind == wanted]

ja_srt_files = _paths_with_type(ja_filetypes_dict, "SRT")
print("Found {} Japanese SRT files".format(len(ja_srt_files)))
ja_ass_files = _paths_with_type(ja_filetypes_dict, "ASS")
print("Found {} Japanese ASS files".format(len(ja_ass_files)))
en_srt_files = _paths_with_type(en_filetypes_dict, "SRT")
print("Found {} English SRT files".format(len(en_srt_files)))
en_ass_files = _paths_with_type(en_filetypes_dict, "ASS")
print("Found {} English ASS files".format(len(en_ass_files)))


Found 3726 Japanese SRT files
Found 198 Japanese ASS files
Found 8898 English SRT files
Found 3943 English ASS files


In [21]:
# Match SRT/ASS file to their json info

# Load json info data about JA/EN subs
def load_json_infos(json_path):
  """Loads a JSON-lines file into a list of parsed objects.

  Args:
    json_path: Path of a text file holding one JSON document per line.

  Returns:
    List with one parsed object per non-blank line, in file order.

  Raises:
    ValueError: If a non-blank line is not valid JSON (json.loads error).
  """
  with open(json_path, encoding="utf-8") as f:
    # Blank lines (e.g. a trailing empty line) are skipped instead of being
    # handed to json.loads, which would reject them.
    return [json.loads(line) for line in f if line.strip()]

#1. Parse json and create a dictionary from id -> info
def create_id_to_info_dict(info_dicts):
  """Indexes info objects by the "id=<digits>" fragment of their URLs.

  Args:
    info_dicts: Iterable of dicts, each with an "srt_urls" list of strings.

  Returns:
    Dict mapping the matched text of every URL that contains "id=<digits>"
    to the info dict the URL belongs to. URLs without a match are ignored;
    when the same key occurs more than once, the last info dict wins.

  NOTE(review): keys are the whole match (e.g. "id=123"), not only the
  digits captured by the group - confirm that this is what consumers expect.
  """
  id_pattern = re.compile(r"id=(\d+)")
  id_to_info = {}
  for entry in info_dicts:
    found = (id_pattern.search(link) for link in entry["srt_urls"])
    id_to_info.update((hit.group(0), entry) for hit in found if hit)
  return id_to_info

# Load the info objects for both languages, then index each list by URL id.
info_ja, info_en = [
  load_json_infos(path) for path in (JSON_PATH_JA, JSON_PATH_EN)]
id_to_info_dict_ja, id_to_info_dict_en = [
  create_id_to_info_dict(infos) for infos in (info_ja, info_en)]

In [30]:
# Names that appear verbatim in both the Japanese and the English infos.
ja_movies = {entry["info"]["name"] for entry in info_ja}
en_movies = {entry["info"]["name"] for entry in info_en}
matches = ja_movies & en_movies
print("Found {} exact name matches".format(len(matches)))

Found 371 exact name matches


In [279]:
def create_subs_hash(filename, filetype, n=5, min_start_ms=25000):
  """Creates a hash string that represents a subtitle file.

  The string is the ", "-joined start timestamps of the first `n` entries
  that start after `min_start_ms`, and can be used to match subs that have
  exactly the same timestamps.

  Args:
    filename: Path of the subtitle file.
    filetype: "SRT" (parsed with pysrt) or "ASS" (parsed with ass).
    n: Maximum number of timestamps that go into the hash.
    min_start_ms: Entries starting at or before this value are ignored
      (the first few subs are usually "brought to you by..."). ASS start
      times are converted to milliseconds before the comparison; the SRT
      `start.ordinal` value is compared against the same threshold.

  Returns:
    The hash string, or None if the file type is unknown, the file cannot
    be parsed, or no entry starts after `min_start_ms`.
  """
  try:
    if filetype == "SRT":
      start_times = [sub.start.ordinal for sub in pysrt.open(filename)]
    elif filetype == "ASS":
      with open(filename) as f:
        doc = ass.parse(f)
      start_times = [
        int(event.start.total_seconds() * 1000) for event in doc.events]
    else:
      # Unknown sub type: treated like an unparsable file.
      return None
  except Exception:
    # Best effort: any parse/read failure means "no hash". Unlike a bare
    # except, this lets KeyboardInterrupt/SystemExit through.
    return None
  kept = [str(ms) for ms in start_times if ms > min_start_ms]
  if not kept:
    # An empty hash would make every such file "match" every other one.
    return None
  return ", ".join(kept[:n])

In [300]:
import tensorflow.python.util

In [None]:
# (scratch cell: an unfinished `tensorflow.python.util.` attribute lookup was
# left here; it is not valid Python and nothing in this notebook uses it)

In [301]:
# Map timestamp hash -> (path, filetype) for the Japanese subs.
# SRT files are inserted first, then ASS files; a later file with the same
# hash replaces an earlier one.
ja_subs_hashes = dict(
  [(create_subs_hash(path, "SRT"), (path, "SRT")) for path in ja_srt_files] +
  [(create_subs_hash(path, "ASS"), (path, "ASS")) for path in ja_ass_files])

In [302]:
# Map timestamp hash -> (path, filetype) for the English subs.
# SRT files are inserted first, then ASS files; a later file with the same
# hash replaces an earlier one.
en_subs_hashes = dict(
  [(create_subs_hash(path, "SRT"), (path, "SRT")) for path in en_srt_files] +
  [(create_subs_hash(path, "ASS"), (path, "ASS")) for path in en_ass_files])

In [303]:
# Drop the None key, i.e. the entry left behind by subs that produced no hash.
ja_subs_hashes = dict(
  (subs_key, entry) for subs_key, entry in ja_subs_hashes.items()
  if subs_key is not None)
en_subs_hashes = dict(
  (subs_key, entry) for subs_key, entry in en_subs_hashes.items()
  if subs_key is not None)

In [309]:
# Pair up JA and EN subs that share exactly the same timestamp hash.
# Each element is ((ja_path, ja_type), (en_path, en_type)), in the iteration
# order of ja_subs_hashes.
matches = [
  (ja_entry, en_subs_hashes[subs_key])
  for subs_key, ja_entry in ja_subs_hashes.items()
  if subs_key in en_subs_hashes
]

In [310]:
# Show the first matched pair: ((ja_path, ja_type), (en_path, en_type)).
matches[0]

(('/Users/dennybritz/Downloads/daddicts_subs/ja/file.php?id=66400', 'SRT'),
 ('/Users/dennybritz/Downloads/daddicts_subs/en/file.php?id=67384', 'ASS'))

In [311]:
# Peek at the very first entry of one English SRT file to see what the
# leading subs look like.
subs = pysrt.open(en_srt_files[5])
sub = subs[0]
# Last expression of the cell: displays the entry's text without tags.
sub.text_without_tags

'Brought to you by HaruHaruSubs'

In [318]:
def _strip_ass_markup(text):
  """Removes ASS override blocks and turns ASS escapes into plain characters."""
  # "{...}" holds override tags/comments that are never shown on screen.
  text = re.sub(r"\{[^}]*\}", "", text)
  # "\N" / "\n" are ASS line breaks, "\h" is a hard space.
  text = text.replace("\\N", "\n").replace("\\n", "\n")
  return text.replace("\\h", " ")


def get_sub_text(filename, filetype, min_start_ms=25000):
  """Returns the plain text of the subs in a file, one string per entry.

  Args:
    filename: Path of the subtitle file.
    filetype: "SRT" (parsed with pysrt) or "ASS" (parsed with ass).
    min_start_ms: Entries starting at or before this value are dropped; the
      default matches the cut-off used when hashing the subs.

  Returns:
    List of entry texts in file order. SRT texts come from pysrt's
    `text_without_tags`; ASS texts have "{...}" override blocks removed and
    "\\N"/"\\n"/"\\h" escapes replaced, so both formats yield markup-free text.

  Raises:
    ValueError: If `filetype` is neither "SRT" nor "ASS".
  """
  if filetype == "SRT":
    return [
      sub.text_without_tags for sub in pysrt.open(filename)
      if sub.start.ordinal > min_start_ms]
  if filetype == "ASS":
    with open(filename) as f:
      doc = ass.parse(f)
    return [
      _strip_ass_markup(event.text) for event in doc.events
      if int(event.start.total_seconds() * 1000) > min_start_ms]
  raise ValueError("Unknown sub type")

In [321]:
OUTPUT_PREFIX = "/Users/dennybritz/Downloads/daddicts_subs/daddicts"

# One output file per language: <prefix>.ja and <prefix>.en
ja_filename = OUTPUT_PREFIX + ".ja"
en_filename = OUTPUT_PREFIX + ".en"

# NOTE(review): the two sides of a match frequently contain a different
# number of entries (see the printed counts), so line i of the .ja file does
# not necessarily correspond to line i of the .en file - confirm whether
# entry-level alignment is needed before using the output as a parallel corpus.
with open(ja_filename, "w", encoding="utf-8") as ja_file, \
     open(en_filename, "w", encoding="utf-8") as en_file:
  for ja_subfile, en_subfile in matches:
    # Flatten multi-line entries so that every entry occupies exactly one line.
    ja_texts = [text.replace("\n", " ") for text in get_sub_text(*ja_subfile)]
    en_texts = [text.replace("\n", " ") for text in get_sub_text(*en_subfile)]
    print(len(ja_texts))
    print(len(en_texts))
    # Terminate every entry with a newline; joining without a trailing
    # newline would glue the last entry of one subtitle file to the first
    # entry of the next.
    ja_file.writelines(text + "\n" for text in ja_texts)
    en_file.writelines(text + "\n" for text in en_texts)

503
452
584
565
700
700
225
248
777
611
612
658
1171
1135
682
771
615
593
443
443
656
656
769
729
668
636
747
674
703
702
898
841
590
590
301
301
517
517
593
571
557
557
717
718
687
685
682
662
518
518
739
688
1412
1323
409
409
690
640
583
576
545
544
663
666
546
479
700
686
1073
1093
642
654
710
784
733
637
603
545
325
264
587
589
609
609
426
414
500
498
769
703
347
301
555
499
627
627
376
376
532
520
511
511
338
338
516
516
639
582
1077
1097
648
651
477
478
258
281
509
475
598
541
671
611
557
485
748
727
478
513
695
650
611
547
498
498
327
305
617
608
618
574
645
605
709
685
553
491
509
510
217
242
1175
1156
705
694
621
609
298
268
577
543
643
623
514
469
593
540
470
470
679
657
480
485
593
593
374
359
515
469
700
649
574
574
615
615
502
444
561
519
545
533
637
636
508
493
480
449
604
590
920
920
663
613
618
618
608
620
559
486
765
712
509
446
788
692
650
650
446
446
282
282
728
671
475
475
326
325
629
580
644
602
394
385
516
518
360
358
759
773
556
509
691
673
688
619
444
444
741
63