In [149]:
import json
import os
import re
from collections import namedtuple

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [156]:
# Launch a Chrome session. The crawl functions in this notebook read this
# module-level `driver` instead of taking it as an argument.
driver = webdriver.Chrome()

In [157]:
# One subtitle topic: its language, the topic url, and the cleaned link text.
SubtitleLink = namedtuple(
  "SubtitleLink",
  ("lang", "url", "name"),
)

In [121]:
def _clean_linkname(name):
  res = name.lower()
  # Remove parantheses
  res = re.sub(r"\(.+?\)", "", res)
  res = re.sub(r"\[.+?\]", "", res)
  return res.strip()

In [125]:
def get_sublinks_for_lang(lang):
  """Finds forum topic urls for all subtitles in the given language.

  Loads the D-Addicts subtitle index in the module-level `driver`, locates the
  element whose text is "<lang> Subtitles" and yields one SubtitleLink for
  every anchor inside that element's parent div.

  Args:
    lang: Language name exactly as it appears in the heading text,
      e.g. "Japanese".

  Yields:
    SubtitleLink with `lang`, the anchor's href as `url`, and the cleaned
    anchor text as `name`.

  Raises:
    AssertionError: if the loaded page title does not contain "D-Addicts".
  """
  # NOTE(review): the url carries a hard-coded session id and a "#Japanese"
  # fragment for every language. The section is selected by the XPath below,
  # so this is presumably harmless -- confirm before changing it.
  driver.get("http://www.d-addicts.com/forums/page/subtitles?sid=c00e06662e59c449c2b2814b22e7bc90#Japanese")
  assert "D-Addicts" in driver.title, "unexpected page title: {!r}".format(driver.title)

  # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
  # removed in Selenium 4.
  elements = driver.find_elements(
    By.XPATH,
    """//*[text()="{} Subtitles"]//parent::div//a""".format(lang))
  # Read href/text from every element before the first yield. The elements
  # belong to the page currently loaded in the shared driver, so a consumer
  # that navigates the driver between items would otherwise hit stale elements.
  sublinks = [
    SubtitleLink(
      lang=lang,
      url=ele.get_attribute("href"),
      name=_clean_linkname(ele.text))
    for ele in elements]
  for sl in sublinks:
    yield sl

In [126]:
# Find all movies with JP subs
ja_links = list(get_sublinks_for_lang("Japanese"))
ja_links_dict = { l.name: l for l in ja_links }
print("Found {} Japanese subs".format(len(ja_links)))

Found 737 Japanese subs


In [134]:
# Show the first Japanese link as a field -> value mapping
ja_links[0]._asdict()

OrderedDict([('lang', 'Japanese'),
             ('url', 'http://www.d-addicts.com/forums/viewtopic.php?t=100408'),
             ('name', '11 nin mo iru!')])

In [127]:
# Find all movies with EN subs
en_links = list(get_sublinks_for_lang("English"))
en_links_dict = { l.name: l for l in en_links }
print("Found {} English subs".format(len(en_links)))

Found 1883 English subs


In [135]:
def _get_download_links(url):
  """Finds all download links on a page.

  Navigates the module-level `driver` to `url` and yields the href of every
  anchor whose href contains the substring "download".

  Args:
    url: Page to load.

  Yields:
    The href attribute of each matching anchor.
  """
  driver.get(url)
  # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
  # removed in Selenium 4.
  anchors = driver.find_elements(By.XPATH, """//a[contains(@href, 'download')]""")
  # Snapshot the hrefs before the first yield so the results stay valid even
  # if the caller navigates the shared driver while still iterating.
  hrefs = [anchor.get_attribute("href") for anchor in anchors]
  for href in hrefs:
    yield href

In [154]:
def write_file(filename, links):
  """Crawls download links for `links` and appends them to a JSON-lines file.

  Every written line is a JSON object of the form
  {"info": <link fields>, "srt_urls": [<download url>, ...]}. Links whose url
  is already recorded in `filename`, or was already written earlier in this
  call, are skipped, so calling this again after an interruption continues
  where the previous run stopped.

  Args:
    filename: Path of the JSON-lines file. It is read first if it exists and
      then opened in append mode.
    links: Iterable of records exposing `.url` and `._asdict()`.
  """
  # Load existing records
  crawled_urls = set()
  if os.path.exists(filename):
    with open(filename, "r") as fh:
      for line in fh:
        # A blank line would make json.loads raise; skip it instead.
        if line.strip():
          crawled_urls.add(json.loads(line)["info"]["url"])
  print("Found {} existing records.".format(len(crawled_urls)))

  with open(filename, "a") as fh:
    for link in links:
      if link.url in crawled_urls:
        print("Skipping {}".format(link.url))
        continue
      srt_urls = list(_get_download_links(link.url))
      json_str = json.dumps({
          "info": link._asdict(),
          "srt_urls": srt_urls
      })
      fh.write(json_str + "\n")
      # Flush after every record so finished work is on disk if the crawl dies.
      fh.flush()
      # Remember the url so a duplicate later in `links` is not crawled and
      # written a second time.
      crawled_urls.add(link.url)
      print(link.url)

In [162]:
# The Japanese crawl is disabled here; only the English links are processed.
# write_file("./download_links_ja.jsonlines", ja_links)
write_file("./download_links_en.jsonlines", en_links)

Found 1437 existing records.
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=68496
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=6883
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=36008
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=100431
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=40308
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=17545
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=12507
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=162943
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=27785
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=64051
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=133022
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=161125
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=109566
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=141249
Skipping http://www.d-addicts.com/forums/viewtopic.php?t=160903
Skipping http://www.