In [137]:
from collections import namedtuple
import re
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [139]:
# Start a Chrome browser session; this module-level `driver` is the one the
# scraping functions in this notebook call `.get()` / element lookups on.
driver = webdriver.Chrome()

In [119]:
# One scraped link: the subtitle language, the link's URL, and its cleaned title
SubtitleLink = namedtuple("SubtitleLink", "lang url name")

In [120]:
# driver.get("http://www.d-addicts.com/forums/page/subtitles?sid=c00e06662e59c449c2b2814b22e7bc90#Japanese")
# assert "D-Addicts" in driver.title

In [121]:
def _clean_linkname(name):
  res = name.lower()
  # Remove parantheses
  res = re.sub(r"\(.+?\)", "", res)
  res = re.sub(r"\[.+?\]", "", res)
  return res.strip()

In [125]:
def get_sublinks_for_lang(lang):
  """Finds forum topic urls for all subtitles in the given language.

  Loads the D-Addicts subtitle index page in the module-level `driver` and
  yields one record per link located relative to the "<lang> Subtitles"
  heading. This is a generator, so the page is only loaded once iteration
  starts.

  Args:
    lang: Language name as it appears in the page heading, e.g. "Japanese".

  Yields:
    SubtitleLink holding `lang`, the link's href, and the cleaned link text.

  Raises:
    AssertionError: If the loaded page's title does not contain "D-Addicts".
  """
  # NOTE: the `sid` value and the `#Japanese` fragment are fixed in this URL
  # regardless of `lang`; the language is selected by the XPath below.
  driver.get("http://www.d-addicts.com/forums/page/subtitles?sid=c00e06662e59c449c2b2814b22e7bc90#Japanese")
  assert "D-Addicts" in driver.title

  # Find all links: anchors under the <div> parent associated with the element
  # whose text is exactly "<lang> Subtitles".
  # The locator is passed as ("xpath", ...) because the
  # find_elements_by_xpath shortcut no longer exists in newer Selenium
  # releases, while find_elements(by, value) is available in old and new ones.
  elements = driver.find_elements(
    "xpath",
    """//*[text()="{} Subtitles"]//parent::div//a""".format(lang))
  for ele in elements:
    sl = SubtitleLink(
      lang=lang,
      url=ele.get_attribute("href"),
      name=_clean_linkname(ele.text))
    yield sl

In [126]:
# Collect every Japanese subtitle link, then index the records by cleaned name
ja_links = [sub for sub in get_sublinks_for_lang("Japanese")]
ja_links_dict = dict((sub.name, sub) for sub in ja_links)
print("Found {} Japanese subs".format(len(ja_links)))

Found 737 Japanese subs


In [134]:
# Show the first Japanese record as a dict to check the scraped fields
ja_links[0]._asdict()

OrderedDict([('lang', 'Japanese'),
             ('url', 'http://www.d-addicts.com/forums/viewtopic.php?t=100408'),
             ('name', '11 nin mo iru!')])

In [127]:
# Collect every English subtitle link, then index the records by cleaned name
en_links = [sub for sub in get_sublinks_for_lang("English")]
en_links_dict = dict((sub.name, sub) for sub in en_links)
print("Found {} English subs".format(len(en_links)))

Found 1883 English subs


In [135]:
def _get_download_links(url):
  """Finds all download links on a page.

  Navigates the module-level `driver` to `url` and yields the href of every
  anchor whose href contains the substring "download". This is a generator:
  navigation happens when iteration starts, so consume it before pointing the
  driver at another page.

  Args:
    url: Address of the page to load.

  Yields:
    The href attribute value of each matching anchor.
  """
  driver.get(url)
  # find_elements(by, value) is used instead of the find_elements_by_xpath
  # shortcut, which no longer exists in newer Selenium releases.
  anchors = driver.find_elements("xpath", """//a[contains(@href, 'download')]""")
  for link in anchors:
    yield link.get_attribute("href")

In [143]:
def write_file(filename, links):
  """Appends one JSON line per link to `filename`.

  For each link, the download urls found on the link's page are collected and
  written together with the link's own fields as a single JSON object
  followed by a newline. The file is opened in append mode and flushed after
  every record.

  Args:
    filename: Path of the output file.
    links: Iterable of SubtitleLink records.
  """
  with open(filename, "a") as out:
    for entry in links:
      # Scrape first, then serialize, so each record is written as soon as
      # its page has been processed.
      download_urls = [href for href in _get_download_links(entry.url)]
      record = {"info": entry._asdict(), "srt_urls": download_urls}
      out.write("{}\n".format(json.dumps(record)))
      out.flush()

In [144]:
# Scrape and store the download links, Japanese first and then English
for out_path, sub_links in (
    ("./download_links_ja.jsonlines", ja_links),
    ("./download_links_en.jsonlines", en_links),
):
  write_file(out_path, sub_links)

KeyboardInterrupt: 