tumblr_backup: Add --save-notes and --cookies options

bbolli · Dec 15, 2018 · e1177d2 · e1177d2
1 parent d8c620f
commit e1177d2
Show file tree

Hide file tree

Showing 2 changed files with 195 additions and 8 deletions.
diff --git a/tumblr_backup.py b/tumblr_backup.py
@@ -5,6 +5,7 @@
 from __future__ import with_statement
 import codecs
 from collections import defaultdict
+import cookielib
 from datetime import datetime
 import errno
 from glob import glob
@@ -23,6 +24,7 @@
 import sys
 import threading
 import time
+import traceback
 import urllib
 import urllib2
 import urlparse
@@ -43,6 +45,22 @@
     from youtube_dl.utils import sanitize_filename
 except ImportError:
     youtube_dl = None
+try:
+    import selenium
+    from selenium import webdriver
+    from selenium.webdriver.firefox.options import Options
+except ImportError:
+    selenium = None
+try:
+    import bs4
+    from bs4 import BeautifulSoup
+except ImportError:
+    bs4 = None
+
+try:
+    from web_crawler import WebCrawler
+except ImportError:
+    pass
 
 # Format of displayed tags
 TAG_FMT = '#%s'
@@ -70,6 +88,9 @@ def test_jpg(h, f):
 save_folder = ''
 media_folder = ''
 
+# web crawler
+crawler = None
+
 # constant names
 root_folder = os.getcwdu()
 post_dir = 'posts'
@@ -782,17 +803,22 @@ def append_try(elt, fmt=u'%s'):
     def get_youtube_url(self, youtube_url):
         # determine the media file name
         filetmpl = u'%(id)s_%(uploader_id)s_%(title)s.%(ext)s'
-        ydl = youtube_dl.YoutubeDL({
+        yt_options = {
             'outtmpl': join(self.media_folder, filetmpl),
-            'quiet': True, 
-            'restrictfilenames': True, 
+            'quiet': True,
+            'restrictfilenames': True,
             'noplaylist': True,
             'continuedl': True,
             'nooverwrites': True,
-            'retries': 3000,		
+            'retries': 3000,
             'fragment_retries': 3000,
             'ignoreerrors': True
-        })
+        }
+
+        # Hand the cookie file to youtube-dl through its own options dict.
+        # `options` is the parsed command line (optparse result) and does not
+        # support item assignment, so it must not be the target here.
+        if options.cookies:
+            yt_options['cookiefile'] = options.cookies
+
+        ydl = youtube_dl.YoutubeDL(yt_options)
         ydl.add_default_info_extractors()
         try:
             result = ydl.extract_info(youtube_url, download=False)
@@ -961,14 +987,27 @@ def get_post(self):
         foot = []
         if self.tags:
             foot.append(u''.join(self.tag_link(t) for t in self.tags))
-        if self.note_count:
-            foot.append(u'%d note%s' % (self.note_count, 's'[self.note_count == 1:]))
         if self.source_title and self.source_url:
             foot.append(u'<a title=Source href=%s>%s</a>' %
                 (self.source_url, self.source_title)
             )
+
+        # Only emit a notes entry when the post has notes, as the code did
+        # before notes saving was added. This keeps '%d' away from a count
+        # that is not a number (e.g. None -- NOTE(review): confirm whether
+        # note_count can be missing) and avoids crawling posts without notes.
+        if self.note_count:
+            notes_str = u'%d note%s' % (self.note_count, 's'[self.note_count == 1:])
+
+            if options.save_notes:
+                foot.append(u'<details><summary>%s</summary>\n' % notes_str)
+                foot.append(u'<ol class="notes">')
+                try:
+                    foot.append(crawler.get_notes(self.url))
+                except Exception:
+                    # Best effort: a failed notes download must not lose the
+                    # post itself. KeyboardInterrupt is deliberately not
+                    # caught here so that Ctrl-C still aborts the backup.
+                    print 'Error getting notes for post %s:' % self.ident
+                    traceback.print_exc()
+                foot.append(u'</ol></details>')
+            else:
+                foot.append(notes_str)
+
         if foot:
-            post += u'\n<footer>%s</footer>' % u' — '.join(foot)
+            post += u'\n<footer>%s</footer>' % u'\n'.join(foot)
         post += '\n</article>\n'
         return post
 
@@ -1127,6 +1166,7 @@ def request_callback(option, opt, value, parser):
     parser.add_option('--save-video', action='store_true', help="save all video files")
     parser.add_option('--save-video-tumblr', action='store_true', help="save only Tumblr video files")
     parser.add_option('--save-audio', action='store_true', help="save audio files")
+    parser.add_option('--save-notes', action='store_true', help="save a list of notes for each post")
     parser.add_option('-j', '--json', action='store_true',
         help="save the original JSON source"
     )
@@ -1186,6 +1226,9 @@ def request_callback(option, opt, value, parser):
     parser.add_option('-S', '--no-ssl-verify', action='store_true',
         help="ignore SSL verification errors"
     )
+    parser.add_option('--cookies', type='string',
+        help="Netscape cookie file (needed for youtube-dl and notes on blogs marked explicit)"
+    )
     options, args = parser.parse_args()
 
     if options.auto is not None and options.auto != time.localtime().tm_hour:
@@ -1215,6 +1258,19 @@ def request_callback(option, opt, value, parser):
         parser.error("--exif: module 'pyexif2' is not installed")
     if options.save_video and not youtube_dl:
         parser.error("--save-video: module 'youtube_dl' is not installed")
+    if options.save_notes:
+        # Check the third-party modules before touching WebCrawler:
+        # web_crawler imports selenium and bs4 at module level and a failed
+        # import of it is silently ignored above, so with a missing module
+        # the name WebCrawler does not exist and using it first would raise
+        # NameError instead of printing the intended error message.
+        if not bs4:
+            parser.error("--save-notes: module 'bs4' is not installed")
+        if not selenium:
+            parser.error("--save-notes: module 'selenium' is not installed")
+        crawler = WebCrawler()
+        if not crawler.find_gecko_driver():
+            parser.error("--save-notes: executable 'geckodriver' is not installed or not on PATH")
+    if options.cookies and not os.access(options.cookies, os.R_OK):
+        parser.error("--cookies: file cannot be read")
+
+    if options.save_notes:
+        crawler.load(options.cookies)
 
     tb = TumblrBackup()
     try:
@@ -1223,4 +1279,7 @@ def request_callback(option, opt, value, parser):
     except KeyboardInterrupt:
         sys.exit(EXIT_INTERRUPT)
 
+    if options.save_notes:
+        crawler.quit()
+
     sys.exit(tb.exit_code())
diff --git a/web_crawler.py b/web_crawler.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+import cookielib
+import os
+import re
+import urllib2
+import urlparse
+
+import selenium
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from bs4 import BeautifulSoup
+
+class WebCrawler:
+    """Fetch the notes of Tumblr posts.
+
+    The post page itself is rendered in a headless Firefox driven through
+    Selenium; follow-up pages of notes are fetched with a urllib2 opener.
+    Call find_gecko_driver() and load() before get_notes(), and quit() when
+    done.
+    """
+
+    def __init__(self):
+        self.gecko_driver = None  # path of the geckodriver executable
+        self.driver = None        # Selenium webdriver, created by load()
+        self.cookies = None       # cookie jar, stays None without a cookie file
+        self.opener = None        # urllib2 opener, created by load()
+
+    def find_gecko_driver(self):
+        """Locate the geckodriver executable on PATH.
+
+        The first match in PATH order wins, as in a shell lookup. The match
+        is stored in self.gecko_driver and returned; if nothing is found,
+        self.gecko_driver is returned unchanged (None on a fresh instance).
+        """
+        names = ['geckodriver']
+        if os.name == 'nt':
+            names.append('geckodriver.exe')
+
+        # Fall back to the OS default search path if PATH is not set at all
+        for path in os.environ.get('PATH', os.defpath).split(os.pathsep):
+            for name in names:
+                try_loc = os.path.join(path, name)
+                # isfile() rejects a directory that is merely named geckodriver
+                if os.path.isfile(try_loc) and os.access(try_loc, os.X_OK):
+                    self.gecko_driver = try_loc
+                    return try_loc
+
+        return self.gecko_driver
+
+    def load(self, cookiefile):
+        """Build the urllib2 opener and start the headless Firefox driver.
+
+        cookiefile is the path of a Netscape/Mozilla format cookie file, or
+        a false value to work without cookies.
+        """
+        self.driver = None
+
+        # Read the cookies before launching Firefox, so that an unreadable or
+        # malformed cookie file does not leave a stray browser process behind.
+        if cookiefile:
+            self.cookies = cookielib.MozillaCookieJar(cookiefile)
+            self.cookies.load(ignore_discard=False, ignore_expires=False)
+
+            # Session cookies are denoted by either `expires` field set to
+            # an empty string or 0. MozillaCookieJar only recognizes the former
+            # (see [1]). So we need to force the latter to be recognized as
+            # session cookies on our own.
+            # Session cookies may be important for cookies-based authentication,
+            # e.g. usually, when user does not check 'Remember me' check box while
+            # logging in on a site, some important cookies are stored as session
+            # cookies so that not recognizing them will result in failed login.
+            # 1. https://bugs.python.org/issue17164
+            for cookie in self.cookies:
+                # Treat `expires=0` cookies as session cookies
+                if cookie.expires == 0:
+                    cookie.expires = None
+                    cookie.discard = True
+
+            cookie_handler = urllib2.HTTPCookieProcessor(self.cookies)
+            redirect_handler = urllib2.HTTPRedirectHandler()
+            self.opener = urllib2.build_opener(cookie_handler, redirect_handler)
+        else:
+            self.cookies = None
+            self.opener = urllib2.build_opener()
+
+        d_options = Options()
+        d_options.set_headless(True)
+        self.driver = webdriver.Firefox(options=d_options, executable_path=self.gecko_driver)
+
+    def quit(self):
+        """Shut down the browser; does nothing if it is not running."""
+        if self.driver is None:
+            return
+        self.driver.quit()
+        self.driver = None
+
+    def load_cookies(self):
+        """Copy the cookies of the cookie file, if any, into the browser."""
+        if not self.cookies:
+            return
+        for cookie in self.cookies:
+            # Setting domain to None automatically instructs most webdrivers to use the domain of the current window
+            # handle
+            cookie_dict = {'domain': None, 'name': cookie.name, 'value': cookie.value, 'secure': cookie.secure}
+            if cookie.expires:
+                cookie_dict['expiry'] = cookie.expires
+            if cookie.path_specified:
+                cookie_dict['path'] = cookie.path
+
+            self.driver.add_cookie(cookie_dict)
+
+    # Selenium
+    def driver_get(self, url):
+        """Open url in the browser with the loaded cookies applied."""
+        self.driver.get(url)
+        if not self.cookies:
+            # Nothing to install, so the page just loaded is already final
+            return
+        # The cookies are added for the domain of the page that is currently
+        # open, so the page is opened first, the cookies are installed and
+        # the page is requested again so that they take effect.
+        self.load_cookies()
+        self.driver.get(url)
+
+    # urllib2
+    def urlopen(self, url):
+        """Open url with the opener built by load() and return the response."""
+        return self.opener.open(url)
+
+    def get_html(self):
+        """Return the HTML of the page currently shown in the browser."""
+        return self.driver.execute_script("return document.documentElement.outerHTML")
+
+    @staticmethod
+    def get_more_link(soup, base):
+        """Return the URL of the next page of notes, or None on the last page.
+
+        soup is the parsed current page, base the scheme and host of the blog.
+        The URL is taken from the onclick handler of the 'more_notes_link'
+        anchor. Raises RuntimeError if the anchor exists but its handler does
+        not contain the expected request.
+        """
+        element = soup.find('a', class_='more_notes_link')
+        if not element:
+            return None
+        # get_attribute_list() yields [None] when the attribute is missing
+        onclick = element.get_attribute_list('onclick')[0] or ''
+        match = re.search(r";tumblrReq\.open\('GET','([^']+)'", onclick)
+        if match is None:
+            raise RuntimeError('Unexpected HTML, cannot find the link to more notes')
+        # urljoin() avoids the doubled slash that plain concatenation gives
+        # when the extracted path is absolute
+        return urlparse.urljoin(base, match.group(1))
+
+    @staticmethod
+    def append_notes(soup, list):
+        """Append the HTML of every note found in soup to list.
+
+        The notes are the <li> children of <ol class="notes">, appended in
+        reverse document order. Raises RuntimeError if the page has no such
+        list. (The parameter name shadows the builtin; it is kept so that
+        existing callers continue to work.)
+        """
+        notes = soup.find('ol', class_='notes')
+        if notes is None:
+            raise RuntimeError('Unexpected HTML, perhaps you need cookies?')
+        # The last <li> is skipped -- presumably it holds the "more notes"
+        # link rather than a note; TODO confirm against the Tumblr markup.
+        notes = notes.find_all('li')[:-1]
+        for n in reversed(notes):
+            list.append(n.prettify())
+
+    def get_notes(self, url):
+        """Return the notes of the post at url as one string of <li> elements.
+
+        The first page of notes comes from the rendered post page; further
+        pages are requested one by one until no 'more notes' link is left.
+        """
+        parsed_uri = urlparse.urlparse(url)
+        base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
+
+        self.driver_get(url)
+        html = self.get_html()
+        soup = BeautifulSoup(html, 'lxml')
+
+        notes_list = []
+        self.append_notes(soup, notes_list)
+
+        while True:
+            more_link = self.get_more_link(soup, base)
+            if not more_link:
+                break
+            # urllib2 responses are not context managers in Python 2, so the
+            # response is closed explicitly instead of via a with statement
+            response = self.urlopen(more_link)
+            try:
+                soup = BeautifulSoup(response, 'lxml')
+            finally:
+                response.close()
+            self.append_notes(soup, notes_list)
+
+        return u''.join(notes_list)