tumblr_backup: Add --save-notes and --cookies options
cebtenzzre committed Dec 15, 2018
1 parent d8c620f commit e1177d2
Showing 2 changed files with 195 additions and 8 deletions.
75 changes: 67 additions & 8 deletions tumblr_backup.py
@@ -5,6 +5,7 @@
from __future__ import with_statement
import codecs
from collections import defaultdict
import cookielib
from datetime import datetime
import errno
from glob import glob
@@ -23,6 +24,7 @@
import sys
import threading
import time
import traceback
import urllib
import urllib2
import urlparse
@@ -43,6 +45,22 @@
    from youtube_dl.utils import sanitize_filename
except ImportError:
    youtube_dl = None
try:
    import selenium
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
except ImportError:
    selenium = None
try:
    import bs4
    from bs4 import BeautifulSoup
except ImportError:
    bs4 = None

try:
    from web_crawler import WebCrawler
except ImportError:
    pass
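
# NOTE: the WebCrawler import must be guarded -- web_crawler.py imports
# selenium and bs4 at module level, so it fails to import whenever either
# dependency is missing. The --save-notes checks below report that case
# cleanly instead of crashing with a NameError.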

# Format of displayed tags
TAG_FMT = '#%s'
@@ -70,6 +88,9 @@ def test_jpg(h, f):
save_folder = ''
media_folder = ''

# web crawler
crawler = None

# constant names
root_folder = os.getcwdu()
post_dir = 'posts'
@@ -782,17 +803,22 @@ def append_try(elt, fmt=u'%s'):
    def get_youtube_url(self, youtube_url):
        # determine the media file name
        filetmpl = u'%(id)s_%(uploader_id)s_%(title)s.%(ext)s'
        yt_options = {
            'outtmpl': join(self.media_folder, filetmpl),
            'quiet': True,
            'restrictfilenames': True,
            'noplaylist': True,
            'continuedl': True,
            'nooverwrites': True,
            'retries': 3000,
            'fragment_retries': 3000,
            'ignoreerrors': True
        }

        if options.cookies:
            yt_options['cookiefile'] = options.cookies

        ydl = youtube_dl.YoutubeDL(yt_options)
        ydl.add_default_info_extractors()
        try:
            result = ydl.extract_info(youtube_url, download=False)
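
(youtube-dl accepts the same Netscape-format file via its 'cookiefile' option, so a single --cookies argument serves both the video downloader and the notes crawler.)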
@@ -961,14 +987,27 @@ def get_post(self):
        foot = []
        if self.tags:
            foot.append(u''.join(self.tag_link(t) for t in self.tags))
        if self.source_title and self.source_url:
            foot.append(u'<a title=Source href=%s>%s</a>' %
                (self.source_url, self.source_title)
            )

        notes_str = u'%d note%s' % (self.note_count, 's'[self.note_count == 1:])

        if options.save_notes:
            foot.append(u'<details><summary>%s</summary>\n' % notes_str)
            foot.append(u'<ol class="notes">')
            try:
                foot.append(crawler.get_notes(self.url))
            except Exception:
                print 'Error getting notes for post %s:' % self.ident
                traceback.print_exc()
            foot.append(u'</ol></details>')
        else:
            foot.append(notes_str)

        if foot:
            post += u'\n<footer>%s</footer>' % u'\n'.join(foot)
        post += '\n</article>\n'
        return post
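
(With --save-notes, the note count becomes the <summary> of a collapsible <details> block whose <ol class="notes"> holds the markup scraped by WebCrawler.get_notes(); the slice 's'[self.note_count == 1:] drops the plural 's' exactly when the count is 1.)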

@@ -1127,6 +1166,7 @@ def request_callback(option, opt, value, parser):
parser.add_option('--save-video', action='store_true', help="save all video files")
parser.add_option('--save-video-tumblr', action='store_true', help="save only Tumblr video files")
parser.add_option('--save-audio', action='store_true', help="save audio files")
parser.add_option('--save-notes', action='store_true', help="save a list of notes for each post")
parser.add_option('-j', '--json', action='store_true',
    help="save the original JSON source"
)
Expand Down Expand Up @@ -1186,6 +1226,9 @@ def request_callback(option, opt, value, parser):
parser.add_option('-S', '--no-ssl-verify', action='store_true',
    help="ignore SSL verification errors"
)
parser.add_option('--cookies', type='string',
    help="Netscape cookie file (needed for youtube-dl and notes on blogs marked explicit)"
)
options, args = parser.parse_args()

if options.auto is not None and options.auto != time.localtime().tm_hour:
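
# A typical invocation of the new options (blog name and cookie file name
# hypothetical):
#   python tumblr_backup.py --save-notes --cookies=cookies.txt example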
@@ -1215,6 +1258,19 @@ def request_callback(option, opt, value, parser):
parser.error("--exif: module 'pyexif2' is not installed")
if options.save_video and not youtube_dl:
parser.error("--save-video: module 'youtube_dl' is not installed")
if options.save_notes:
crawler = WebCrawler()
if not bs4:
parser.error("--save-notes: module 'bs4' is not installed")
if not selenium:
parser.error("--save-notes: module 'selenium' is not installed")
if not crawler.find_gecko_driver():
parser.error("--save-notes: executable 'geckodriver' is not installed or not on PATH")
if options.cookies and not os.access(options.cookies, os.R_OK):
parser.error("--cookies: file cannot be read")

if options.save_notes:
crawler.load(options.cookies)

tb = TumblrBackup()
try:
@@ -1223,4 +1279,7 @@ def request_callback(option, opt, value, parser):
except KeyboardInterrupt:
    sys.exit(EXIT_INTERRUPT)

if options.save_notes:
    crawler.quit()

sys.exit(tb.exit_code())
128 changes: 128 additions & 0 deletions web_crawler.py
@@ -0,0 +1,128 @@
#!/usr/bin/env python
# encoding: utf-8

import cookielib
import os
import re
import urllib2
import urlparse

import selenium
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

class WebCrawler:

    def __init__(self):
        self.gecko_driver = None

    def find_gecko_driver(self):
        # Return the first executable 'geckodriver' found on PATH
        # (also try 'geckodriver.exe' on Windows), or None.
        names = ['geckodriver']
        if os.name == 'nt':
            names.append('geckodriver.exe')
        for path in os.environ["PATH"].split(os.pathsep):
            for name in names:
                try_loc = os.path.join(path, name)
                if os.access(try_loc, os.X_OK):
                    self.gecko_driver = try_loc
                    return self.gecko_driver

        return self.gecko_driver

    def load(self, cookiefile):
        self.driver = None
        d_options = Options()
        d_options.set_headless(True)
        self.driver = webdriver.Firefox(options=d_options, executable_path=self.gecko_driver)

        if cookiefile:
            self.cookies = cookielib.MozillaCookieJar(cookiefile)
            self.cookies.load(ignore_discard=False, ignore_expires=False)

            # Session cookies are denoted by the `expires` field set either to
            # an empty string or to 0. MozillaCookieJar only recognizes the
            # former (see [1]), so we need to force the latter to be treated
            # as session cookies on our own.
            # Session cookies may be important for cookie-based
            # authentication: usually, when the user does not check the
            # 'Remember me' box while logging in to a site, some important
            # cookies are stored as session cookies, and not recognizing them
            # will result in a failed login.
            # 1. https://bugs.python.org/issue17164
            for cookie in self.cookies:
                # Treat `expires=0` cookies as session cookies
                if cookie.expires == 0:
                    cookie.expires = None
                    cookie.discard = True

            cookie_handler = urllib2.HTTPCookieProcessor(self.cookies)
            redirect_handler = urllib2.HTTPRedirectHandler()
            self.opener = urllib2.build_opener(cookie_handler, redirect_handler)
        else:
            self.opener = urllib2.build_opener()
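
    # For reference, a Netscape cookie file is plain text with one cookie per
    # line and seven tab-separated fields (values below are hypothetical):
    #
    #   # Netscape HTTP Cookie File
    #   .tumblr.com  TRUE  /  TRUE  0  sid  abc123
    #
    # i.e. domain, include-subdomains flag, path, secure flag, expires, name,
    # value -- with expires=0 marking a session cookie, as handled above.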

    def quit(self):
        self.driver.quit()
        self.driver = None

    def load_cookies(self):
        for cookie in self.cookies:
            # Setting domain to None automatically instructs most webdrivers
            # to use the domain of the current window handle
            cookie_dict = {'domain': None, 'name': cookie.name, 'value': cookie.value, 'secure': cookie.secure}
            if cookie.expires:
                cookie_dict['expiry'] = cookie.expires
            if cookie.path_specified:
                cookie_dict['path'] = cookie.path

            self.driver.add_cookie(cookie_dict)

    # Selenium
    def driver_get(self, url):
        # Selenium only allows add_cookie() for the domain of the page the
        # browser is currently on, so load the URL once, install the cookies,
        # then load it again with the cookies applied.
        self.driver.get(url)
        self.load_cookies()
        self.driver.get(url)

    # urllib2
    def urlopen(self, url):
        return self.opener.open(url)

    def get_html(self):
        return self.driver.execute_script("return document.documentElement.outerHTML")

    @staticmethod
    def get_more_link(soup, base):
        # The "more notes" link loads the next page of notes via an inline
        # XHR; its onclick handler contains the relative URL of that page.
        element = soup.find('a', class_='more_notes_link')
        if not element:
            return None
        onclick = element.get_attribute_list('onclick')[0]
        return base + re.search(r";tumblrReq\.open\('GET','([^']+)'", onclick).group(1)

    @staticmethod
    def append_notes(soup, notes_list):
        notes = soup.find('ol', class_='notes')
        if notes is None:
            raise RuntimeError('Unexpected HTML, perhaps you need cookies?')
        notes = notes.find_all('li')[:-1]  # drop the trailing "more notes" <li>
        for n in reversed(notes):
            notes_list.append(n.prettify())

    def get_notes(self, url):
        parsed_uri = urlparse.urlparse(url)
        base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        self.driver_get(url)
        html = self.get_html()
        soup = BeautifulSoup(html, 'lxml')

        notes_list = []
        self.append_notes(soup, notes_list)

        # Follow the "more notes" pagination with urllib2; only the first
        # page needs the Selenium-rendered DOM.
        while True:
            more_link = self.get_more_link(soup, base)
            if not more_link:
                break
            response = self.urlopen(more_link)
            try:
                soup = BeautifulSoup(response, 'lxml')
                self.append_notes(soup, notes_list)
            finally:
                response.close()

        return u''.join(notes_list)
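
For context, a minimal sketch of how tumblr_backup.py drives this class (the blog URL and cookie file name are hypothetical):

from web_crawler import WebCrawler

crawler = WebCrawler()
if not crawler.find_gecko_driver():
    raise RuntimeError("geckodriver is not installed or not on PATH")
crawler.load('cookies.txt')  # the cookie file may be None to browse anonymously
try:
    notes_html = crawler.get_notes('https://example.tumblr.com/post/123456')
finally:
    crawler.quit()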
