Do not trust the API's 'posts' and 'liked_count' fields
These fields are incorrect in cases such as private posts changed to
public, and this leads to missed posts when backing up blogs.

For example, create a blog with one private post and one public one,
then publish the private post. tumblr_backup.py will only download one
of the two now-public posts without this patch.

This change also skips only a single post rather than an entire batch
upon parse failures, in a further attempt to avoid dropping posts
unnecessarily.
tu-p committed Apr 7, 2020
1 parent 4961a2f commit dd40a88
Showing 1 changed file with 16 additions and 14 deletions: tumblr_backup.py
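The strategy the diff below switches to can be summarized with a minimal, self-contained sketch. This is only an illustration under assumed names (backup_all, fetch_batch and save_post are hypothetical stand-ins, not functions in tumblr_backup.py): page through fixed-size batches and stop when the API returns an empty batch, instead of stopping at the post count the API reported.

MAX_POSTS = 50  # the Tumblr API returns at most 50 posts per request

def backup_all(fetch_batch, save_post, skip=0, count=None):
    """Page through posts without trusting the API's total-count field.

    fetch_batch(offset, limit) returns a list of posts (possibly empty at the
    end of the blog) or None when a response could not be parsed.
    """
    desired = count + skip if count else None
    saved = 0
    offset = skip
    while desired is None or saved < desired:
        batch = fetch_batch(offset, MAX_POSTS)
        if batch is None:
            offset += 1  # skip a single post on a parse failure, not a whole batch
            continue
        if not batch:
            break  # no more posts; the reported count may have been wrong
        for post in batch:
            save_post(post)
            saved += 1
        offset += MAX_POSTS
    return saved

With a loop like this, a count field that undercounts (for example, after a private post is made public) only affects the progress estimate, not which posts get saved.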
@@ -537,21 +537,24 @@ def backup(self, account):
         if options.likes:
             _get_content = lambda soup: soup['response']['liked_posts']
             blog = {}
-            last_post = resp['liked_count']
+            count_estimate = resp['liked_count']
         else:
             _get_content = lambda soup: soup['response']['posts']
             blog = resp['blog']
-            last_post = blog['posts']
+            count_estimate = blog['posts']
         self.title = escape(blog.get('title', account))
         self.subtitle = blog.get('description', '')
 
         # use the meta information to create a HTML header
         TumblrPost.post_header = self.header(body_class='post')
 
-        # find the post number limit to back up
+        # find the limit of how many posts to back up
         if options.count:
-            last_post = min(last_post, options.count + options.skip)
+            desired_count = options.count + options.skip
+        else:
+            desired_count = None
 
+        # returns whether any posts from this batch were saved
         def _backup(posts):
             for p in sorted(posts, key=lambda x: x['id'], reverse=True):
                 post = post_class(p)
@@ -583,28 +586,27 @@ def _backup(posts):
         # start the thread pool
         backup_pool = ThreadPool()
         try:
-            # Get the JSON entries from the API, which we can only do for max 50 posts at once.
+            # Get the JSON entries from the API, which we can only do for MAX_POSTS posts at once.
             # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
-            last_batch = MAX_POSTS
             i = options.skip
-            while i < last_post:
+            # Download posts until we have `desired_count` (if specified), or until post range responses are empty
+            while not desired_count or self.post_count < desired_count:
-                # find the upper bound
-                j = min(i + MAX_POSTS, last_post)
-                log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))
+                log(account, "Getting posts %d to %d (of %d expected)\r" % (i, i + MAX_POSTS - 1, count_estimate))
 
-                soup = apiparse(base, j - i, i)
+                soup = apiparse(base, MAX_POSTS, i)
                 if soup is None:
-                    i += last_batch # try the next batch
+                    i += 1 # try skipping a post
                     self.errors = True
                     continue
 
                 posts = _get_content(soup)
-                # posts can be empty if we don't backup reblogged posts
+                # `_backup(posts)` can be empty even when `posts` is not if we don't backup reblogged posts
                 if not posts or not _backup(posts):
                     log(account, "Backing up posts found empty set of posts, finishing\r")
                     break
 
-                last_batch = len(posts)
-                i += last_batch
+                i += MAX_POSTS
         except:
             # ensure proper thread pool termination
             backup_pool.cancel()
