Do not trust the API's 'posts' and 'liked_count' fields
These fields are incorrect in cases such as private posts changed to
public, and this leads to missed posts when backing up blogs.

For example, create a blog with one private post and one public one,
then publish the private post. tumblr_backup.py will only download one
of the two now-public posts without this patch.

This change also skips only a single post rather than an entire batch
upon parse failures, in a further attempt to avoid dropping posts
unnecessarily.
tu-p committed Apr 7, 2020
1 parent 4961a2f commit dd40a88
Showing 1 changed file with 16 additions and 14 deletions: tumblr_backup.py
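The strategy the diff below switches to can be summarized with a minimal, self-contained sketch. This is only an illustration under assumed names (backup_all, fetch_batch and save_post are hypothetical stand-ins, not functions in tumblr_backup.py): page through fixed-size batches and stop when the API returns an empty batch, instead of stopping at the post count the API reported.

MAX_POSTS = 50  # the Tumblr API returns at most 50 posts per request

def backup_all(fetch_batch, save_post, skip=0, count=None):
    """Page through posts without trusting the API's total-count field.

    fetch_batch(offset, limit) returns a list of posts (possibly empty at the
    end of the blog) or None when a response could not be parsed.
    """
    desired = count + skip if count else None
    saved = 0
    offset = skip
    while desired is None or saved < desired:
        batch = fetch_batch(offset, MAX_POSTS)
        if batch is None:
            offset += 1  # skip a single post on a parse failure, not a whole batch
            continue
        if not batch:
            break  # no more posts; the reported count may have been wrong
        for post in batch:
            save_post(post)
            saved += 1
        offset += MAX_POSTS
    return saved

With a loop like this, a count field that undercounts (for example, after a private post is made public) only affects the progress estimate, not which posts get saved.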
@@ -537,21 +537,24 @@ def backup(self, account):
         if options.likes:
             _get_content = lambda soup: soup['response']['liked_posts']
             blog = {}
-            last_post = resp['liked_count']
+            count_estimate = resp['liked_count']
         else:
             _get_content = lambda soup: soup['response']['posts']
             blog = resp['blog']
-            last_post = blog['posts']
+            count_estimate = blog['posts']
         self.title = escape(blog.get('title', account))
         self.subtitle = blog.get('description', '')
 
         # use the meta information to create a HTML header
         TumblrPost.post_header = self.header(body_class='post')
 
-        # find the post number limit to back up
+        # find the limit of how many posts to back up
         if options.count:
-            last_post = min(last_post, options.count + options.skip)
+            desired_count = options.count + options.skip
+        else:
+            desired_count = None
 
+        # returns whether any posts from this batch were saved
         def _backup(posts):
             for p in sorted(posts, key=lambda x: x['id'], reverse=True):
                 post = post_class(p)
@@ -583,28 +586,27 @@ def _backup(posts):
         # start the thread pool
         backup_pool = ThreadPool()
         try:
-            # Get the JSON entries from the API, which we can only do for max 50 posts at once.
+            # Get the JSON entries from the API, which we can only do for MAX_POSTS posts at once.
             # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
-            last_batch = MAX_POSTS
             i = options.skip
-            while i < last_post:
+            # Download posts until we have `desired_count` (if specified), or until post range responses are empty
+            while not desired_count or self.post_count < desired_count:
-                # find the upper bound
-                j = min(i + MAX_POSTS, last_post)
-                log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))
+                log(account, "Getting posts %d to %d (of %d expected)\r" % (i, i + MAX_POSTS - 1, count_estimate))
 
-                soup = apiparse(base, j - i, i)
+                soup = apiparse(base, MAX_POSTS, i)
                 if soup is None:
-                    i += last_batch # try the next batch
+                    i += 1 # try skipping a post
                     self.errors = True
                     continue
 
                 posts = _get_content(soup)
-                # posts can be empty if we don't backup reblogged posts
+                # `_backup(posts)` can be empty even when `posts` is not if we don't backup reblogged posts
                 if not posts or not _backup(posts):
                     log(account, "Backing up posts found empty set of posts, finishing\r")
                     break
 
-                last_batch = len(posts)
-                i += last_batch
+                i += MAX_POSTS
         except:
             # ensure proper thread pool termination
             backup_pool.cancel()
