In [1]:
import os
import shutil
import sys
import subprocess
import requests
import json
import re
import time
import yt_dlp
import html
import time
from datetime import datetime
from dateutil import parser
from bs4 import BeautifulSoup
from urllib.parse import urlparse, quote_plus

DEBUG = 1
REDDIT_DOWNLOAD_DIR = './cache/reddit/json'

def debug_print(level, message):
    if DEBUG >= level:
        print(message)

def ensure_directory_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def check_cache(directory, filename):
    file_path = os.path.join(directory, filename)
    if os.path.exists(file_path):
        debug_print(2, f'Cache hit for {filename} at {directory}')
        return file_path
    return None

In [2]:
headers = {'User-Agent': 'Mozilla/5.0'}
name = 'wallpaper_links'
reddit_domain = 'https://www.reddit.com'
subreddit = 'wallpaperengine'
subreddit_url = f'{reddit_domain}/r/{subreddit}'
search_query = 'can someone animate this?'

def search_reddit(query, after=None, domain=subreddit_url):
    search_query = quote_plus(query)
    url = f'{domain}/search.json?q={search_query}&restrict_sr=1&sort=relevance'
    if after:
        url += f'&after={after}'
    key = f'{search_query}_{after if after else "default"}'
    if not check_cache(REDDIT_DOWNLOAD_DIR, key):
        response = requests.get(url, headers)
        results = response.json()
        if response.status_code == 200:
            # json dump
            file = open(os.path.join(REDDIT_DOWNLOAD_DIR, key), 'w+')
            file.write(json.dumps(results))
            file.close()
    else:
        file = open(os.path.join(REDDIT_DOWNLOAD_DIR, key), 'r')
        results = json.loads(file.read())
        file.close()
    return results            
    
def search_for_reddit_posts(query, after_pages=4):
    search_results = []
    after = None
    for i in range(after_pages):
        results = search_reddit(query, after)
        after = results['data']['after']
        search_results += results['data']['children']
    return search_results

search_results = search_for_reddit_posts(search_query)
search_results[0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'wallpaperengine',
  'selftext': '',
  'author_fullname': 't2_5d52kgpc',
  'saved': False,
  'mod_reason_title': None,
  'gilded': 0,
  'clicked': False,
  'title': 'Can someone animate this?',
  'link_flair_richtext': [{'e': 'text', 't': 'Request'}],
  'subreddit_name_prefixed': 'r/wallpaperengine',
  'hidden': False,
  'pwls': 6,
  'link_flair_css_class': '',
  'downs': 0,
  'thumbnail_height': 78,
  'top_awarded_type': None,
  'hide_score': False,
  'name': 't3_muphzi',
  'quarantine': False,
  'link_flair_text_color': 'dark',
  'upvote_ratio': 0.98,
  'author_flair_background_color': None,
  'ups': 973,
  'total_awards_received': 0,
  'media_embed': {},
  'thumbnail_width': 140,
  'author_flair_template_id': None,
  'is_original_content': False,
  'user_reports': [],
  'secure_media': None,
  'is_reddit_media_domain': True,
  'is_meta': False,
  'category': None,
  'secure_media_embed': {},
  'link_flair_text': 'Reque

In [3]:
properties = ['id', 'title', 'link_flair_text', 'post_hint', 'score', 'created', 'url', 'permalink', 'media', 'is_video']

def parse_search_results(search_results, properties=properties):
  parsed_results = []
  for result in search_results:
    post = result['data']
    if properties:
      post = {key: post[key] for key in properties if key in post}
    parsed_results.append(post)
  return parsed_results


def filter_out_non_images(search_results):
    return [result for result in search_results if 'post_hint' in result and result['post_hint'] == 'image']

search_results = parse_search_results(search_results)
search_results = filter_out_non_images(search_results)
print(f'Found {len(search_results)} results')
search_results[0]

Found 89 results


{'id': 'muphzi',
 'title': 'Can someone animate this?',
 'link_flair_text': 'Request',
 'post_hint': 'image',
 'score': 973,
 'created': 1618922763.0,
 'url': 'https://i.redd.it/xv7ooelxqbu61.png',
 'permalink': '/r/wallpaperengine/comments/muphzi/can_someone_animate_this/',
 'media': None,
 'is_video': False}

In [4]:
def parse_post_content(post_content):
  post_data = post_content[0]['data']['children'][0]['data']
  comments = post_content[1]['data']['children']
  return post_data, comments

# Progressive backoff for retries
MAX_RETRIES = 5
INITIAL_BACKOFF = 1
def get_post_from_search_result(search_result):
  id = search_result['id']
  if check_cache(REDDIT_DOWNLOAD_DIR, id):
    file = open(os.path.join(REDDIT_DOWNLOAD_DIR, id), 'r')
    post_content = json.loads(file.read())
    file.close()
  else:
    permalink = f'{reddit_domain}{search_result["permalink"]}.json'
    retries = 0
    backoff = INITIAL_BACKOFF
    while retries < MAX_RETRIES:
      response = requests.get(permalink, headers)
      if response.status_code == 200:
        break
      else:
        retries += 1
        time.sleep(backoff)
        backoff *= 2  # Progressive backoff
        if retries == MAX_RETRIES:
          raise Exception(f'Failed to get post content for {permalink} after {MAX_RETRIES} retries')
    post_content = response.json()
    file = open(os.path.join(REDDIT_DOWNLOAD_DIR, id), 'w+')
    file.write(json.dumps(post_content))
    file.close()
  post_data, comments = parse_post_content(post_content)
  return post_data, comments

def parse_post_data(post_data):
  properties = ['name', 'url', 'is_video', 'permalink' 'num_comments', 'id', 'author', 'locked', 'media_only', 'is_reddit_media_domain', 'link_flair_text', 'subreddit', 'upvote_ratio', 'domain', 'media_embed', 'ups', 'score', 'downs', 'post_hint', 'created', 'over_18']
  return {key: post_data[key] for key in properties if key in post_data}

mex_reply_depth = 100
max_replies = 100
def flatten_comments(comments, depth=0):
  if depth > mex_reply_depth:
    return []
  flat_comments = []
  for comment in comments:
    flat_comments.append(comment)
    if 'replies' in comment['data'] and 'data' in comment['data']['replies'] and 'children' in comment['data']['replies']['data']:
      flat_comments += flatten_comments(comment['data']['replies']['data']['children'], depth+1)
  return flat_comments

def parse_post_comments(comments):
  # flatten all comments and all replies
  comments_flat = flatten_comments(comments)
  properties=['body', 'body_html', 'permalink', 'ups', 'score', 'downs', 'id', 'author', 'created_utc']
  parsed_comments = []
  for comment in comments_flat:
    parsed_comment = {key: comment['data'][key] for key in properties if key in comment['data']}
    body_html = parsed_comment['body_html'] if 'body_html' in parsed_comment else None
    parsed_comment['links'] = []
    if body_html:
      soup = BeautifulSoup(html.unescape(body_html), 'html.parser')
      parsed_comment['links'] = [a['href'] for a in soup.find_all('a', href=True)]
    parsed_comments.append(parsed_comment)
  return parsed_comments

def get_posts_from_parsed_results(search_results):
  content = []
  for search_result in search_results:
    post_data, comments = get_post_from_search_result(search_result)
    has_links = lambda comment: 'links' in comment and type(comment['links']) == list and len(comment['links']) > 0
    post = parse_post_data(post_data)
    comments = parse_post_comments(comments)
    comments = list(filter(has_links, comments))
    content.append({
      'post': post,
      'comments': comments
    })
  return content

posts = get_posts_from_parsed_results(search_results)
# ignore posts with 0 comments and if post url is defined
posts = list(filter(lambda post: len(post['comments']) > 0 and 'url' in post['post'], posts))
posts[0]

{'post': {'name': 't3_muphzi',
  'url': 'https://i.redd.it/xv7ooelxqbu61.png',
  'is_video': False,
  'id': 'muphzi',
  'author': 'AlligamerIsDad',
  'locked': False,
  'media_only': False,
  'is_reddit_media_domain': True,
  'link_flair_text': 'Request',
  'subreddit': 'wallpaperengine',
  'upvote_ratio': 0.98,
  'domain': 'i.redd.it',
  'media_embed': {},
  'ups': 976,
  'score': 976,
  'downs': 0,
  'post_hint': 'image',
  'created': 1618922763,
  'over_18': False},
 'comments': [{'body': '[https://steamcommunity.com/sharedfiles/filedetails/?id=1547246726](https://steamcommunity.com/sharedfiles/filedetails/?id=1547246726)\n\n[https://steamcommunity.com/sharedfiles/filedetails/?id=1921855792](https://steamcommunity.com/sharedfiles/filedetails/?id=1921855792)\n\n[https://steamcommunity.com/sharedfiles/filedetails/?id=2412473886](https://steamcommunity.com/sharedfiles/filedetails/?id=2412473886)',
   'body_html': '&lt;div class="md"&gt;&lt;p&gt;&lt;a href="https://steamcommunity.com/sh

In [5]:
# filter post["url"], all combos of comments["links"] into pairs
# ignore_domains = ['venmo.com', 'instagram.com', 'gofile.io', 'gumroad.com', 'twitter.com', 'youtube.com/playlist', 'wolframalpha.com', 'discord.gg', 'discord.com', 'facebook.com', 'patreon.com', 'paypal.me', 'paypal.com', 'twitch.tv', 'soundcloud.com', 'open.spotify.com', 'spotify.com', 'apple.com', 'apple.co', 'music.apple.com', 'music.apple.co', 'apps.apple.com', 'apps.apple.co', 'play.google.com', 'play.google.co', 'google.com', 'google.co', 'drive.google.com', 'drive.google.co', 'docs.google.com', 'docs.google.co', 'forms.google.com', 'forms.google.co', 'meet.google.com', 'meet.google.co', 'hangouts.google.com', 'hangouts.google.co', 'calendar.google.com', 'calendar.google.co', 'photos.google.com', 'photos.google.co', 'photos.app.goo.gl', 'photos.app.goo.gl', 'maps.google.com', 'maps.google.co', 'news.google.com', 'news.google.co', 'translate.google.com', 'translate.google.co', 'books.google.com', 'books.google.co', 'shopping.google.com', 'shopping.google.co', 'flights.google.com', 'flights.google.co', 'finance.google.com', 'finance.google.co', 'play.google.com', 'play.google.co', 'podcasts.google.com', 'podcasts.google.co', 'ads.google.com', 'ads.google.co', 'about.google.com', 'about.google.co', 'store.google.com', 'store.google.co', 'support.google.com', 'support.google.co', 'blog.google.com', 'blog.google.co', 'hangouts.google.com', 'hangouts.google.co', 'meet.google.com', 'meet.google.co', 'duo.google.com', 'duo.google.co', 'fi.google.com', 'fi.google.co', 'one.google.com', 'one.google.co', 'photos.google.com', 'photos.google.co', 'docs.google.com', 'docs.google.co', 'drive.google.com', 'drive.google.co', 'calendar.google.com', 'calendar.google.co', 'translate.google.com', 'translate.google.co', 'photos.app.goo.gl', 'photos.app.goo.gl', 'sites.google.com', 'sites.google.co', 'blogger.com', 'blogger.co', 'blogspot.com', 'blogspot.co', 'wordpress.com', 'wordpress.co', 'tumblr.com', 'tumblr.co', 'medium.com', 'medium.co', "reddit.com/message/compose", "reddit.com/r/RemindMeBot", "reddit.com/r/wallpaperengine/comments"]
# def get_pairs(post):
#   pairs = []
#   for comment in post['comments']:
#     for link in comment['links']:
#       if any(domain in link for domain in ignore_domains):
#         continue
#       pairs.append((post['post']['url'], link))
#   return pairs

keep_links = ['i.redd.it', 'youtube.com/watch', 'steamcommunity.com/sharedfiles']

def get_pairs(post):
  pairs = []
  for comment in post['comments']:
    for link in comment['links']:
      if any(domain in link for domain in keep_links):
        pairs.append((post['post']['url'], link))
        return pairs
  return pairs

wallpaper_pairs = []
for post in posts:
  pairs = get_pairs(post)
  wallpaper_pairs += pairs
print(f'Found {len(wallpaper_pairs)} wallpaper pairs')

Found 59 wallpaper pairs


In [6]:
# json serialize wallpapers inline
file='wallpapers.json'
directory = './cache'
ensure_directory_exists(directory)
file_path = os.path.join(directory, file)
file = open(file_path, 'w+')
file.write(json.dumps(wallpaper_pairs))
file.close()

In [7]:
wallpaper_pairs = json.loads(open(file_path, 'r').read())
wallpaper_pairs

[['https://i.redd.it/xv7ooelxqbu61.png',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=1547246726'],
 ['https://i.redd.it/rz5gjjnj2nm91.png',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=2860536335%5D(https://steamcommunity.com/sharedfiles/filedetails/?id=2860536335)'],
 ['https://i.redd.it/a30z1fq8phr91.png',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=2871089644'],
 ['https://i.redd.it/x3r0vke8f6u71.jpg',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=2155933185'],
 ['https://i.redd.it/p3w1xak1om271.png',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=2492155485'],
 ['https://i.redd.it/l4bcywqfdhv81.jpg',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=2799210158'],
 ['https://i.redd.it/ecrg3cyzbpyc1.jpeg',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=3247592115'],
 ['https://i.redd.it/fp0ul8014ho61.jpg',
  'https://steamcommunity.com/sharedfiles/filedetails/?id=2432116780'],
 ['https://i.redd.it/007r