# Comments Fetch Indexer

This notebook fetches comments from past Reddit posts and indexes them in the database.

In [77]:
# https://api.pushshift.io/reddit/comment/search?html_decode=true&after=1609480800&subreddit=politicalcompassmemes&q=based%20and&size=100
import requests

# Pushshift comment-search endpoint; fetch_comments sends its GET requests here.
base = "https://api.pushshift.io/reddit/comment/search"

# Fetch comments from the Pushshift web service, optionally bounded by dates.
def fetch_comments(after: "int | str | None" = None, before: "int | str | None" = None, size: int = 100, subreddit: str = "politicalcompassmemes", timeout: float = 30) -> list:
    """Query the comment-search endpoint at ``base`` and return the matches.

    The search is always for the phrase "based and", sorted ascending by
    ``created_utc``, and restricted to the fields
    body/author/created_utc/parent_id/link_id/id.

    Args:
        after: Lower time bound sent as the ``after`` query parameter.
            Omitted from the request when falsy.
        before: Upper time bound sent as the ``before`` query parameter.
            Omitted from the request when falsy.
        size: Value of the ``size`` query parameter (results per request).
        subreddit: Value of the ``subreddit`` query parameter.
        timeout: Seconds to wait for the server before giving up; without
            it ``requests`` would wait indefinitely.

    Returns:
        The list stored under the ``"data"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the JSON response has no ``"data"`` key.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote, urlencode

    params = {
        "subreddit": subreddit,
        "size": size,
        "q": "based and",
        "sort": "asc",
        "sort_type": "created_utc",
        "fields": "body,author,created_utc,parent_id,link_id,id",
    }

    if after:
        params["after"] = after

    if before:
        params["before"] = before

    # Percent-encode every value so characters such as "&" or "=" cannot
    # break the query string. Spaces become %20 and commas stay literal,
    # which is the same URL shape the unencoded join produced for the
    # default arguments.
    payload = urlencode(params, quote_via=quote, safe=",")

    r = requests.get(base, params=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    return r.json()["data"]

# Paginator
The paginator walks through the comments page by page and keeps only the ones we actually need.

In [84]:
def paginator(size: int = 100, start=None, filter=None, page_size: int = 10):
    """Yield chunks of comments until roughly ``size`` have been collected.

    Each iteration fetches one page with ``fetch_comments``, optionally
    passes it through ``filter``, and yields the result if it is non-empty.
    The next page starts after the ``created_utc`` of the last comment on
    the current page.

    Args:
        size: Number of (filtered) comments to collect. The count is only
            checked between pages, so the final chunk may overshoot it.
        start: Initial value for the ``after`` argument of
            ``fetch_comments``; ``None`` means no lower bound.
        filter: Optional callable taking the raw page (a list) and
            returning the list of comments to keep. The name shadows the
            builtin inside this function only.
        page_size: Number of comments requested per page. The default of
            10 matches what the previous expression
            ``10 or max(size, 500)`` always evaluated to.

    Yields:
        Non-empty lists of comments, one per fetched page.
    """
    after = start
    remaining = size

    while remaining > 0:
        comments = fetch_comments(after=after, size=page_size)

        # An empty page means there is nothing left to fetch. Stop here
        # instead of indexing comments[-1] on an empty list.
        if not comments:
            return

        data = filter(comments) if filter else comments
        remaining -= len(data)

        # Advance the cursor from the raw page, not the filtered one, so a
        # page with no matches still moves the search forward.
        after = comments[-1]["created_utc"]

        if data:
            yield data


In [85]:
def clean(data: list) -> list:
    """Return the comments whose body begins with the text "based and".

    The prefix match is case-sensitive and anchored at the very first
    character of ``comment['body']``. Input order is preserved and the
    input list is not modified.
    """
    prefix = "based and"
    return [comment for comment in data if comment['body'].startswith(prefix)]


In [87]:
import json
# Interactive preview: print one filtered chunk at a time as pretty JSON.
# Press Enter for the next chunk, or type "q" to stop.
# 1609459200 is 2021-01-01 00:00:00 UTC.
chunks = paginator(size=50, start=1609459200, filter=clean)
for chunk in chunks:
    rendered = json.dumps(chunk, indent=2)
    print(rendered)
    answer = input()
    if answer == "q":
        break

{'subreddit': 'politicalcompassmemes', 'size': 10, 'q': 'based and', 'sort': 'asc', 'sort_type': 'created_utc', 'fields': 'body,author,created_utc,parent_id,link_id,id', 'after': 1609459200}
https://api.pushshift.io/reddit/comment/search?subreddit=politicalcompassmemes&size=10&q=based%20and&sort=asc&sort_type=created_utc&fields=body,author,created_utc,parent_id,link_id,id&after=1609459200
{'subreddit': 'politicalcompassmemes', 'size': 10, 'q': 'based and', 'sort': 'asc', 'sort_type': 'created_utc', 'fields': 'body,author,created_utc,parent_id,link_id,id', 'after': 1609460108}
https://api.pushshift.io/reddit/comment/search?subreddit=politicalcompassmemes&size=10&q=based%20and&sort=asc&sort_type=created_utc&fields=body,author,created_utc,parent_id,link_id,id&after=1609460108
{'subreddit': 'politicalcompassmemes', 'size': 10, 'q': 'based and', 'sort': 'asc', 'sort_type': 'created_utc', 'fields': 'body,author,created_utc,parent_id,link_id,id', 'after': 1609462013}
https://api.pushshift.io/