In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
import datetime

## MongoDB client

In [2]:
# MongoDB connection settings, read from the environment
# (populated by the load_dotenv() call in the first cell).
MONG_USER = os.getenv('MONG_USER')
MONG_PWD  = os.getenv('MONG_PWD')
MONG_HOST = os.getenv('MONG_HOST')
MONG_PORT = os.getenv('MONG_PORT')
DATABASE = "dataDumpingGround"

from urllib.parse import quote_plus

from pymongo import MongoClient
from bson.objectid import ObjectId

# Fail early with a readable message; otherwise an unset variable would be
# formatted into the URI as the literal text "None".
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ('MONG_USER', MONG_USER),
        ('MONG_PWD', MONG_PWD),
        ('MONG_HOST', MONG_HOST),
        ('MONG_PORT', MONG_PORT),
    )
    if not setting_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing MongoDB environment variable(s): " + ", ".join(_missing_settings)
    )

# Credentials must be percent-escaped before being embedded in a MongoDB URI;
# a raw '@', ':', '/' or '%' in the user name or password would otherwise
# corrupt the URI.
uri = (
    f"mongodb://{quote_plus(MONG_USER)}:{quote_plus(MONG_PWD)}"
    f"@{MONG_HOST}:{MONG_PORT}/{DATABASE}"
)
client = MongoClient(uri)
db = client[DATABASE]
# NOTE(review): the repr displayed below includes the server host; clear this
# cell's output before sharing the notebook.
db

Database(MongoClient(host=['171.244.50.232:27017'], document_class=dict, tz_aware=False, connect=True), 'dataDumpingGround')

## Reddit PRAW client

In [3]:
# Reddit credentials are looked up in the process environment; each name is
# None when the corresponding variable is not set.

# Application (bot) id and secret.
REDDIT_BOT_ID = os.environ.get('REDDIT_BOT_ID')
REDDIT_BOT_SECRET = os.environ.get('REDDIT_BOT_SECRET')

# Reddit account user name and password.
MY_REDDIT_USERNAME = os.environ.get('MY_REDDIT_USERNAME')
MY_REDDIT_PWD = os.environ.get('MY_REDDIT_PWD')

In [4]:
import praw

# Build the PRAW client from the bot credentials and the account login that
# were read from the environment. The bare name on the last line displays the
# client's repr as the cell output.
reddit = praw.Reddit(
    user_agent='posts scraper',
    client_id=REDDIT_BOT_ID,
    client_secret=REDDIT_BOT_SECRET,
    username=MY_REDDIT_USERNAME,
    password=MY_REDDIT_PWD,
)
reddit

<praw.reddit.Reddit at 0x1a27d8138e0>

# 🎬 Action

In [5]:
from pprint import pprint
from helperfunctions import (
    compress_object,
    prettyprint,
)
from datetime import datetime, timezone, timedelta

## Params

In [6]:
from sys import maxsize as inf

# Account whose profile, submissions and comments are collected.
USERNAME = 'DjMuffinTops'

# Caps passed as `limit=` to the submission and comment listings.
# `inf` is sys.maxsize, used here as an effectively unlimited cap.
COMMENTS_LIM = 100
SUBMISSIONS_LIM = inf

## Get user info

In [7]:
# Handle for the account named by USERNAME; the bare name below displays its
# repr as the cell output.
redditor = reddit.redditor(name=USERNAME)
redditor

Redditor(name='DjMuffinTops')

In [8]:
import copy

# The Redditor object is lazy: reading an attribute such as `id` makes it
# load the profile, so its attribute dictionary is populated before copying.
redditor.id
# Deep copy, so later edits to the dict do not touch the live object.
redditor_dict = copy.deepcopy(vars(redditor))

In [9]:
# https://praw.readthedocs.io/en/latest/code_overview/models/redditor.html
attrs_to_discard = [
    '_reddit', '_fetched', '_listing_use_sort'
]

[redditor_dict.pop(key) for key in attrs_to_discard]
redditor_dict['Last updated'] = datetime.now(tz=timezone(timedelta(hours=+7)))
# redditor_dict

Save the profile to the database (upsert keyed on the redditor's name)

In [10]:
profile_col = db["redditProfiles"]
# One document per redditor: match on the name, insert if it is not there yet.
key = {'name': redditor_dict['name']}
# Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0.
# replace_one() performs the same whole-document replacement with upsert.
# .raw_result is the server's reply dict, displayed as the cell output.
profile_col.replace_one(key, redditor_dict, upsert=True).raw_result

{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

## 🚩 Get submissions

In [11]:
# https://praw.readthedocs.io/en/latest/code_overview/models/submission.html
attrs = [
    'created_utc', 'distinguished', 'edited', 'id', 'is_original_content', 'is_self',
    'link_flair_template_id', 'link_flair_text', 'locked', 'name', 'num_comments', 
    'over_18', 'permalink', 'poll_data', 'score', 'selftext', 'spoiler', 'stickied',
    'title', 'upvote_ratio', 'url',
    'total_awards_received'
]

In [12]:
from tqdm import tqdm

sub_col = db["redditSubmissions"]

# Walk the redditor's submissions, newest first, and upsert one document per
# submission into the collection.
for submission in tqdm( redditor.submissions.new(limit=SUBMISSIONS_LIM) ):
    # compress_object comes from helperfunctions; presumably it keeps only the
    # attributes listed in `attrs` -- confirm against that module.
    sub_obj = compress_object(submission.__dict__, attrs)
    sub_obj['Last updated'] = datetime.now(tz=timezone(timedelta(hours=+7)))
    sub_obj['Author_name'] = submission.author.name # author.name is unique
    sub_obj['Subreddit_name'] = submission.subreddit.display_name # display_name is unique
    # sub_obj['Distinct_award_count'] = len(submission.all_awardings)

    key = {'id': sub_obj['id']}
    # Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0.
    # replace_one() performs the same whole-document replacement with upsert.
    sub_col.replace_one(key, sub_obj, upsert=True)

86it [00:02, 33.49it/s]


## 💬 Get comments

In [13]:
# https://praw.readthedocs.io/en/latest/code_overview/models/comment.html
cmt_attrs = [
    'body', 'body_html', 'created_utc', 'distinguished', 'edited', 'id',
    'is_submitter', 'link_id', 'parent_id', 'permalink', 'saved', 
    'score', 'stickied',
    'name', 'total_awards_received'
]

In [14]:
cmt_col = db["redditComments"]

# Walk the redditor's comments, newest first (at most COMMENTS_LIM), and
# upsert one document per comment into the collection.
for comment in tqdm( redditor.comments.new(limit=COMMENTS_LIM) ):
    # compress_object comes from helperfunctions; presumably it keeps only the
    # attributes listed in `cmt_attrs` -- confirm against that module.
    cmt_obj = compress_object(comment.__dict__, cmt_attrs)
    cmt_obj['Last updated'] = datetime.now(tz=timezone(timedelta(hours=+7)))
    cmt_obj['Author_name'] = comment.author.name
    cmt_obj['Subreddit_name'] = comment.subreddit.display_name

    key = {'id': cmt_obj['id']}
    # Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0.
    # replace_one() performs the same whole-document replacement with upsert.
    cmt_col.replace_one(key, cmt_obj, upsert=True)

100it [00:02, 47.59it/s]
