# This Notebook provides a detailed walkthrough of the python_4_reddit media analytics wrapper

## https://github.com/casonk/python_4_reddit

## Import all required packages/modules

In [256]:
from dateutil.relativedelta import relativedelta
from dataclasses import dataclass
import matplotlib.pyplot as plt
from pathlib import Path
import networkx as nx
import seaborn as sns
import pandas as pd
import numpy as np
import zstandard
import requests
import datetime
import time
import math
import json
import re
import os
import csv

### Create query superclass

In [257]:
@dataclass
class query:
    """
    SuperClass for compiling reddit queries.
    ----------
    paramaters
    ----------
    query_type:
        subreddit- query provided subreddit.
        keyword- query all subreddits for provided keyword.
    query: provided subreddit or keyword.
    time_range: dictionary input {'before' : latest post time, 'after' : earliest post time}
        times can be given in unix epoch timestamp or datetime format.
    time_format:
        'unix'- defaults to unix epoch timestamp.
        'datetime'- set this option is specifing time_range in datetime format.
    post_type: selection to query for comments or submissions, defaults to both.
        'comment'- only query comments.
        'submission'- only query submission.
        defaults to query both comments and submissions.
    """

    def __init__(
        self, query_type, query, time_range, time_format="unix", post_type=None
    ):
        """
        Initilization of query object.
        """
        self.type = query_type.lower()
        self.query = query.lower()
        if time_format == "datetime":
            time_range["before"] = int(
                datetime.datetime.timestamp(
                    datetime.datetime.strptime(time_range["before"], "%Y-%m-%d")
                )
            )
            time_range["after"] = int(
                datetime.datetime.timestamp(
                    datetime.datetime.strptime(time_range["after"], "%Y-%m-%d")
                )
            )
        self.before = int(time_range["before"])
        self.before_dt = datetime.datetime.fromtimestamp(self.before)
        self.after = int(time_range["after"])
        self.after_dt = datetime.datetime.fromtimestamp(self.after)
        try:
            self.post_type = post_type.lower()
        except:
            self.post_type = post_type

### Create pushshift file query object

In [258]:
@dataclass
class pushshift_file_query(query):
    """
    Class for compiling pushshift file queries.
    Respective files can be downloaded from : https://files.pushshift.io/reddit/
    ----------
    paramaters
    ----------
    query_type:
        'subreddit'- query provided subreddit.
        'keyword'- query all subreddits for provided keyword.
    query: provided subreddit or keyword.
    time_range: dictionary input {'before' : latest post time, 'after' : earliest post time}
        times can be given in unix epoch timestamp or datetime format.
    time_format:
        'unix'- defaults to unix epoch timestamp.
        'datetime'- set this option is specifing time_range in datetime format.
    post_type: selection to query for comments or submissions, defaults to both.
        'comment'- only query comments.
        'submission'- only query submission.
        defaults to query both comments and submissions.
    """

    def __init__(
        self, query_type, query, time_range, time_format="unix", post_type=None
    ):
        """
        Initilization of query object.
        """
        super().__init__(
            query_type, query, time_range, time_format, post_type
            )
        self.submission_folder_path = Path('F:/Research/Funded/Ethical_Reccomendations/Python/Push_File/Submissions/RS/2019+/')
        self.comment_folder_path = Path('F:/Research/Funded/Ethical_Reccomendations/Python/Push_File/Comments/RC/2019+/')
        self.line_counter = 0
        self.post_counter = 0
        self.file_counter = 0
        self.errors = 0

    def set_parent_folders(self, submission_folder_path, comment_folder_path):
        '''
        Set paths to pushshift files.
        '''
        self.submission_folder_path = Path(submission_folder_path)
        self.comment_folder_path = Path(comment_folder_path)

    def read_lines_zst(self):
        '''
        Helper function for reading from ztandandard compressed ndjson files. 
        '''
        with open(self.working_file, 'rb') as file_handle:
            buffer = ''
            reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
            while True:
                chunk = reader.read(2**27).decode()
                if not chunk:
                    break
                lines = (buffer + chunk).split("\n")

                for line in lines[:-1]:
                    yield line, file_handle.tell()

                buffer = lines[-1]
            reader.close()
        
    def make_query(self):
        """
        Initialize the query.
        """
        self.df = pd.DataFrame(
            columns=[
                "post_type",
                "subreddit",
                "id",
                "parent_id",
                "link_id",
                "url",
                "permalink",
                "created_utc",
                "datetime",
                "score",
                "num_comments",
                "title",
                "body",
                "author",
            ]
        )
        self.submissions = self.df.copy()
        self.comments = self.df.copy()

        def create_common_data(post):
            """
            Helper function to collect values common between both comments and submissions.
            """
            try:
                subreddit = post["subreddit"]
                post_id = post["id"]
                try:
                    parent_id = post["parent_id"]
                except KeyError:
                    parent_id = "nan"
                try:
                    link_id = post["link_id"]
                except KeyError:
                    link_id = "nan"
                try:
                    url = post["url"]
                except KeyError:
                    url = "nan"
                permalink = post["permalink"]
                created_utc = post["created_utc"]
                t = datetime.datetime.fromtimestamp(created_utc)
                date = t.strftime("%m/%d/%Y")
                score = post["score"]
                try:
                    num_comments = post["num_comments"]
                except KeyError:
                    num_comments = "nan"
                try:
                    title = post["title"]
                    title = r"{}".format(title)
                except KeyError:
                    title = "nan"
                author = post["author"]
                author = r"{}".format(author)
                return (
                    subreddit,
                    post_id,
                    parent_id,
                    link_id,
                    url,
                    permalink,
                    date,
                    created_utc,
                    score,
                    num_comments,
                    title,
                    author,
                )
            except KeyboardInterrupt:
                pass

        def search_sumissions(self):
            for line, file_bytes_processed in self.read_lines_zst():
                self.line_counter += 1
                if self.line_counter%1000000 == 0:
                    print('  >> Processed {} Posts, Found {} Posts'.format(
                        self.line_counter, self.post_counter)
                        )
                try:
                    post = json.loads(line) 
                    if self.type == 'subreddit':
                        if int(post['created_utc']) >= int(self.after):
                            if int(post['created_utc']) <= int(self.before):
                                if post["subreddit"] == self.query:
                                    self.post_counter += 1
                                    (
                                        subreddit,
                                        post_id,
                                        parent_id,
                                        link_id,
                                        url,
                                        permalink,
                                        date,
                                        created_utc,
                                        score,
                                        num_comments,
                                        title,
                                        author,
                                    ) = create_common_data(post=post)
                                    try:
                                        body = post["selftext"]
                                        body = r"{}".format(body)
                                    except KeyError:
                                        body = "nan"
                                    post_data = {
                                        "post_type": "submission",
                                        "subreddit": subreddit,
                                        "id": post_id,
                                        "parent_id": parent_id,
                                        "link_id": link_id,
                                        "url": url,
                                        "permalink": permalink,
                                        "created_utc": created_utc,
                                        "datetime": date,
                                        "score": score,
                                        "num_comments": num_comments,
                                        "title": title,
                                        "body": body,
                                        "author": author,
                                    }
                                    try:
                                        self.submissions = self.submissions.append(
                                            post_data, ignore_index=True
                                        )
                                    except KeyboardInterrupt:
                                        self.submissions = self.submissions.append(
                                            post_data, ignore_index=True
                                        )
                                        print(
                                            "Keyboard Interrupt Detected, please Interrupt again to break parent function."
                                        )
                                        break
                            # elif self.query_type == 'keyword':
                except (KeyError, json.JSONDecodeError):
                    self.errors += 1

        def search_comments(self):
            for line, file_bytes_processed in self.read_lines_zst():
                self.line_counter += 1
                if self.line_counter%1000000 == 0:
                    print('  >> Processed {} Posts, Found {} Posts'.format(
                        self.line_counter, self.post_counter)
                        )
                try:
                    post = json.loads(line)
                    if self.type == 'subreddit':
                        if int(post['created_utc']) >= int(self.after):
                            if int(post['created_utc']) <= int(self.before):
                                if post["subreddit"] == self.query:
                                    self.post_counter += 1
                                    (
                                        subreddit,
                                        post_id,
                                        parent_id,
                                        link_id,
                                        url,
                                        permalink,
                                        date,
                                        created_utc,
                                        score,
                                        num_comments,
                                        title,
                                        author,
                                    ) = create_common_data(post=post)
                                    try:
                                        body = post["body"]
                                        body = r"{}".format(body)
                                    except KeyError:
                                        body = "nan"
                                    post_data = {
                                        "post_type": "comment",
                                        "subreddit": subreddit,
                                        "id": post_id,
                                        "parent_id": parent_id,
                                        "link_id": link_id,
                                        "url": url,
                                        "permalink": permalink,
                                        "created_utc": created_utc,
                                        "datetime": date,
                                        "score": score,
                                        "num_comments": num_comments,
                                        "title": title,
                                        "body": body,
                                        "author": author,
                                    }
                                    try:
                                        self.comments = self.comments.append(
                                            post_data, ignore_index=True
                                        )
                                    except KeyboardInterrupt:
                                        self.comments = self.comments.append(
                                            post_data, ignore_index=True
                                        )
                                        print(
                                            "Keyboard Interrupt Detected, please Interrupt again to break parent function."
                                        )
                                        break
                            # elif self.query_type == 'keyword':
                except (KeyError, json.JSONDecodeError):
                    self.errors += 1
        
        def make_time_list(self):
            first = self.after_dt
            last = self.before_dt
            while first <= last:
                self.time_list.append(first.strftime("%Y-%m"))
                first += relativedelta(months=1)
            if last.strftime("%Y-%m") in self.time_list:
                pass
            else:
                self.time_list.append(last.strftime("%Y-%m"))

        make_time_list(self=self)

        all_submission_files = [submission_file for submission_file in self.submission_folder_path.iterdir()]
        if self.post_type == "comment":
            pass
        else:
            for file in all_submission_files:
                try:
                    for time in self.time_list:
                        if time in file.name:
                            self.working_file = str(file.as_posix())
                            print('> Parsing : {}'.format(file.name))
                            try:
                                search_sumissions(self=self)
                            except KeyboardInterrupt:
                                print(
                                    "Keyboard Interrupt Detected, your object's values are secure"
                                )
                                break
                            self.file_counter += 1
                            print('   >>> Total Files Parsed : {}, Total Posts Parsed : {}, Total Posts Collected : {}, Total Errors Found : {}'.format
                            (self.file_counter, self.line_counter, self.post_counter, self.errors)
                            )
                except KeyboardInterrupt:
                    print(
                        "Keyboard Interrupt Detected, your object's values are secure"
                    )
                    break

        all_comment_files = [comment_file for comment_file in self.comment_folder_path.iterdir()]
        if self.post_type == "submission":
            pass
        else:
            for file in all_comment_files:
                try:
                    for time in self.time_list:
                        if time in file.name:
                            self.working_file = str(file.as_posix())
                            print('> Parsing : {}'.format(file.name))
                            try:
                                search_comments(self=self)
                            except KeyboardInterrupt:
                                print(
                                    "Keyboard Interrupt Detected, your object's values are secure"
                                )
                                break
                            self.file_counter += 1
                            print('   >>> Total Files Parsed : {}, Total Posts Parsed : {}, Total Posts Collected : {}, Total Errors Found : {}'.format
                            (self.file_counter, self.line_counter, self.post_counter, self.errors)
                            )
                except KeyboardInterrupt:
                    print(
                        "Keyboard Interrupt Detected, your object's values are secure"
                    )
                    break

        self.df = self.submissions.append(self.comments)
    
    def export(self, path, to_export="df", export_format=".pkl"):
        """
        Easily save and export your data for future analytics.
        ----------
        paramaters
        ----------
        path: path to save output data to.
        to_export: select what data you wish to export
            'df'- all data.
            'submissions'- only submission data.
            'comments'- only comment data.
        export_format:
            '.pkl'- default, exports to pickle.
            '.csv'- export to comma seperated file. 
        """
        if to_export == "df":
            if export_format == ".pkl":
                self.df.to_pickle(path=path)
            elif export_format == ".csv":
                self.df.to_csv(path_or_buf=path)
        elif to_export == "submissions":
            if export_format == ".pkl":
                self.submissions.to_pickle(path=path)
            elif export_format == ".csv":
                self.submissions.to_csv(path_or_buf=path)
        elif to_export == "comments":
            if export_format == ".pkl":
                self.comments.to_pickle(path=path)
            elif export_format == ".csv":
                self.comments.to_csv(path_or_buf=path)

In [172]:
antivax_query = pushshift_file_query(
    query_type="subreddit",
    query="antivax",
    time_range={"before": "2019-06-26", "after": "2019-06-01"},
    time_format="datetime",
    post_type=None,
)

In [173]:
antivax_query.make_query()
antivax_query.time_list

Parsing : RC_2019-06.zst
Processed 1000000 Posts, Found 6 Posts
Processed 2000000 Posts, Found 110 Posts
Processed 3000000 Posts, Found 156 Posts
Processed 4000000 Posts, Found 191 Posts
Processed 5000000 Posts, Found 214 Posts
Processed 6000000 Posts, Found 264 Posts
Processed 7000000 Posts, Found 298 Posts
Processed 8000000 Posts, Found 339 Posts
Processed 9000000 Posts, Found 372 Posts
Processed 10000000 Posts, Found 407 Posts
Processed 11000000 Posts, Found 455 Posts
Processed 12000000 Posts, Found 500 Posts
Processed 13000000 Posts, Found 522 Posts
Processed 14000000 Posts, Found 533 Posts
Processed 15000000 Posts, Found 568 Posts
Processed 16000000 Posts, Found 590 Posts
Processed 17000000 Posts, Found 651 Posts
Processed 18000000 Posts, Found 668 Posts
Processed 19000000 Posts, Found 699 Posts
Processed 20000000 Posts, Found 747 Posts
Processed 21000000 Posts, Found 798 Posts
Processed 22000000 Posts, Found 819 Posts
Processed 23000000 Posts, Found 848 Posts
Processed 24000000 P

['2019-06', '2019-07']

### Create pushshift web query object

In [259]:
@dataclass
class pushshift_web_query(query):
    """
    Class for compiling pushshift web queries.
    ----------
    paramaters
    ----------
    query_type:
        subreddit- query provided subreddit.
        keyword- query all subreddits for provided keyword.
    query: provided subreddit or keyword.
    time_range: dictionary input {'before' : latest post time, 'after' : earliest post time}
        times can be given in unix epoch timestamp or datetime format.
    time_format:
        'unix'- defaults to unix epoch timestamp.
        'datetime'- set this option is specifing time_range in datetime format.
    post_type: selection to query for comments or submissions, defaults to both.
        'comment'- only query comments.
        'submission'- only query submission.
        defaults to query both comments and submissions.
    """

    def __init__(
        self, query_type, query, time_range, time_format="unix", post_type=None
    ):
        """
        Initilization of query object.
        """
        super().__init__(
            query_type, query, time_range, time_format, post_type
            )
        self.api_hit_counter = 0

    def update_url(self):
        """
        Helper function to update timestamp after each API request.
        """
        try:
            if self.type == "subreddit":
                self.comment_url = "https://api.pushshift.io/reddit/search/{}/?after={}&before={}&subreddit={}&size={}".format(
                    str("comment"),
                    str(self.current_time),
                    str(self.before),
                    str(self.query),
                    "12345",
                )
                self.submission_url = "https://api.pushshift.io/reddit/search/{}/?after={}&before={}&subreddit={}&size={}".format(
                    str("submission"),
                    str(self.current_time),
                    str(self.before),
                    str(self.query),
                    "12345",
                )
            elif self.type == "keyword":
                self.comment_url = "https://api.pushshift.io/reddit/search/{}/?q={}&after={}&before={}&size={}".format(
                    str("comment"),
                    str(self.query),
                    str(self.current_time),
                    str(self.before),
                    "12345",
                )
                self.submission_url = "https://api.pushshift.io/reddit/search/{}/?q={}&after={}&before={}&size={}".format(
                    str("submission"),
                    str(self.query),
                    str(self.current_time),
                    str(self.before),
                    "12345",
                )
        except KeyboardInterrupt:
            pass

    def make_query(self):
        """
        Initialize the query.
        """
        self.df = pd.DataFrame(
            columns=[
                "post_type",
                "subreddit",
                "id",
                "parent_id",
                "link_id",
                "url",
                "permalink",
                "created_utc",
                "datetime",
                "score",
                "num_comments",
                "title",
                "body",
                "author",
            ]
        )
        self.submissions = self.df.copy()
        self.comments = self.df.copy()

        def web_hit(self, url):
            """
            Helper function to make the API request.
            ----------
            paramaters
            ----------
            url: provide either self.submission_url or self.comment_url depending on post type.
            """
            try:
                self.api_hit_counter += 1
                try:
                    r = requests.get(url)
                    status = r.status_code
                    print("> http response is:", status)
                except:
                    status = "NO HANDSHAKE WITH API"
                    print(status)
                if status != 200:
                    retry = 0
                    while True:
                        retry += 1
                        print(
                            "\nAPI DECLINED REQUEST\n\n>> This is retry #:",
                            retry,
                            "<<\n",
                        )
                        time.sleep(15 * retry)
                        try:
                            r = requests.get(url)
                            status = r.status_code
                            print(">> retry http response is:", status)
                        except:
                            status = "NO HANDSHAKE WITH API"
                            print(status)
                        if status == 200:
                            break
                print(" >> Web Hit On", self.query, "# :", self.api_hit_counter)
                print(
                    "  >>> Current Post Time :",
                    str(datetime.datetime.fromtimestamp(self.current_time)),
                )
                self.web_data = json.loads(r.text, strict=False)
                time.sleep(1)
            except KeyboardInterrupt:
                pass

        def create_common_data(post):
            """
            Helper function to collect values common between both comments and submissions.
            """
            try:
                subreddit = post["subreddit"]
                post_id = post["id"]
                try:
                    parent_id = post["parent_id"]
                except KeyError:
                    parent_id = "nan"
                try:
                    link_id = post["link_id"]
                except KeyError:
                    link_id = "nan"
                try:
                    url = post["url"]
                except KeyError:
                    url = "nan"
                permalink = post["permalink"]
                created_utc = post["created_utc"]
                t = datetime.datetime.fromtimestamp(created_utc)
                date = t.strftime("%m/%d/%Y")
                score = post["score"]
                try:
                    num_comments = post["num_comments"]
                except KeyError:
                    num_comments = "nan"
                try:
                    title = post["title"]
                    title = r"{}".format(title)
                except KeyError:
                    title = "nan"
                author = post["author"]
                author = r"{}".format(author)
                return (
                    subreddit,
                    post_id,
                    parent_id,
                    link_id,
                    url,
                    permalink,
                    date,
                    created_utc,
                    score,
                    num_comments,
                    title,
                    author,
                )
            except KeyboardInterrupt:
                pass

        def save_submissions(self):
            """
            Helper function to save submissions to self.submissions.
            """
            for post in self.web_data["data"]:
                (
                    subreddit,
                    post_id,
                    parent_id,
                    link_id,
                    url,
                    permalink,
                    date,
                    created_utc,
                    score,
                    num_comments,
                    title,
                    author,
                ) = create_common_data(post=post)
                try:
                    body = post["selftext"]
                    body = r"{}".format(body)
                except KeyError:
                    body = "nan"
                post_data = {
                    "post_type": "submission",
                    "subreddit": subreddit,
                    "id": post_id,
                    "parent_id": parent_id,
                    "link_id": link_id,
                    "url": url,
                    "permalink": permalink,
                    "created_utc": created_utc,
                    "datetime": date,
                    "score": score,
                    "num_comments": num_comments,
                    "title": title,
                    "body": body,
                    "author": author,
                }
                try:
                    self.submissions = self.submissions.append(
                        post_data, ignore_index=True
                    )
                    self.current_time = created_utc
                except KeyboardInterrupt:
                    self.submissions = self.submissions.append(
                        post_data, ignore_index=True
                    )
                    self.current_time = created_utc
                    print(
                        "Keyboard Interrupt Detected, please Interrupt again to break parent function."
                    )
                    break

        def save_comments(self):
            """
            Helper function to save comments to self.comments.
            """
            for post in self.web_data["data"]:
                (
                    subreddit,
                    post_id,
                    parent_id,
                    link_id,
                    url,
                    permalink,
                    date,
                    created_utc,
                    score,
                    num_comments,
                    title,
                    author,
                ) = create_common_data(post=post)
                try:
                    body = post["body"]
                    body = r"{}".format(body)
                except KeyError:
                    body = "nan"
                post_data = {
                    "post_type": "comment",
                    "subreddit": subreddit,
                    "id": post_id,
                    "parent_id": parent_id,
                    "link_id": link_id,
                    "url": url,
                    "permalink": permalink,
                    "created_utc": created_utc,
                    "datetime": date,
                    "score": score,
                    "num_comments": num_comments,
                    "title": title,
                    "body": body,
                    "author": author,
                }
                try:
                    self.comments = self.comments.append(post_data, ignore_index=True)
                    self.current_time = created_utc
                except KeyboardInterrupt:
                    self.submissions = self.submissions.append(
                        post_data, ignore_index=True
                    )
                    self.current_time = created_utc
                    print(
                        "Keyboard Interrupt Detected, please Interrupt again to break parent function."
                    )
                    break

        def collect_submissions(self):
            """
            Master function to chain previous helper functions and collect the requested data for submissions.
            """
            self.current_time = self.after
            if self.post_type == "comment":
                pass
            else:
                while self.current_time < self.before:
                    self.update_url()
                    web_hit(self=self, url=self.submission_url)
                    # print(self.web_data)
                    if len(self.web_data["data"]) == 0:
                        break
                    else:
                        try:
                            save_submissions(self=self)
                        except KeyboardInterrupt:
                            print(
                                "Keyboard Interrupt Detected, your object's values are secure"
                            )
                            break

        def collect_comments(self):
            """
            Master function to chain previous helper functions and collect the requested data for comments.
            """
            self.current_time = self.after
            if self.post_type == "submission":
                pass
            else:
                while self.current_time < self.before:
                    self.update_url()
                    web_hit(self=self, url=self.comment_url)
                    if len(self.web_data["data"]) == 0:
                        break
                    else:
                        try:
                            save_comments(self=self)
                        except KeyboardInterrupt:
                            print(
                                "Keyboard Interrupt Detected, your object's values are secure"
                            )
                            break

        collect_submissions(self=self)
        collect_comments(self=self)

        self.df = self.submissions.append(self.comments)

    def export(self, path, to_export="df", export_format=".pkl"):
        """
        Easily save and export your data for future analytics.
        ----------
        paramaters
        ----------
        path: path to save output data to.
        to_export: select what data you wish to export
            'df'- all data.
            'submissions'- only submission data.
            'comments'- only comment data.
        export_format:
            '.pkl'- default, exports to pickle.
            '.csv'- export to comma seperated file. 
        """
        if to_export == "df":
            if export_format == ".pkl":
                self.df.to_pickle(path=path)
            elif export_format == ".csv":
                self.df.to_csv(path_or_buf=path)
        elif to_export == "submissions":
            if export_format == ".pkl":
                self.submissions.to_pickle(path=path)
            elif export_format == ".csv":
                self.submissions.to_csv(path_or_buf=path)
        elif to_export == "comments":
            if export_format == ".pkl":
                self.comments.to_pickle(path=path)
            elif export_format == ".csv":
                self.comments.to_csv(path_or_buf=path)

#### PushShift Web Query Examples

##### PushShift Web Query using unix epoch time

In [260]:
conspiracy_query = pushshift_web_query(
    query_type="subreddit",
    query="conspiracy",
    time_range={"before": "1609631999", "after": "1609462861"},
)

##### PushShift Web Query using datetime

In [261]:
antivax_query = pushshift_web_query(
    query_type="subreddit",
    query="antivax",
    time_range={"before": "2021-01-02", "after": "2021-01-01"},
    time_format="datetime",
    post_type=None,
)

##### Making the query request

Note, this will take some time to collect the API request, timing will depend on specified time_range

In [262]:
conspiracy_query.make_query()
antivax_query.make_query()

> http response is: 200
 >> Web Hit On conspiracy # : 1
  >>> Current Post Time : 2020-12-31 20:01:01
> http response is: 200
 >> Web Hit On conspiracy # : 2
  >>> Current Post Time : 2021-01-01 00:26:19
> http response is: 200
 >> Web Hit On conspiracy # : 3
  >>> Current Post Time : 2021-01-01 06:30:31
> http response is: 200
 >> Web Hit On conspiracy # : 4
  >>> Current Post Time : 2021-01-01 12:41:33
> http response is: 200
 >> Web Hit On conspiracy # : 5
  >>> Current Post Time : 2021-01-01 16:07:21
> http response is: 200
 >> Web Hit On conspiracy # : 6
  >>> Current Post Time : 2021-01-01 19:40:04
> http response is: 200
 >> Web Hit On conspiracy # : 7
  >>> Current Post Time : 2021-01-01 23:49:27
> http response is: 200
 >> Web Hit On conspiracy # : 8
  >>> Current Post Time : 2021-01-02 04:53:43
> http response is: 200
 >> Web Hit On conspiracy # : 9
  >>> Current Post Time : 2021-01-02 11:14:30
> http response is: 200
 >> Web Hit On conspiracy # : 10
  >>> Current Post Time :

##### Our queries return dataframes

In [32]:
conspiracy_query.df.head(3)

Unnamed: 0,post_type,subreddit,id,parent_id,link_id,url,permalink,created_utc,datetime,score,num_comments,title,body,author
0,submission,conspiracy,ko20vl,,,https://www.reddit.com/r/conspiracy/comments/k...,/r/conspiracy/comments/ko20vl/i_want_to_know_w...,1609462898,12/31/2020,1,39,I want to know what’s going on,I’ve listened to all 3 Joe Rohan and Alex Jone...,Bmille3
1,submission,conspiracy,ko21bc,,,https://www.reddit.com/r/conspiracy/comments/k...,/r/conspiracy/comments/ko21bc/why_the_world_ha...,1609462943,12/31/2020,1,37,Why the world has not ended yet?,"So many people are aware of 666, I don’t under...",yotta_e
2,submission,conspiracy,ko22uq,,,https://i.redd.it/ub4tr6ykem861.jpg,/r/conspiracy/comments/ko22uq/mf_dooms_last_po...,1609463098,12/31/2020,1,14,"MF Dooms last post on IG, #33 #freemasonryobse...",,Vegannibba


In [33]:
antivax_query.comments.head(3)

Unnamed: 0,post_type,subreddit,id,parent_id,link_id,url,permalink,created_utc,datetime,score,num_comments,title,body,author
0,comment,antivax,ghohvb4,t3_knu1v8,t3_knu1v8,,/r/antivax/comments/knu1v8/you_too_are_excited...,1609479929,01/01/2021,1,,,What,ajcabelera
1,comment,antivax,ghokmmf,t3_knpv4r,t3_knpv4r,,/r/antivax/comments/knpv4r/mmr_vs_measles/ghok...,1609482195,01/01/2021,1,,,"Why not use ""dead"" vaccine to reduce risk? Why...",MichaelAChristian
2,comment,antivax,ghomglv,t3_knpv4r,t3_knpv4r,,/r/antivax/comments/knpv4r/mmr_vs_measles/ghom...,1609483787,01/01/2021,1,,,&gt;Here are the serious side effects of MMR \...,slip63


### Create community object

In [407]:
@dataclass
class community:
    def __init__(self, name='community', path=None, dataframe=None, columns=None, file_format=None):
        """
        Initilization of object, created DataFrame for provided community.
        ----------
        paramaters
        ----------
        path: path to file location.
        datafram: pass a corresponding community dataframe.
        columns: selected colums to read, only applicable for .csv.
        file_format: defaults to "None" when passed a pandas dataframe.
            "csv"- for passing DataFrame stored as csv.
            "pkl"- for passing a pickled DataFrame.
        """
        self.name = name
        if path == None:
            self.df = dataframe
        elif file_format == "csv":
            self.df = pd.read_csv(
                filepath_or_buffer=path, 
                usecols=columns, 
                low_memory=False
            )
        elif file_format == "pkl":
            self.df = pd.read_pickle(filepath_or_buffer=path)
        submission_mask = self.df['post_type'] == 'submission'
        comment_mask = self.df['post_type'] == 'comment'
        self.submissions = self.df[submission_mask]
        self.comments = self.df[comment_mask]

    def make_urls(self, column=None, post_type=None):
        """
        Qurey posts for url embeddings in posts.
        ----------
        paramaters
        ----------
        column: DataFrame column to query on, body or title.
        post_type: option to restrict posts to only comments or submissions.
        Note* submissions usuallly contain a seperate embedded url field.
        """
        url_pattern = re.compile("((www\.[^\s]+)|(https://[^\s]+))")

        def find_urls(frame):
            mask = frame[column].str.match(url_pattern, na=False)
            self.url_df = frame[mask]
            self.urls = pd.DataFrame(
                self.url_df[column].str.extract(url_pattern)[0].rename("url")
            )

        if post_type == None:
            find_urls(frame=self.df)
        elif post_type == "comment":
            comment_mask = self.df["post_type"] == "comment"
            find_urls(frame=self.df[comment_mask])
        elif post_type == "submission":
            submission_mask = self.df["post_type"] == "submission"
            find_urls(frame=self.df[submission_mask])

    def make_authors(self):
        indx = self.df['author'].unique()
        self.authors = pd.DataFrame(
            columns=[
                "total_submissions",
                "total_submission_score",
                "total_submission_comments",
                "total_comments",
                "total_comment_score",
                "total_posts",
                "total_post_score"
            ],
            index=indx
        )
        type_mask = self.df['post_type'] == 'submission'
        total_submissions = self.df[type_mask].groupby('author').size()
        self.authors['total_submissions'] = total_submissions
        total_submission_score = self.df[type_mask].groupby('author')['score'].sum()
        self.authors['total_submission_score'] = total_submission_score
        total_submission_comments = self.df[type_mask].groupby('author')['num_comments'].sum()
        self.authors['total_submission_comments'] = total_submission_comments
        total_comments = self.df[~type_mask].groupby('author').size()
        self.authors['total_comments'] = total_comments
        total_comment_score = self.df[~type_mask].groupby('author')['score'].sum()
        self.authors['total_comment_score'] = total_comment_score
        self.authors = self.authors.apply(lambda x: x.fillna(0), axis=1)
        self.authors['total_posts'] = self.authors['total_submissions'] + self.authors['total_comments'] 
        self.authors['total_post_score'] = self.authors['total_submission_score'] + self.authors['total_comment_score']
        self.authors.index.name = 'author'

    def compare_authors(self, community):
        """
        Perform outer and inner joins on the two community authors. 
        returns a tuple containing: (outer join DataFrame, inner join DataFrame)
        """
        outer = pd.merge(
            self.authors[['total_submissions', 'total_comments', 'total_posts']], 
            community.authors[['total_submissions', 'total_comments', 'total_posts']], 
            how='outer', 
            left_index=True, 
            right_index=True, 
            suffixes=('_'+self.name, '_'+community.name)
            ).fillna(0)
        inner = pd.merge(
            self.authors[['total_submissions', 'total_comments', 'total_posts']], 
            community.authors[['total_submissions', 'total_comments', 'total_posts']], 
            how='inner', 
            left_index=True, 
            right_index=True, 
            suffixes=('_'+self.name, '_'+community.name)
            ).fillna(0)
        return outer, inner
        

    # def compare_with(self, community, on, post_type=None):
    #     '''
    #     Compare two community objects.
    #     ----------
    #     paramaters
    #     ----------
    #     community: community object to be compared to.
    #     on: column to compare on.
    #     post_type: option to compare selected post types, defaults to both comments and submissions.
    #         "comment"- only compare comments.
    #         "submission"- only compare submissions.
    #     '''
    #     if post_type == None:
    #         self.df.join(community.df, on=['subreddit', on], how='outer')
    #     elif post_type == 'submission':
    #         self.submissions.join(community.submissions, on=['subreddit', on], how='outer')
    #     elif post_type == 'comment':
    #         self.comments.join(community.comments, on=['subreddit', on], how='outer')


##### We can take our queried datafdrames and create a community object

In [408]:
conspiracy = community(name='conspiracy', dataframe=conspiracy_query.df)
antivax = community(name='antivax', dataframe=antivax_query.df)

In [409]:
conspiracy.make_authors()
antivax.make_authors()
conspiracy.authors

Unnamed: 0_level_0,total_submissions,total_submission_score,total_submission_comments,total_comments,total_comment_score,total_posts,total_post_score
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bmille3,2.0,2.0,86.0,13.0,13.0,15.0,15.0
yotta_e,1.0,1.0,37.0,10.0,10.0,11.0,11.0
Vegannibba,2.0,2.0,47.0,3.0,3.0,5.0,5.0
hiasfukit,1.0,1.0,2.0,2.0,2.0,3.0,3.0
WheniamHigh,2.0,2.0,6.0,1.0,1.0,3.0,3.0
...,...,...,...,...,...,...,...
Lupusvorax,0.0,0.0,0.0,3.0,3.0,3.0,3.0
ghost_of_mr_chicken,0.0,0.0,0.0,1.0,1.0,1.0,1.0
blurtard,0.0,0.0,0.0,1.0,1.0,1.0,1.0
bocasdt,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [411]:
outer, inner = conspiracy.compare_authors(antivax)
outer
# inner

Unnamed: 0_level_0,total_submissions_conspiracy,total_comments_conspiracy,total_posts_conspiracy,total_submissions_antivax,total_comments_antivax,total_posts_antivax
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
---Seraphim---,0.0,1.0,1.0,0.0,0.0,0.0
--Gem,0.0,1.0,1.0,0.0,0.0,0.0
--dontmindme--,0.0,2.0,2.0,0.0,0.0,0.0
-5x-,0.0,2.0,2.0,0.0,0.0,0.0
-80watt-,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
zoso418,1.0,0.0,1.0,0.0,0.0,0.0
zuzuofthewolves,0.0,14.0,14.0,0.0,0.0,0.0
zx12y,0.0,1.0,1.0,0.0,0.0,0.0
zzjient,3.0,0.0,3.0,0.0,0.0,0.0


In [412]:
inner.sum() / outer.sum()


total_submissions_conspiracy    0.002896
total_comments_conspiracy       0.054312
total_posts_conspiracy          0.052711
total_submissions_antivax       0.000000
total_comments_antivax          0.101266
total_posts_antivax             0.096386
dtype: float64

##### Example finding urls embedded in comments

In [45]:
conspiracy.make_urls(column='body', post_type='comment')
conspiracy.url_df

Unnamed: 0,post_type,subreddit,id,parent_id,link_id,url,permalink,created_utc,datetime,score,num_comments,title,body,author
70,comment,conspiracy,ghntqbx,t1_ghj2v5e,t3_kmr3wr,,/r/conspiracy/comments/kmr3wr/the_senate_finan...,1609463302,12/31/2020,1,,,https://en.wikipedia.org/wiki/Whataboutism,pinkerton--
131,comment,conspiracy,ghnupw4,t1_ghnp7h1,t3_knglp7,,/r/conspiracy/comments/knglp7/most_of_the_mate...,1609463895,12/31/2020,1,,,https://www.reuters.com/article/uk-factcheck-h...,Jonisonice
183,comment,conspiracy,ghnvb41,t1_ghnv2j1,t3_ko289s,,/r/conspiracy/comments/ko289s/wuhan_china_news...,1609464248,12/31/2020,1,,,https://www.google.com/amp/s/www.dailymail.co....,thatsgreatbruv
193,comment,conspiracy,ghnvf63,t1_ghnjf4j,t3_knu02c,,/r/conspiracy/comments/knu02c/audit_the_dead_c...,1609464317,12/31/2020,1,,,https://www.usatoday.com/story/news/factcheck/...,Bond4141
261,comment,conspiracy,ghnwaui,t1_ghnp5ou,t3_kntlek,,/r/conspiracy/comments/kntlek/of_course_wherev...,1609464834,12/31/2020,1,,,https://www.buzzfeednews.com/amphtml/rosiegray...,Annoyingly_Liberal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22403,comment,conspiracy,ghv2jbd,t1_ghtyaq1,t3_kotrqb,,/r/conspiracy/comments/kotrqb/pence_was_on_af2...,1609621482,01/02/2021,1,,,https://archive.org/details/AntiCommunitarianM...,fraxurdfuture
22517,comment,conspiracy,ghv3kr2,t1_ghuz60w,t3_kp3ypv,,/r/conspiracy/comments/kp3ypv/birds_lay_dead_i...,1609622027,01/02/2021,1,,,https://www.google.com/amp/s/www.forbes.com/si...,jamesko1989
23007,comment,conspiracy,ghv91o1,t1_ghv7v20,t3_kp5lxl,,/r/conspiracy/comments/kp5lxl/what_happens_whe...,1609624865,01/02/2021,1,,,https://www.google.com/amp/s/constitutioncente...,wileydickgoo
23167,comment,conspiracy,ghvar5a,t1_ghv5bm4,t3_kp19z7,,/r/conspiracy/comments/kp19z7/wheres_my_money_...,1609625757,01/02/2021,1,,,https://youtu.be/sY2Y-L5cvcA\n\nThere definite...,ZachElmurry


In [48]:
conspiracy.urls

Unnamed: 0,url
70,https://en.wikipedia.org/wiki/Whataboutism
131,https://www.reuters.com/article/uk-factcheck-h...
183,https://www.google.com/amp/s/www.dailymail.co....
193,https://www.usatoday.com/story/news/factcheck/...
261,https://www.buzzfeednews.com/amphtml/rosiegray...
...,...
22403,https://archive.org/details/AntiCommunitarianM...
22517,https://www.google.com/amp/s/www.forbes.com/si...
23007,https://www.google.com/amp/s/constitutioncente...
23167,https://youtu.be/sY2Y-L5cvcA


In [49]:
antivax.make_urls(column='body', post_type='comment')
antivax.urls

Unnamed: 0,url
49,https://images.app.goo.gl/4qXJtymeiSb79QnR8


### Create subreddit reference object

In [5]:
@dataclass
class subreddits:
    """
    Class for making queries based on subreddit type.
    The resulting DataFrame is meant to be used as a selection matrix based on subreddit.
    """

    def __init__(self, path=None, file_format=None):
        """
        Initilization of object, reads in subreddit list and adds datetime column.
        ----------
        paramaters
        ----------
        path: location of subredded statistics.
        file_format: formate to be specified if path is given.
            "csv"- if provided path points to .csv file.
            "pkl"- if provided path points to pickled DataFrame.
        """
        if path == None:
            self.master = pd.read_csv(
                "F:\Research\Funded\Ethical_Reccomendations\Python\Data\Docs\subreddit_list.csv"
            )
        else:
            if file_format == "csv":
                self.master = pd.read_csv(path)
            elif file_format == "pkl":
                self.master = pd.read_pickle(path)
        self.master["Creation_DateTime"] = [
            datetime.datetime.fromtimestamp(int(utc))
            for utc in self.master["Creation_UTC"]
        ]

    def split_nsfw(self):
        """
        Create attributes containig masked DataFrames for Not Safe / Safe For Work subreddits.
        """
        nsfw_mask = self.master["NSFW_BOOL"] == True
        self.nsfw = self.master[nsfw_mask]
        self.sfw = self.master[~nsfw_mask]

    def split_size(self, min_subscribers=0, max_subscribers=9999999999):
        """
        Create attribute containing subreddits within a specified subscriber range.
        ----------
        paramaters
        ----------
        min_subscribers: minimum number of allowed subscribers to a subreddit.
        max_subscribers: maximum number of allowed subscribers to a subreddit.
        """
        min_size_mask = self.master["#_Subscribers"] >= min_subscribers
        max_size_mask = self.master["#_Subscribers"] <= max_subscribers
        self.sized = self.master[min_size_mask & max_size_mask]

    def split_creation_time_unix(
        self, min_unix_timestamp=0000000000, max_unix_timestamp=9999999999
    ):
        """
        Create attribute containing subreddits created within a specified time range.
        ----------
        paramaters
        ----------
        min_unix_timestamp: earliest allowed date of creation in unix ephoch timestamp.
        max_unix_timestamp: latest allowed date of creation in unix ephoch timestamp.
        """
        min_unix_time_mask = self.master["Creation_UTC"] >= min_unix_timestamp
        max_unix_time_mask = self.master["Creation_UTC"] <= max_unix_timestamp
        self.sized = self.master[min_unix_time_mask & max_unix_time_mask]

    def split_creation_time_date(
        self, min_datetime="2000-01-01", max_datetime="2022-02-02"
    ):
        """
        Create attribute containing subreddits created within a specified time range.
        ----------
        paramaters
        ----------
        min_datetime: earliest allowed date of creation in datetime format.
        max_datetime: latest allowed date of creation in datetime format.
        """
        min_date_time_mask = self.master["Creation_DateTime"] >= min_datetime
        max_date_time_mask = self.master["Creation_DateTime"] <= max_datetime
        self.sized = self.master[min_date_time_mask & max_date_time_mask]

    def split_multi(self, nsfw=None, sizes=None, unix_times=None, date_times=None):
        """
        Query based subreddit selction based on NSFW status, subscriber count, and creation time.
        ----------
        paramaters
        ----------
        nsfw: True or False
        sizes: dictionary input {'min_subscribers' : minimum_value, 'max_subscribers' : maximum_value}
        unix_times: dictionary input {'min_unix_timestamp' : minimum_timestamp, 'max_unix_timestamp' : maximum_timestamp}
        date_times: dictionary input {'min_datetime' : 'minimum_datetime', 'max_datetime' : 'maximum_datetime}
        """
        if nsfw == None:
            nsfw_mask = [True for _ in self.master.index]
        else:
            if nsfw == True:
                nsfw_mask = self.master["NSFW_BOOL"] == True
            elif nsfw == False:
                nsfw_mask = self.master["NSFW_BOOL"] == False
        if sizes == None:
            min_size_mask = [True for _ in self.master.index]
            max_size_mask = [True for _ in self.master.index]
        else:
            min_size_mask = self.master["#_Subscribers"] >= sizes["min_subscribers"]
            max_size_mask = self.master["#_Subscribers"] <= sizes["max_subscribers"]
        if unix_times == None:
            min_unix_time_mask = [True for _ in self.master.index]
            max_unix_time_mask = [True for _ in self.master.index]
        else:
            min_unix_time_mask = (
                self.master["Creation_UTC"] >= unix_times["min_unix_timestamp"]
            )
            max_unix_time_mask = (
                self.master["Creation_UTC"] <= unix_times["max_unix_timestamp"]
            )
        if date_times == None:
            min_date_time_mask = [True for _ in self.master.index]
            max_date_time_mask = [True for _ in self.master.index]
        else:
            min_date_time_mask = (
                self.master["Creation_DateTime"] >= date_times["min_datetime"]
            )
            max_date_time_mask = (
                self.master["Creation_DateTime"] <= date_times["max_datetime"]
            )

        self.multi = self.master[
            nsfw_mask
            & min_size_mask
            & max_size_mask
            & min_unix_time_mask
            & max_unix_time_mask
            & min_date_time_mask
            & max_date_time_mask
        ]

In [12]:
subreddit_info = subreddits()
size = {"min_subscribers": 100000, "max_subscribers": 9999999999}
# unix_time = {"min_unix_timestamp": 1111111111, "max_unix_timestamp": 9999999999}
date_time = {"min_datetime": "2020-02-02", "max_datetime": "2021-01-01"}
subreddit_info.split_multi(nsfw=False, sizes=size, date_times=date_time)
subreddit_info.multi

Unnamed: 0,Subreddit,#_Subscribers,Creation_UTC,NSFW_BOOL,Creation_DateTime
650,Wallstreetbetsnew,830203,1584312000.0,False,2020-03-15 18:34:59
1066,Crypto traders with diamond hands 💎🙌,507399,1582466000.0,False,2020-02-23 09:00:27
1094,byebyejob,493627,1591489000.0,False,2020-06-06 20:20:38
1224,SHIBArmy,444642,1596479000.0,False,2020-08-03 14:27:30
1268,WallStreetbetsELITE,430701,1584912000.0,False,2020-03-22 17:15:30
1309,Awarded… posthumously.,418724,1600648000.0,False,2020-09-20 20:24:02
1351,r/GoodAnimemes,402648,1596521000.0,False,2020-08-04 01:59:49
1560,COVID-19,353131,1581434000.0,False,2020-02-11 10:16:21
1601,Distant Socializing,345052,1584321000.0,False,2020-03-15 21:02:18
1810,Anime Titties,308258,1588832000.0,False,2020-05-07 02:13:07
