## Setup

In [4]:
import pandas as pd

In [29]:
from psaw import PushshiftAPI

api = PushshiftAPI()

import datetime as dt
import time


## Collect Dataset

In [109]:
#Takes generator of posts and adds them to "data" dictionary
#NOTE: make sure to globally define data if converted to script

def add_posts(posts, dictionary):
    for post in posts:
        try: # makes sure no fields are null before adding to dictionary
            ID = post.id
            title = post.title
            author = post.author
            subreddit = post.subreddit
            subreddit_subscribers = post.subreddit_subscribers
            num_comments = post.num_comments
            upvote_ratio = post.upvote_ratio
            selftext = post.selftext
        except AttributeError:
            continue
        dictionary["id"].append(ID)
        dictionary["title"].append(title)
        dictionary["author"].append(author)
        dictionary["subreddit"].append(subreddit)
        dictionary["subreddit_subscribers"].append(subreddit_subscribers)
        dictionary["num_comments"].append(num_comments)
        dictionary["upvote_ratio"].append(upvote_ratio)                            
        dictionary["selftext"].append(selftext)
    return dictionary
        

In [156]:
def collect_posts(subreddits: list, monthRange: tuple, dayRange: tuple):
    
    """
    Collects posts over a given time range in a given number of subreddits and returns a list of dictionaries containing the results.

    Parameters
    ----------
    subreddits: list
        List of strings, each specifying a subreddits to query
    monthRange: tuple
        Tuple of two integers corresponding to first and last month to query. Use integers 1 through 12.
    dayRange
        Tuple of two integers corresponding to first and last day per month to query. Use integers 1 through 31.

    Returns
    -------
    list of dict
        Each dictionary corresponds to queries of a given subreddit

    """
    
    dictionaries = list()
    for s in subreddits:
        d = {"id": [], "title" : [], "author" : [], "subreddit": [], "subreddit_subscribers" : [], "num_comments": [], "upvote_ratio": [], "selftext" : []}
        for month in range(monthRange[0],monthRange[1]+1):
            for i in range(dayRange[0],dayRange[1]+1):
                try:
                    start_epoch = int(dt.datetime(2021, month, i).timestamp())
                    end_epoch = int(dt.datetime (2021, month, i+1).timestamp())
                except ValueError: # occurs when trying to access Feb 30, April 31, etc.
                    break
                posts = api.search_submissions(q = '"climate change"|"global warming"',
                after = start_epoch,
                before = end_epoch,
                subreddit = s,
                limit = 100)
                
                
                d = add_posts(posts, d)
        dictionaries.append(d)
    return dictionaries

In [157]:
ds = collect_posts(["news", "science", "askreddit", "politics", "technology"], (1,12), (1,31))



In [162]:
len(ds[4]["subreddit"])

99

## Troubleshooting section

In [78]:
terms = ["climate change","global warming""]

In [202]:
# This code is used to count keywords appearing in title vs selftext

start_epoch = int(dt.datetime(2021, 1, 4).timestamp())
end_epoch = int(dt.datetime (2021, 1, 5).timestamp())

posts = api.search_submissions(q = '"climate change"|"global warming"',
after = start_epoch,
before = end_epoch,
limit = 100
)

title = 0
selftext = 0
either = 0
total = 0
for post in posts:
    total += 1
    for term in terms:
        if (term in post.title.lower()) or (term in post.selftext.lower()):
            either += 1
            if (term in post.title.lower()):
                title += 1
            if (term in post.selftext.lower()):
                selftext += 1
        break
    # else:
    #     print(post)
    #     break
print(f"total: {total}\tEither: {either}\ttitle: {title}\tselftext: {selftext}")

total: 100	Either: 98	title: 56	selftext: 45


In [219]:
#This code is used to collect 100 posts from a single day (makes it easy to troubleshoot)

start_epoch = int(dt.datetime(2021, 1, 4).timestamp())
end_epoch = int(dt.datetime (2021, 1, 5).timestamp())

posts = api.search_submissions(q = '"climate change"|"forest fire"|"greenhouse gas"',
after = start_epoch,
before = end_epoch,
limit = 100
)

In [220]:
p = next(posts) #gets individual post

In [221]:
p.title #access attribute of a post

"Why haven't we engineered a bacteria that essentially eats CO2 and sequesters it very efficiently to combat climate change? Desperate for this discussion!"

In [222]:
#counts number of keywords contained in posts

cc = 0
ff = 0
gg = 0
total = 0
for post in posts:
    total += 1
    if ("climate change" in post.title.lower()) or ("climate change" in post.selftext.lower()):
        cc += 1
    elif ("forest fire" in post.title) or ("forest fire" in post.selftext):
        ff += 1
    elif ("greenhouse gas" in post.title) or ("greenhouse gas" in post.selftext):
        ff += 1
    # else:
    #     print(post)
    #     break
print(f"FF: {ff}\tGG: {gg}\tCC: {cc}\tTotal: {total}")

FF: 5	GG: 0	CC: 91	Total: 99


In [414]:
#There are a lot of attributes in a post. I found these ones helpful

#Title
#subreddit
#author
#media_only
#subreddit_subscribers (number)
#id
#num_comments

## Converting to usable files

In [99]:
df = pd.DataFrame.from_dict(data) #load dictionary into dataframe

In [78]:
# remove empty posts
df = df.loc[ ((~(df["selftext"] == '')) & (~(df["selftext"] == '[removed]')) & (~(df["selftext"] == '[deleted]')) & (~(df["title"] == '')) & (~(df["title"] == '')))]

In [91]:
len(df["title"])

150

In [100]:
df.head(30)

Unnamed: 0,id,title,author,subreddit,subreddit_subscribers,num_comments,upvote_ratio,selftext
0,kpfhjc,https://ift.tt/rBO82H 2021 could be turning po...,Mubashar110,news,22533936,0,1.0,
1,la131z,Evaluation | Biden sweeps away Trump’s climate...,newsnationglobal,news,22789503,0,1.0,
2,la0bed,How Climate Change May Affect Your Health,corealphanews,news,22789337,0,1.0,
3,lb144x,CAMPAIGN 2021: Climate change rises as an issu...,[deleted],news,22795253,0,0.5,[deleted]
4,laye0s,Spinach taught how to send emails in MIT study...,goodwoodenship,news,22794941,0,1.0,
5,laydbj,Spinach taught how to send emails in MIT study...,goodwoodenship,news,22794938,0,1.0,
6,lc79ym,Scientists Taught Spinach How to Send Emails t...,Alaskan_Lost,news,22801350,14,0.78,
7,lwdzv4,Pope Francis has warned that mankind is facing...,BSA3279,news,22904368,0,1.0,
8,lwdxl8,Pope Francis has warned that mankind is facing...,BSA3279,news,22904360,0,1.0,
9,lwdvmi,Pope Francis has warned that mankind is facing...,BSA3279,news,22904348,0,1.0,


In [61]:
raw = df[["subreddit", "subreddit_subscribers"]]
raw = raw.set_index(["subreddit"])

In [129]:
counts = df["subreddit"].value_counts().head(10)
counts

AskReddit           969
AutoNewspaper       868
environment         716
collapse            647
climatechange       614
climatedisalarm     587
unpopularopinion    545
climate             540
climateskeptics     539
newsbotbot          448
Name: subreddit, dtype: int64

In [93]:
# pd.merge(counts, raw, how="right", on=["subreddit"])
# raw.loc[raw["subreddit"] == "CryptoMars"]
# raw.loc[raw.index == "autotldr"]
pd.unique(raw.index)

array(['Gangstalking', 'ElizabethWarren', 'SFr4r', ..., 'u_nilz2977',
       'u_Coyoteaus', 'TVWriting'], dtype=object)

In [108]:
df2 = df.loc[df["subreddit_subscribers"] > 400000]
df2["subreddit"].value_counts().head(10)

unpopularopinion     88
NoStupidQuestions    70
conspiracy           64
teenagers            60
environment          34
changemyview         32
TrueOffMyChest       28
offmychest           27
childfree            27
r4r                  24
Name: subreddit, dtype: int64

In [127]:
counts

Unnamed: 0,subreddit
autotldr,428
unpopularopinion,410
climatechange,307
conspiracy,307
NoStupidQuestions,305
collapse,271
teenagers,259
CryptoMoonShots,233
dirtypenpals,160
CryptoMoon,155


In [None]:
'"climate emergency"|"greenhouse gas"|"climate justice"|"climate action"|"carbon offset"|"carbon capture"|"carbon emissions"|"carbon dioxide emissions"|"co2 emissions"|"carbon offsets"'

In [126]:
counts = counts.to_frame()

In [100]:
x = raw.drop_duplicates(raw.index)
x.loc[x.index == "climate"]

KeyError: Index(['1000xCoins', '23andmeforums', '5BJ7919UCLGJWICUWJLTA', 'AARP_Politics',
       'ABA', 'ABDLPersonals', 'ABoringDystopia', 'ADHD',
       'AI4CivilEngineering', 'AMCSTOCKS',
       ...
       'wyomingdoesntexist', 'xHumanity', 'xboxone', 'xco2', 'xrmed',
       'youtubers', 'yucreat', 'zelda', 'zerocarb', 'zizek'],
      dtype='object', length=1525)

In [101]:
#write to dataframe
with open("sample_reddit_COMP400_news.csv", "wt") as f:
    df.to_csv(f)

In [163]:
subs = ["news", "science", "askreddit", "politics", "technology"]
for i in range(len(ds)):
    df = pd.DataFrame.from_dict(ds[i])
    with open(f"{subs[i]}.csv", "wt") as f:
        df.to_csv(f)

In [232]:
#This lets you check for a single word whether it's in selftext. Just for exploratory purposes

df.loc[df["selftext"].str.contains("forest fire")]