In [1]:
import requests
import time
import pandas as pd

import datetime as dt

In [2]:
def scrape_pushshift(subreddit, content, start_date, end_date, search_range):
    """Collect Pushshift search results for one subreddit, one time block per request.

    The span ``start_date``..``end_date`` (inclusive) is split into blocks of
    ``search_range`` hours.  Blocks are requested newest-first, with a
    one-second pause between requests, and every record returned under the
    response's ``'data'`` key is appended to a single list.

    The block boundaries are sent as relative hour offsets (``before=<n>h``,
    ``after=<m>h``) starting at ``(today - end_date)`` days.
    NOTE(review): the offsets are presumably interpreted relative to the
    moment of each request, not to midnight - confirm against the API docs.
    NOTE(review): each request asks for ``size=500``; a block holding more
    records than that is presumably truncated - confirm.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed as the ``subreddit`` query parameter.
    content : str
        Path segment of the search endpoint (the notebook uses 'comment').
    start_date, end_date : datetime.date
        First and last day to cover.
    search_range : int
        Width of one block in hours; must be a positive divisor of 24.

    Returns
    -------
    list of dict, or None
        The accumulated records, or ``None`` (after printing a message) when
        ``search_range`` is not a positive divisor of 24.
    """
    # Reject 0 and negatives before the modulo: 0 would raise ZeroDivisionError.
    if search_range <= 0 or 24 % search_range != 0:
        print('not a valid search_range')
        return None

    # setting up initial conditions
    n_blocks = ((end_date - start_date).days + 1) * (24 // search_range)
    before = (dt.date.today() - end_date).days * 24
    after = before + search_range
    completed_hours = 0
    completed_days = 0

    endpoint = f'https://api.pushshift.io/reddit/search/{content}/'
    dict_list = []

    for _ in range(n_blocks):
        # Let requests build and percent-encode the query string.
        params = {
            'subreddit': subreddit,
            'size': 500,
            'before': f'{before}h',
            'after': f'{after}h',
        }
        res = requests.get(endpoint, params=params)

        # Retry until the server answers 200, pausing before each new attempt.
        while res.status_code != 200:
            print(res.status_code)
            time.sleep(1)
            res = requests.get(endpoint, params=params)

        the_json = res.json()
        dict_list.extend(the_json['data'])

        # Step one block further into the past.
        before += search_range
        after += search_range
        completed_hours += search_range

        # Progress message every 240 scraped hours (10 days).
        if completed_hours % 240 == 0:
            completed_days += 10
            print(f'Days Complete: {completed_days}')
        time.sleep(1)

    return dict_list

In [3]:
def scrape_to_dataframe(data):
    """Turn a list of Pushshift records into a tidy DataFrame.

    Parameters
    ----------
    data : iterable of dict
        Records that each provide the keys 'subreddit', 'body', 'author',
        'score' and 'created_utc'.  Any other keys are ignored; a missing
        key raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns 'subreddit', 'body', 'author', 'score' and 'date', where
        'date' holds ``datetime.date`` objects.  An empty input yields an
        empty frame that still has these five columns.
    """
    # output column -> key in the raw record
    field_map = {
        'subreddit': 'subreddit',
        'body': 'body',
        'author': 'author',
        'score': 'score',
        'date': 'created_utc',
    }
    posts = [{col: dct[key] for col, key in field_map.items()} for dct in data]

    # Passing the columns explicitly keeps the schema stable for empty input.
    df = pd.DataFrame(posts, columns=list(field_map))
    if df.empty:
        return df

    # 'created_utc' is an epoch timestamp; convert it in UTC so the calendar
    # date does not depend on the timezone of the machine running the notebook.
    df['date'] = pd.to_datetime(df['date'], unit='s', utc=True).dt.date

    return df

In [4]:
# Scrape the_donald comments for 1-30 November 2018, two 12-hour blocks per day.
donald_comments_scrape = scrape_pushshift(
    subreddit='the_donald',
    content='comment',
    start_date=dt.date(2018, 11, 1),
    end_date=dt.date(2018, 11, 30),
    search_range=12,
)

Days Complete: 10
Days Complete: 20
Days Complete: 30


In [5]:
# Flatten the raw the_donald records into a DataFrame.
donald_comments_data = scrape_to_dataframe(donald_comments_scrape)

In [6]:
# Preview the first rows of the the_donald frame.
donald_comments_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,nixfu,No completed deals = not doing business. He w...,2018-11-30,1,The_Donald
1,Crypulous,"Trump has been amazing on a lot of things, but...",2018-11-30,1,The_Donald
2,Dueler312,Actually Fox news did show it.,2018-11-30,1,The_Donald
3,soberlight,The excuse about being concerned about their r...,2018-11-30,1,The_Donald
4,enterthewalrus,Well Arizona did not get Martha McSally but it...,2018-11-30,1,The_Donald


In [11]:
# Scrape chapotraphouse comments for 1-30 November 2018, two 12-hour blocks per day.
chapo_comments_scrape = scrape_pushshift(
    subreddit='chapotraphouse',
    content='comment',
    start_date=dt.date(2018, 11, 1),
    end_date=dt.date(2018, 11, 30),
    search_range=12,
)

Days Complete: 10
Days Complete: 20
Days Complete: 30


In [13]:
# Flatten the raw chapotraphouse records into a DataFrame and preview the first rows.
chapo_comments_data = scrape_to_dataframe(chapo_comments_scrape)
chapo_comments_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,Lieutenant_Rans,/u/34193248710478172340 you know the drill pos...,2018-11-30,1,ChapoTrapHouse
1,StickmanPirate,You mean ANTIFAcebook,2018-11-30,1,ChapoTrapHouse
2,Soupy_Phil,From MA. This has been going on for years and ...,2018-11-30,1,ChapoTrapHouse
3,Saltwaterpapi,respect the hustle,2018-11-30,1,ChapoTrapHouse
4,thesch,lmao I had no idea Brodus Clay was a Fox News guy,2018-11-30,1,ChapoTrapHouse


In [14]:
# Stack both subreddits' comments row-wise and renumber the index from 0.
combined_data = pd.concat(
    [donald_comments_data, chapo_comments_data],
    axis=0,
    ignore_index=True,
)

In [15]:
# Count identical comment bodies; the most frequent ones expose boilerplate to filter out.
combined_data.body.value_counts()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               1683
Reminder: when someone links to a thread outside of /r/ChapoTrapHouse, please avoid posting or voting in the linked thread. If it's seen as "brigading" it could get you suspended or banned.\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/ChapoTrapHouse) if you have any questions or concerns.*                                                                             

In [16]:
# Remove boilerplate comments one pattern at a time.  Each mask is computed on
# the frame left over from the previous step.  `str.contains(..., regex=False,
# na=False)` does a literal substring match and treats a missing body as
# "no match", so a NaN body no longer raises TypeError here.

# Moderator reminder posted when a thread links outside the subreddit.
reminder_mask = combined_data.body.str.contains(
    'Reminder: when someone links to a thread outside of /r/ChapoTrapHouse',
    regex=False, na=False)
combined_data = combined_data.loc[~reminder_mask]

# NOTE(review): 'f8SJIS3XxU8iqHoInw' looks like an ID embedded in a recurring
# automated comment - confirm what it identifies.
newsnetwork_mask = combined_data.body.str.contains(
    'f8SJIS3XxU8iqHoInw', regex=False, na=False)
combined_data = combined_data.loc[~newsnetwork_mask]

# Automatic-removal notices.
automatic_mask = combined_data.body.str.contains(
    'Your comment was automatically removed', regex=False, na=False)
combined_data = combined_data.loc[~automatic_mask]

# Comments that mention SnapshillBot.
snapshillbot_mask = combined_data.body.str.contains(
    'SnapshillBot', regex=False, na=False)
combined_data = combined_data.loc[~snapshillbot_mask]

# Placeholders left behind for removed or deleted comments.
removal_mask = combined_data.body.isin(['[removed]', '[deleted]'])
combined_data = combined_data.loc[~removal_mask, :]

In [17]:
# Drop rows with any missing value, then renumber the index from 0.
combined_data = combined_data.dropna().reset_index(drop=True)

In [18]:
#combined_data.to_csv('./datasets/raw_donald_chappo_text.csv')

---

Text Cleaning

In [19]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# `regex` is the third-party drop-in module, aliased to the stdlib name.
import regex as re

# The original line `from nltk.stem` was an incomplete statement (SyntaxError);
# these are the two NLTK classes the notebook text says were tried.
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [20]:
def clean_text(raw_text):
    """Reduce one comment to lowercase letters separated by spaces.

    Steps, in order: delete ``https:`` links, delete ``http:`` links, extract
    the text content with BeautifulSoup's 'lxml' parser, lowercase it, and
    replace every character that is not an ASCII letter with a single space.

    Parameters
    ----------
    raw_text : str
        The comment body as scraped.

    Returns
    -------
    str
        The cleaned text (same length as the lowercased text, since each
        non-letter is replaced rather than removed).
    """
    without_links = raw_text
    for link_pattern in (r'https:[^\s]+', r'http:[^\s]+'):
        without_links = re.sub(link_pattern, '', without_links)

    lowered = BeautifulSoup(without_links, 'lxml').get_text().lower()
    return re.sub("[^a-zA-Z]", " ", lowered)

Both the WordNetLemmatizer and the PorterStemmer from NLTK were tried; however, the unaltered text consistently performed better.

In [21]:
# Replace every comment body with its cleaned, letters-only form.
combined_data['body'] = combined_data['body'].apply(clean_text)

In [22]:
# Preview the cleaned comment bodies.
combined_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,nixfu,no completed deals not doing business he w...,2018-11-30,1,The_Donald
1,Crypulous,trump has been amazing on a lot of things but...,2018-11-30,1,The_Donald
2,Dueler312,actually fox news did show it,2018-11-30,1,The_Donald
3,soberlight,the excuse about being concerned about their r...,2018-11-30,1,The_Donald
4,enterthewalrus,well arizona did not get martha mcsally but it...,2018-11-30,1,The_Donald


In [23]:
# (rows, columns) of the combined frame after filtering and cleaning.
combined_data.shape

(58014, 5)

In [24]:
#combined_data.to_csv('./datasets/cleaned_donald_chappo_text.csv')

---

In [4]:
# modifying the function to include a comment subject filter, and changing tracker to 100 days

def scrape_pushshift(subreddit, content, q, start_date, end_date, search_range):
    """Collect Pushshift search results matching a query, one time block per request.

    Same walk as the earlier ``scrape_pushshift`` in this notebook, with two
    differences: every request also carries the search term ``q``, and the
    progress message is printed every 100 days instead of every 10.

    NOTE: this definition rebinds the name ``scrape_pushshift`` and makes
    ``q`` a required argument, so the earlier cells that call it without
    ``q`` raise ``TypeError`` if they are re-run after this cell.

    The span ``start_date``..``end_date`` (inclusive) is split into blocks of
    ``search_range`` hours.  Blocks are requested newest-first, with a
    one-second pause between requests.  Boundaries are sent as relative hour
    offsets (``before=<n>h``, ``after=<m>h``) starting at
    ``(today - end_date)`` days.
    NOTE(review): the offsets are presumably interpreted relative to the
    moment of each request, not to midnight - confirm against the API docs.
    NOTE(review): each request asks for ``size=500``; a block holding more
    matches than that is presumably truncated - confirm.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed as the ``subreddit`` query parameter.
    content : str
        Path segment of the search endpoint (the notebook uses 'comment').
    q : str
        Search term passed as the ``q`` query parameter; may contain spaces.
    start_date, end_date : datetime.date
        First and last day to cover.
    search_range : int
        Width of one block in hours; must be a positive divisor of 24.

    Returns
    -------
    list of dict, or None
        The accumulated records, or ``None`` (after printing a message) when
        ``search_range`` is not a positive divisor of 24.
    """
    # Reject 0 and negatives before the modulo: 0 would raise ZeroDivisionError.
    if search_range <= 0 or 24 % search_range != 0:
        print('not a valid search_range')
        return None

    # setting up initial conditions
    n_blocks = ((end_date - start_date).days + 1) * (24 // search_range)
    before = (dt.date.today() - end_date).days * 24
    after = before + search_range
    completed_hours = 0
    completed_days = 0

    endpoint = f'https://api.pushshift.io/reddit/search/{content}/'
    dict_list = []

    for _ in range(n_blocks):
        # Let requests percent-encode the values; `q` may hold spaces or
        # characters such as '&' that would corrupt a hand-built URL.
        params = {
            'subreddit': subreddit,
            'size': 500,
            'before': f'{before}h',
            'after': f'{after}h',
            'q': q,
        }
        res = requests.get(endpoint, params=params)

        # Retry until the server answers 200, pausing before each new attempt.
        while res.status_code != 200:
            print(res.status_code)
            time.sleep(1)
            res = requests.get(endpoint, params=params)

        the_json = res.json()
        dict_list.extend(the_json['data'])

        # Step one block further into the past.
        before += search_range
        after += search_range
        completed_hours += search_range

        # Progress message every 2400 scraped hours (100 days).
        if completed_hours % 2400 == 0:
            completed_days += 100
            print(f'Days Complete: {completed_days}')
        time.sleep(1)

    return dict_list

In [7]:
# Search the_donald comments for 'McCain', one request per 24-hour block, from
# 2015-01-01 up to the day this cell is run.  Because end_date is
# dt.date.today(), re-running on a later day produces a different data set.
mccain_scrapes = scrape_pushshift(
    subreddit='the_donald',
    content='comment',
    q='McCain',
    start_date=dt.date(2015, 1, 1),
    end_date=dt.date.today(),
    search_range=24,
)

Days Complete: 100
Days Complete: 200
Days Complete: 300
Days Complete: 400
Days Complete: 500
Days Complete: 600
Days Complete: 700
Days Complete: 800
Days Complete: 900
Days Complete: 1000
Days Complete: 1100
Days Complete: 1200
Days Complete: 1300
Days Complete: 1400


In [10]:
# Flatten the 'McCain' search results into a DataFrame and show its (rows, columns).
mccain_data = scrape_to_dataframe(mccain_scrapes)
mccain_data.shape

(64189, 5)

In [26]:
# Preview the first rows of the 'McCain' frame.
mccain_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,havoc2bg,Not the first time McCain peddled foreign prop...,2019-01-01,1,The_Donald
1,bhoelscher,Fucking McCain,2019-01-01,1,The_Donald
2,armorkingII,He didn't sell out. He always was a Deep State...,2019-01-01,1,The_Donald
3,Bulldog65,"Such a two faced sack of crap, cut from the sa...",2019-01-01,1,The_Donald
4,armorkingII,I voted for this guy. I voted for McCain. No l...,2019-01-01,1,The_Donald


In [21]:
# Save the frame as CSV without the index column.
# NOTE(review): assumes the ./datasets directory already exists - confirm.
mccain_data.to_csv('./datasets/mccain_data.csv', index=False)

In [12]:
# Search the_donald comments for 'orange man bad', one request per 24-hour
# block, from 2015-01-01 up to the day this cell is run.  Because end_date is
# dt.date.today(), re-running on a later day produces a different data set.
orange_man_bad_scrapes = scrape_pushshift(
    subreddit='the_donald',
    content='comment',
    q='orange man bad',
    start_date=dt.date(2015, 1, 1),
    end_date=dt.date.today(),
    search_range=24,
)

Days Complete: 100
Days Complete: 200
Days Complete: 300
Days Complete: 400
Days Complete: 500
Days Complete: 600
Days Complete: 700
Days Complete: 800
Days Complete: 900
Days Complete: 1000
Days Complete: 1100
Days Complete: 1200
Days Complete: 1300
Days Complete: 1400


In [13]:
# Flatten the 'orange man bad' search results into a DataFrame and show its (rows, columns).
orange_man_bad_data = scrape_to_dataframe(orange_man_bad_scrapes)
orange_man_bad_data.shape

(6706, 5)

In [14]:
# Preview the first rows of the 'orange man bad' frame.
orange_man_bad_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,BoomerMaxwell,Orange man still bad for eating kfc!,2019-01-01,1,The_Donald
1,Fishyman908,Kim: Hey Trump last meeting was great. Can we ...,2019-01-02,1,The_Donald
2,HillarysRussianBot,But but but Trump is a racist and... And slave...,2019-01-02,1,The_Donald
3,ziggyzona,Assange is the founder of a journalist institu...,2019-01-02,1,The_Donald
4,god_emperor_jeb,The left/right dichotomy only serves the elite...,2019-01-02,1,The_Donald


In [25]:
# Save the frame as CSV without the index column.
# NOTE(review): assumes the ./datasets directory already exists - confirm.
orange_man_bad_data.to_csv('./datasets/orange_man_bad_data.csv', index=False)

In [15]:
# Search the_donald comments for 'god emperor', one request per 24-hour block,
# from 2015-01-01 up to the day this cell is run.  Because end_date is
# dt.date.today(), re-running on a later day produces a different data set.
god_emperor_scrapes = scrape_pushshift(
    subreddit='the_donald',
    content='comment',
    q='god emperor',
    start_date=dt.date(2015, 1, 1),
    end_date=dt.date.today(),
    search_range=24,
)

Days Complete: 100
Days Complete: 200
Days Complete: 300
Days Complete: 400
Days Complete: 500
Days Complete: 600
Days Complete: 700
Days Complete: 800
Days Complete: 900
Days Complete: 1000
Days Complete: 1100
Days Complete: 1200
Days Complete: 1300
Days Complete: 1400


In [16]:
# Flatten the 'god emperor' search results into a DataFrame and show its (rows, columns).
god_emperor_data = scrape_to_dataframe(god_emperor_scrapes)
god_emperor_data.shape

(57768, 5)

In [17]:
# Preview the first rows of the 'god emperor' frame.
god_emperor_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,Taellyn,Right?! I was all packed and ready to go! Stil...,2019-01-02,1,The_Donald
1,ono412,Perhaps even a God Emperor?,2019-01-02,1,The_Donald
2,ShadowBanThisCucks,I wouldn't mind if there's a GOP primary. Then...,2019-01-02,1,The_Donald
3,TheComedianGLP,It's hard to hide your identity as Saviour of ...,2019-01-02,1,The_Donald
4,ConservativelyRight,They're already caving. Our God Emperor will m...,2019-01-02,1,The_Donald


In [23]:
# Save the frame as CSV without the index column.
# NOTE(review): assumes the ./datasets directory already exists - confirm.
god_emperor_data.to_csv('./datasets/god_emperor_data.csv', index=False)

In [5]:
# Search the_donald comments for 'build the wall', one request per 24-hour
# block, from 2015-01-01 up to the day this cell is run.  Because end_date is
# dt.date.today(), re-running on a later day produces a different data set.
build_the_wall_scrapes = scrape_pushshift(
    subreddit='the_donald',
    content='comment',
    q='build the wall',
    start_date=dt.date(2015, 1, 1),
    end_date=dt.date.today(),
    search_range=24,
)

Days Complete: 100
Days Complete: 200
Days Complete: 300
Days Complete: 400
Days Complete: 500
Days Complete: 600
Days Complete: 700
Days Complete: 800
Days Complete: 900
Days Complete: 1000
Days Complete: 1100
Days Complete: 1200
Days Complete: 1300
Days Complete: 1400


In [6]:
# Flatten the 'build the wall' search results into a DataFrame and show its (rows, columns).
build_the_wall_data = scrape_to_dataframe(build_the_wall_scrapes)
build_the_wall_data.shape

(79887, 5)

In [7]:
# Preview the first rows of the 'build the wall' frame.
build_the_wall_data.head()

Unnamed: 0,author,body,date,score,subreddit
0,beerpuke,Mexico has been and will always be a pass thro...,2019-01-02,1,The_Donald
1,TrumpBrickBot,**FOR THE LOVE OF GOD GET THIS PATRIOT A BRICK...,2019-01-02,1,The_Donald
2,Catit50,Did BHO pay for his wall? Did the US tax payer...,2019-01-02,1,The_Donald
3,missionofpiece,"I return all the GOP ""surveys"" that are really...",2019-01-02,1,The_Donald
4,UsernameNeo,I want a build the wall camp and I want it now!,2019-01-02,1,The_Donald


In [8]:
# Save the frame as CSV without the index column.
# NOTE(review): assumes the ./datasets directory already exists - confirm.
build_the_wall_data.to_csv('./datasets/build_the_wall.csv', index=False)