## Loading Libraries & Data

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

%matplotlib inline

### Collecting Data via Web API and Creating a Dataframe

In [3]:
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 5): # Code from Mahdi
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full 

In [4]:
csb = query_pushshift('TwoBestFriendsPlay', kind='submission', day_window = 30, n = 11)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=150d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=180d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=210d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPlay&size=500&after=240d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TwoBestFriendsPl

In [5]:
csb.shape

(1164, 9)

In [6]:
csb.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Hard and Soft canon,Resident Evil isnt the only capcom game that r...,TwoBestFriendsPlay,1577905719,SuperUnhappyman,19,1,True,2020-01-01
14,What are some games with good first-person mel...,I'm not talking hitting someone with a gun typ...,TwoBestFriendsPlay,1577912961,GoodVillain101,62,1,True,2020-01-01
18,How to Completely Retrain Yourself in Fighting...,Ok so I've been asking for some advice now and...,TwoBestFriendsPlay,1577914380,redwill1001,16,1,True,2020-01-01
22,What are some games you would love an in depth...,Example 1: Resident evil 0 was originally a N6...,TwoBestFriendsPlay,1577919099,dope_danny,46,1,True,2020-01-01
28,Star Wars Prequel Trilogy vs New Trilogy,After finally seeing Rise of Skywalker a littl...,TwoBestFriendsPlay,1577922063,bassmanchris95,50,1,True,2020-01-01


In [7]:
rt = query_pushshift('roosterteeth', kind='submission', day_window = 15, n = 7)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=45d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=roosterteeth&size=500&after=105d
Query Complete!


In [8]:
rt.shape

(1211, 9)

In [9]:
rt.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Off Topic Livestream Discussion Thread,#[Click Here To Watch The Livestream](http://r...,roosterteeth,1579202939,AutoModerator,4,1,True,2020-01-16
13,RT iPhone app issue,Is anyone else having an issue with downloaded...,roosterteeth,1579242644,workingthesystem,2,1,True,2020-01-17
16,Why is Joel not on the RT Podcast anymore?,He is probably my favorite cast member on the ...,roosterteeth,1579269479,Bandit_Ke1th,12,1,True,2020-01-17
23,"Million dollars, but...","Million dollars, but... Every time you spoke o...",roosterteeth,1579273812,Beigeun,6,1,True,2020-01-17
24,Looking for source: Story of Michael yelling a...,I remember a story from a podcast about Michae...,roosterteeth,1579279419,thoughtless447,3,1,True,2020-01-17


In [10]:
data = pd.concat([csb, rt], sort = False)
data.reset_index(drop=True)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Hard and Soft canon,Resident Evil isnt the only capcom game that r...,TwoBestFriendsPlay,1577905719,SuperUnhappyman,19,1,True,2020-01-01
1,What are some games with good first-person mel...,I'm not talking hitting someone with a gun typ...,TwoBestFriendsPlay,1577912961,GoodVillain101,62,1,True,2020-01-01
2,How to Completely Retrain Yourself in Fighting...,Ok so I've been asking for some advice now and...,TwoBestFriendsPlay,1577914380,redwill1001,16,1,True,2020-01-01
3,What are some games you would love an in depth...,Example 1: Resident evil 0 was originally a N6...,TwoBestFriendsPlay,1577919099,dope_danny,46,1,True,2020-01-01
4,Star Wars Prequel Trilogy vs New Trilogy,After finally seeing Rise of Skywalker a littl...,TwoBestFriendsPlay,1577922063,bassmanchris95,50,1,True,2020-01-01
...,...,...,...,...,...,...,...,...,...
2370,Thought I saw a RWBY reference in Doctor Sleep...,Got to see Doctor Sleep last night and may hav...,roosterteeth,1572551927,rabidstu3,6,3,True,2019-10-31
2371,Is there any way to find out who's in a specif...,[removed],roosterteeth,1572555130,gditbarb,0,1,True,2019-10-31
2372,I speculate that #Teamtrees and all the donati...,These are just my thoughts and I truly hope I ...,roosterteeth,1572558705,TrapCardLol,10,0,True,2019-10-31
2373,[No Spoliers] RWBY Posters in Doctor Sleep,"Just saw Doctor Sleep. No spoilers, thought it...",roosterteeth,1572565161,octopus-god,5,12,True,2019-10-31


## Data Cleaning

In [11]:
data.isnull().sum()

title           0
selftext        1
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [12]:
data[data['selftext'].isnull()]
data.drop(axis=0, index=125, inplace=True)

In [13]:
data.isnull().sum()

title           0
selftext        1
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [14]:
data.dtypes

title           object
selftext        object
subreddit       object
created_utc      int64
author          object
num_comments     int64
score            int64
is_self           bool
timestamp       object
dtype: object

In [15]:
data[data['is_self'] != True]

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp


In [16]:
data['subreddit'] = data['subreddit'].map({'roosterteeth': 0, 'TwoBestFriendsPlay': 1})

In [17]:
data['subreddit'].value_counts()

0    1210
1    1162
Name: subreddit, dtype: int64

In [18]:
data.tail()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
490,Thought I saw a RWBY reference in Doctor Sleep...,Got to see Doctor Sleep last night and may hav...,0,1572551927,rabidstu3,6,3,True,2019-10-31
492,Is there any way to find out who's in a specif...,[removed],0,1572555130,gditbarb,0,1,True,2019-10-31
495,I speculate that #Teamtrees and all the donati...,These are just my thoughts and I truly hope I ...,0,1572558705,TrapCardLol,10,0,True,2019-10-31
498,[No Spoliers] RWBY Posters in Doctor Sleep,"Just saw Doctor Sleep. No spoilers, thought it...",0,1572565161,octopus-god,5,12,True,2019-10-31
499,Man of Medan,"Hey rooster teeth, are you planning on doing m...",0,1572565441,jo3keelo,17,43,True,2019-10-31


In [20]:
data.to_csv('subreddit_data')