To test the effectiveness of our approach, we will duplicate it on completely separate subreddits to see the results.

In [32]:
# importing the libraries
import requests
import time
import pandas as pd
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [5]:
def scrape_pushshift(subreddit, content, start_date, end_date, search_range):
    if 24 % search_range != 0:
        print('not a valid search_range')
        return None
    
    # setting up number of inital conditions
    hours = ((end_date-start_date).days + 1) * (24//search_range)
    before = (dt.date.today()-end_date).days*24
    after = before + search_range
    completed_hours = 0
    completed_days = 0
    
    dict_list = []

    for _ in range(hours):
        # sets up url with the specified subreddit and next hour block
        url = f'https://api.pushshift.io/reddit/search/{content}/?subreddit={subreddit}&size=500&before={str(before)}h&after={str(after)}h'
        res = requests.get(url)
        
        # checks for a valid connection
        while res.status_code != 200:
            print(res.status_code)
            res = requests.get(url)
            time.sleep(1)
        
        the_json = res.json()
        dict_list.extend(the_json['data'])
        
        before += search_range
        after += search_range
        completed_hours += search_range
        
        if completed_hours % 240 == 0:
            completed_days += 10
            print(f'Days Complete: {completed_days}')
        time.sleep(1)

    return dict_list

In [6]:
def scrape_to_dataframe(data):
    posts = []
    for dct in data:
        post_dict = {}
        post_dict['subreddit'] = dct['subreddit']
        post_dict['title'] = dct['title']
        post_dict['author'] = dct['author']
        post_dict['score'] = dct['score']
        post_dict['domain'] = dct['domain']
        post_dict['source_url'] = dct['url']
        posts.append(post_dict)

    return pd.DataFrame(posts)

In [8]:
zelda_scrape = scrape_pushshift(subreddit = 'zelda', 
                                content = 'submission', 
                                start_date = dt.date(2018, 9, 1), 
                                end_date = dt.date(2018, 11, 30),
                                search_range = 24)

Days Complete: 10
Days Complete: 20
Days Complete: 30
Days Complete: 40
Days Complete: 50
Days Complete: 60
Days Complete: 70
Days Complete: 80
Days Complete: 90


In [9]:
zelda_data = scrape_to_dataframe(zelda_scrape)
zelda_data.head()

Unnamed: 0,author,domain,score,source_url,subreddit,title
0,zatchquill,self.zelda,1,https://www.reddit.com/r/zelda/comments/a1ob7g...,zelda,Has a Zelda game inspired hobbies and careers ...
1,Anilxe,i.imgur.com,1,https://i.imgur.com/EN2gFLQ.gifv,zelda,Hopping off these and sliding off into the wat...
2,QuantumFighter,self.zelda,1,https://www.reddit.com/r/zelda/comments/a1oqyb...,zelda,I'm playing through all of the main series LOZ...
3,Viathan1108,i.redd.it,1,https://i.redd.it/a635rxt73e121.jpg,zelda,My first tat.
4,landyoop,twitter.com,1,https://twitter.com/LookAtT74424605/status/106...,zelda,Pretty cool. You can get the The Legend of Zel...


In [10]:
bitcoin_scrape = scrape_pushshift(subreddit = 'bitcoin', 
                                  content = 'submission', 
                                  start_date = dt.date(2018, 11, 1), 
                                  end_date = dt.date(2018, 11, 30),
                                  search_range = 24)

Days Complete: 10
Days Complete: 20
Days Complete: 30


In [11]:
bitcoin_data = scrape_to_dataframe(bitcoin_scrape)
bitcoin_data.head()

Unnamed: 0,author,domain,score,source_url,subreddit,title
0,jpoc1,self.Bitcoin,1,https://www.reddit.com/r/Bitcoin/comments/a1ob...,Bitcoin,Bitcoin’s Crash and Post-Thanksgiving Market P...
1,dubblies,self.Bitcoin,1,https://www.reddit.com/r/Bitcoin/comments/a1od...,Bitcoin,Bitcoin recovery method - anything in the works?
2,herzmeister,medium.com,1,https://medium.com/@kaykurokawa/the-nigerian-n...,Bitcoin,The Nigerian Nakamoto Scam – Kay Kurokawa
3,content404,i.redd.it,1,https://i.redd.it/0s07eorsvd121.png,Bitcoin,This is why short term volatility doesn't faze...
4,CumiasTheBeesKnees,imgflip.com,1,https://imgflip.com/i/2nsnss,Bitcoin,When withdrawing on Bitfinex.


In [12]:
combined_data = pd.concat([zelda_data, bitcoin_data], axis=0, ignore_index=True)

In [13]:
removal_mask = (combined_data.title == '[removed]') | (combined_data.title == '[deleted]') 
combined_data = combined_data.loc[~removal_mask, :]
combined_data.dropna(inplace=True)
combined_data.reset_index(inplace=True)

In [18]:
combined_data.shape

(13176, 7)

Text Cleaning

In [19]:
from bs4 import BeautifulSoup      
from nltk.corpus import stopwords
import regex as re

In [16]:
def clean_text(raw_text):
    text = re.sub(r'https:[^\s]+', repl='', string=raw_text)
    text = re.sub(r'http:[^\s]+', repl='', string=text)
    text = BeautifulSoup(text, 'lxml').get_text() 
    text = text.lower()
    words_only = re.sub("[^a-zA-Z]", " ", text)
    return(words_only)

In [17]:
combined_data.title = combined_data.title.map(clean_text)

In [20]:
#combined_data.to_csv('./datasets/zelda-bitcoin.csv', index=False, index_label=None)

In [21]:
X = combined_data['title']
y = combined_data['subreddit'].map(lambda x: 1 if x == 'Bitcoin' else 0)

In [22]:
#baseline
y.value_counts(normalize=True)

1    0.664466
0    0.335534
Name: subreddit, dtype: float64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, stratify=y)

---

In [35]:
def vect_model(vectorizer, model):    
    vect_dict = {
        'count' : CountVectorizer(), 
        'td' : TfidfVectorizer()
    }

    model_dict = {
        'bayes' : MultinomialNB(), 
    }

    steps = [
        ('vectorize', vect_dict[vectorizer]),
        ('model', model_dict[model])
    ]

    return Pipeline(steps = steps)

Testing different vectorizors with Naive Bayes

In [36]:
bayes_count_model = vect_model('count', 'bayes')

In [37]:
pipe_params = {
    'vectorize__stop_words': ['english', None],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [5000, 10000, None]
}

grid = GridSearchCV(bayes_count_model, pipe_params, n_jobs=1, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   10.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...zer=None, vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vectorize__stop_words': ['english', None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [5000, 10000, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [38]:
grid.best_params_

{'vectorize__max_features': None,
 'vectorize__ngram_range': (1, 2),
 'vectorize__stop_words': None}

In [39]:
grid.best_estimator_.score(X_train, y_train)

0.9876660341555977

In [40]:
grid.best_estimator_.score(X_test, y_test)

0.9571320182094082

---

In [41]:
# testing with tifidf
bayes_td_model = vect_model('td', 'bayes')

In [42]:
pipe_params = {
    'vectorize__stop_words': ['english', None],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [5000, 10000, None],
    'vectorize__lowercase' : [True, False],
    'vectorize__norm' : ['l1', 'l2']
}

grid = GridSearchCV(bayes_td_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-2)]: Done 144 out of 144 | elapsed:   11.0s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...e,
        vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english', None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [5000, 10000, None], 'vectorize__lowercase': [True, False], 'vectorize__norm': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [43]:
grid.best_params_

{'vectorize__lowercase': True,
 'vectorize__max_features': 5000,
 'vectorize__ngram_range': (1, 1),
 'vectorize__norm': 'l2',
 'vectorize__stop_words': 'english'}

In [44]:
grid.best_estimator_.score(X_train, y_train)

0.9671726755218216

In [45]:
grid.best_estimator_.score(X_test, y_test)

0.9438543247344461

Given the high accuarcy on train and test data for both naive bayes and logistic regression, we can conclude that this methodology works well given two subreddits with completly unrelated content. 