In [1]:
import pandas as pd
import json
import ast
import requests
import time
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def two_subreddits(subreddit1, subreddit2):
    url1 = 'https://www.reddit.com/r/' + subreddit1 + '/top.json?sort=top&t=all'
    url2 = 'https://www.reddit.com/r/'+ subreddit2 +'/top.json?sort=top&t=all'
    
    #gathering urls and checking status codes
    headers = {'User-agent': 'Adrian'}
    res1 = requests.get(url1, headers = headers)
    res2 = requests.get(url2, headers = headers)
    if (res1.status_code < 300 and res2.status_code < 300):
        print ('Urls received')
    else:
        print('Please inset a valid subreddit')
        return
    
    
    print('Gathering posts...')
    
    
    # gathering posts for subreddit 1
    posts1 = []
    after = None
    for i in range(40):
        if after == None:
            params = {}
        else:
            params = {'after': after}
        res1 = requests.get(url1, params = params, headers = headers)
        if res1.status_code == 200:
            the_json1 = res1.json()
            posts1.extend(the_json1['data']['children'])
            after = the_json1['data']['after']
        else:
            print(f'Server error {res1.status_code}, please try different subreddits')
            return
        time.sleep(1)
    print(f'{len(posts1)} posts gathered from {subreddit1} subreddit')
    
    
    # gathering posts for subreddit 2
    posts2 = []
    for i in range(40):
        if after == None:
            params = {}
        else:
            params = {'after': after}
        res2 = requests.get(url2, params = params, headers = headers)
        if res2.status_code == 200:
            the_json2 = res2.json()
            posts2.extend(the_json2['data']['children'])
            after = the_json2['data']['after']
        else:
            print(f'Server error {res2.status_code}, please try different subreddits')
            return
        time.sleep(1)
    print(f'{len(posts2)} posts gathered from {subreddit2} subreddit')
    
    
    # checking number of posts, if too few, return
    if (len(posts1) < 20 or len(posts2) < 20):
        print('Subreddits do not contain enough posts, please try different subreddits')
        return
    
    
    # creating dataframe with data
    posts1_df = pd.DataFrame(posts1)
    posts2_df = pd.DataFrame(posts2)
    
    # creating subreddit column. 1 stands for subreddit1 and 0 stands for subreddit2
    posts1_df['subreddit'] = 1
    posts2_df['subreddit'] = 0
    
    # getting titles and selftext from posts from the original lists
    posts1_df['title'] = [(post['data']['title']) for post in posts1]
    posts2_df['title'] = [(post['data']['title']) for post in posts2]
    
    posts1_df['selftext'] = [(post['data']['selftext']) for post in posts1]
    posts2_df['selftext'] = [(post['data']['selftext']) for post in posts2]
    
    # combining title text and selftext into another column
    posts1_df['alltext'] = posts1_df['title'] + ' ' + posts1_df['selftext']
    posts2_df['alltext'] = posts2_df['title'] + ' ' + posts2_df['selftext']
    
    
    # concat both dataframes
    master_df = pd.concat([posts1_df, posts2_df], ignore_index = True)
    try:
        master_df.drop('kind', axis = 1, inplace = True) # removing useless column
    except:
        pass
    
    
    # getting rid off urls and mentions of other subreddits
    master_df['alltext'] = master_df.alltext.map(lambda x: re.sub('\s[\/]?r\/[^s]+', ' ', x))
    master_df['alltext'] = master_df.alltext.map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))
    master_df['alltext'] = master_df.alltext.map(lambda x: re.sub("\d+", "", x))

    
    # checking for unbalanced classes:
    if (((len(master_df[master_df['subreddit']==1])/(len(master_df['subreddit']))) > .60)
        or (len(master_df[master_df['subreddit']==0])/(len(master_df['subreddit']))) > .60):
        print('WARNING: Unbalanced classes')
        print('Do you wish to continue?')
        answer = input()
        if answer == 'yes':
            pass
        else:
            print('Function has ended')
            return
    
    # baseline
    baseline = max(((len(master_df[master_df["subreddit"] == 1]))/(len(master_df))), 
                   ((len(master_df[master_df["subreddit"] == 0]))/(len(master_df))))
    print(f'Your baseline is: {baseline}')
    
    # defining X and y
    X = master_df['alltext']
    y = master_df['subreddit']
    
    
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)
    
    
    # Tfidf Vectorizer transformer
    tfidf = TfidfVectorizer(stop_words = 'english',
                            max_df = .95,
                            min_df = 5)
    
    
    # applying tfidf vectorizer to train and test data
    train_tfi = tfidf.fit_transform(X_train)
    train_df = pd.SparseDataFrame(train_tfi, columns = tfidf.get_feature_names())
    train_df.fillna(0, inplace = True)
    test_tfi = tfidf.transform(X_test)
    test_df = pd.SparseDataFrame(test_tfi, columns=tfidf.get_feature_names())
    test_df.fillna(0, inplace=True)
    
    
    # RandomForestClassifier model. Also using GridSearch to look for best hyperparameters
    rf = RandomForestClassifier()
    params = {'min_samples_split': [12, 25, 40], 'n_estimators': [100, 500]}
    gs = GridSearchCV(rf, param_grid=params, return_train_score=True, cv=5)
    gs.fit(train_df, y_train)
    
    # accuracy score for randomforestclassifier
    print(f'Accuracy score using Random Forest Classifier model is {gs.score(train_df, y_train)} on the train data')
    print(f'Accuracy score using Random Forest Classifier model is {gs.score(test_df, y_test)} on the test data')

    
    
    # CountVectorizer transformer
    countvect = CountVectorizer(analyzer = 'word',
                                tokenizer = None,
                                preprocessor = None,
                                stop_words = 'english',
                                max_features = 5000)

    
    # applying countvectorizer to our X variable
    X_train_cv = countvect.fit_transform(X_train)
    X_test_cv = countvect.transform(X_test)
    
    
    # instantiating and using logistic regression model
    lr = LogisticRegression()
    lr.fit(X_train_cv, y_train);
    
    lr_score_test = lr.score(X_test_cv, y_test)
    lr_score_train = lr.score(X_train_cv, y_train)
    
    # acuracy score
    print(f'Accuracy score using Logistic Regression model is {lr_score_train} on the train data')
    print(f'Accuracy score using Logistic Regression model is {lr_score_test} on the test data')
          
    
    # Multinomial Naive Bayes
    nb = MultinomialNB()
    nb.fit(X_train_cv, y_train)
    
    nb_score_train = nb.score(X_train_cv, y_train)
    nb_score_test = nb.score(X_test_cv, y_test)
    
    # accuracy score for multinomialNB
    print(f'Accuracy score using Multinomial Naive Bayes model is {nb_score_train} on the train data')
    print(f'Accuracy score using Multinomial Naive Bayes model is {nb_score_test} on the test data')    
    
    
    importance = pd.DataFrame(gs.best_estimator_.feature_importances_, train_df.columns, columns = ['importance'])
    
    print('These were the most important words:')
    
    print(importance.sort_values(by = 'importance', ascending = False).head(10))
    


In [4]:
two_subreddits('europe', 'canada')

Urls received
Gathering posts...
996 posts gathered from europe subreddit
992 posts gathered from canada subreddit
Your baseline is: 0.5010060362173038
Accuracy score using Random Forest Classifier model is 0.9758551307847082 on the train data
Accuracy score using Random Forest Classifier model is 0.8430583501006036 on the test data
Accuracy score using Logistic Regression model is 0.9899396378269618 on the train data
Accuracy score using Logistic Regression model is 0.8772635814889336 on the test data
Accuracy score using Multinomial Naive Bayes model is 0.9738430583501007 on the train data
Accuracy score using Multinomial Naive Bayes model is 0.8953722334004024 on the test data
These were the most important words:
           importance
canada       0.166086
canadian     0.073750
canadians    0.034939
toronto      0.021928
europe       0.021178
trudeau      0.017295
eu           0.016250
vancouver    0.014354
alberta      0.013440
ontario      0.011652


In [5]:
two_subreddits('datascience', 'computerscience')

Urls received
Gathering posts...
998 posts gathered from datascience subreddit
998 posts gathered from computerscience subreddit
Your baseline is: 0.5
Accuracy score using Random Forest Classifier model is 0.9926519706078825 on the train data
Accuracy score using Random Forest Classifier model is 0.8797595190380761 on the test data
Accuracy score using Logistic Regression model is 0.9946559786239145 on the train data
Accuracy score using Logistic Regression model is 0.8957915831663327 on the test data
Accuracy score using Multinomial Naive Bayes model is 0.9565798263193053 on the train data
Accuracy score using Multinomial Naive Bayes model is 0.8977955911823647 on the test data
These were the most important words:
             importance
data           0.140256
computer       0.073110
cs             0.036351
science        0.020794
python         0.016612
programming    0.016132
machine        0.015349
analysis       0.013567
scientist      0.013211
scientists     0.012343


In [7]:
two_subreddits('food', 'healthyfood')

Urls received
Gathering posts...
989 posts gathered from food subreddit
999 posts gathered from healthyfood subreddit
Your baseline is: 0.5025150905432596
Accuracy score using Random Forest Classifier model is 0.9932930918846412 on the train data
Accuracy score using Random Forest Classifier model is 0.9517102615694165 on the test data
Accuracy score using Logistic Regression model is 0.9825620389000671 on the train data
Accuracy score using Logistic Regression model is 0.9496981891348089 on the test data
Accuracy score using Multinomial Naive Bayes model is 0.9738430583501007 on the train data
Accuracy score using Multinomial Naive Bayes model is 0.9416498993963782 on the test data
These were the most important words:
          importance
homemade    0.343097
ate         0.130141
salad       0.034016
cake        0.022484
salmon      0.013892
chef        0.013669
pro         0.011803
healthy     0.011472
lunch       0.010605
rice        0.008561
