## Step 1: Import the data from two subreddits ##

1. The function takes two subreddits, scrapes the top thousand posts from each, and saves them to CSV. It then combines the DataFrames, maps the subreddit names to the target column, and pre-processes the title text. It returns the cleaned titles (X) and the target (y).

In [21]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests

# SnowballStemmer is provided by nltk; sklearn.feature_extraction.text does not export it.
from nltk.stem import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

In [11]:
# Scrape the all-time top posts of a subreddit from Reddit's JSON listing and cache them as CSVs.
def scrape_subr(subr, max_pages=40, pause=2, out_dir='./data_csvs'):
    """Download the top posts of ``subr`` and return their titles and subreddit names.

    Pages through ``/r/<subr>/top.json?t=all`` using the ``after`` cursor that each
    response carries, collecting the ``data`` payload of every child post.

    Parameters
    ----------
    subr : str
        Subreddit name as it appears in the URL (without the ``/r/`` prefix).
    max_pages : int, default 40
        Upper bound on the number of listing pages requested.
    pause : float, default 2
        Seconds to sleep between successive requests.
    out_dir : str, default './data_csvs'
        Directory that receives ``<subr>_raw.csv`` (every scraped field) and
        ``<subr>_clean.csv`` (title and subreddit only). Created if missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``subreddit``, one row per scraped post. The frame is
        empty (but still has both columns) when nothing could be scraped.
    """
    url = "https://www.reddit.com/r/" + subr + '/top.json?t=all'
    posts = []
    after = None

    for page in range(max_pages):
        current_url = url if after is None else url + '&after=' + after
        print(f'/r/{subr} Page {page}:', current_url)

        # A timeout keeps a stalled connection from hanging the notebook forever.
        res = requests.get(current_url, headers={'User-agent': 'DataSci 5.5'}, timeout=30)

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        listing = res.json()['data']
        posts.extend(child['data'] for child in listing['children'])

        after = listing['after']
        if after is None:
            # No cursor means this was the last page; requesting again would
            # restart from page one and duplicate posts.
            break

        time.sleep(pause)

    # A frame built from scraped posts normally has no 'Unnamed: 0' column, so the
    # drop must tolerate its absence instead of raising KeyError.
    df = pd.DataFrame(posts).drop(columns=['Unnamed: 0'], errors='ignore')

    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f'{subr}_raw.csv'), index=False)

    # For separation of concerns, this could be pulled out into a "clean_df/csv" function.
    # reindex (rather than df[[...]]) keeps both columns even when no post was scraped.
    df_clean = df.reindex(columns=['title', 'subreddit'])
    df_clean.to_csv(os.path.join(out_dir, f'{subr}_clean.csv'), index=False)
    return df_clean

In [17]:
def redditize(subr1, subr2):
    """Scrape two subreddits and return cleaned titles (X) and a binary target (y).

    Each subreddit is scraped with ``scrape_subr`` (which also writes its CSVs), the
    two frames are concatenated, titles are lowercased and stripped of punctuation,
    and the subreddit name is mapped to a label: ``subr1`` -> 1, ``subr2`` -> 0.

    Parameters
    ----------
    subr1, subr2 : str
        Subreddit names. Matching against the scraped ``subreddit`` column is
        case-insensitive, because the value in the scraped data may be cased
        differently from what the caller typed.

    Returns
    -------
    (pandas.Series, pandas.Series)
        ``processed`` titles and the ``subreddit`` label. Rows whose subreddit
        matches neither name get a NaN label.

    Example
    -------
    >>> X, y = redditize('subr1', 'subr2')  # doctest: +SKIP
    """
    # Scrape each subreddit (saved to CSV as a side effect) and stack the results.
    frames = [scrape_subr(subr) for subr in (subr1, subr2)]
    df = pd.concat(frames, ignore_index=True)

    # Clean up the title column: lowercase it and remove punctuation.
    # Missing titles become empty strings instead of crashing on ``.lower()``.
    # NOTE: further features were sketched but never enabled, e.g. a flag for titles
    # containing '?' and a count of non-stopword tokens (via nltk stopwords).
    titles = df['title'].fillna('').astype(str)
    df['processed'] = titles.map(lambda title: re.sub(r'[^\w\s]', '', title.lower()))

    # This will be our X and y. Keys are lowercased so the lookup does not depend on
    # how the caller capitalised the subreddit names.
    label_by_name = {
        subr1.lower(): 1,
        subr2.lower(): 0,
    }
    df['subreddit'] = df['subreddit'].astype(str).str.lower().map(label_by_name)
    return df['processed'], df['subreddit']

In [18]:
# NOTE(review): `df` is not assigned at notebook level in any cell visible here; it only
# exists as a local inside `redditize`. This cell therefore appears to rely on leftover
# kernel state and would raise NameError on a fresh kernel. The intended call looks like
# `X, y = redditize(subr1, subr2)` -- TODO confirm which two subreddits were used.
X, y = df['processed'], df['subreddit']

## Step 2: Train/test split, fit our model on the training set, and score it on both splits ##

In [23]:
#Tokenizer needed for our model (may pickle for later)
from nltk import word_tokenize          

class SnowballTokenizer:
    """Callable tokenizer: split a document with NLTK, then Snowball-stem each token."""

    def __init__(self):
        # One English stemmer, built once and reused for every document.
        # NOTE(review): attribute name kept as `sbs`; pickled instances presumably
        # restore it by name -- confirm before renaming.
        self.sbs = SnowballStemmer('english')

    def __call__(self, doc):
        """Return the list of stemmed tokens found in ``doc``."""
        stem = self.sbs.stem
        return [stem(token) for token in word_tokenize(doc)]

In [24]:
# Import our pickled model
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code --
# only load pickle files from a source you trust.
# NOTE(review): presumably the pickled pipeline references SnowballTokenizer, in which
# case that class must already be defined when this runs -- TODO confirm.
model = joblib.load('./tf_svc_pipe.pkl')

In [19]:
# Hold out a test set. The fixed seed makes the split, and therefore the scores
# reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
# Refit the loaded pipeline on the training split.
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=9500, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
 ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [26]:
# Score the model on the same split it was just fit on (the training score).
model.score(X_train, y_train)

0.9966577540106952

In [27]:
# Score the model on the held-out split, which it did not see during fitting.
model.score(X_test, y_test)
# This is not horrible, but not great (the original two subreddits got 98%).

0.9619238476953907

In [None]:
# Function that takes X and y, splits them, fits the model, and reports accuracy
# and a confusion matrix for the held-out split.
def run_model(X, y, estimator=None, random_state=None):
    """Fit a model on a train split and print its accuracy and confusion matrix.

    Parameters
    ----------
    X, y
        Features and target, passed straight to ``train_test_split``.
    estimator : optional
        Object exposing ``fit``, ``score`` and ``predict``. Defaults to the
        notebook-level ``model``.
    random_state : optional
        Forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    None
        Results are printed: training accuracy, testing accuracy, and a table of
        actual vs. predicted labels on the test split.
    """
    clf = model if estimator is None else estimator
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    clf.fit(X_train, y_train)
    print(f"Training Accuracy: {clf.score(X_train, y_train)}")
    print(f"Testing Accuracy: {clf.score(X_test, y_test)}")

    # Plain arrays on both sides so the cross-tabulation pairs values by position
    # rather than by whatever index the shuffled split left on y_test.
    confusion = pd.crosstab(
        np.asarray(y_test),
        np.asarray(clf.predict(X_test)),
        rownames=['actual'],
        colnames=['predicted'],
    )
    print(confusion)

In [28]:
# Pull the two fitted pipeline stages out by step name for inspection.
tfidf, svc = (model.named_steps[step] for step in ('tfidf', 'svc'))

In [53]:
# Index the first row of SVC coefficients by feature name, sort ascending once,
# then take the five largest and the five smallest.
coef_by_feature = pd.DataFrame(svc.coef_.toarray()[0], tfidf.get_feature_names()).sort_values(0)
top_coefs = coef_by_feature[-5:]
least_coefs = coef_by_feature[:5]

In [55]:
# Show the five features with the largest coefficients (sorted ascending, so the largest is last).
top_coefs

Unnamed: 0,0
us,1.529895
pipelin,1.693543
rig,2.175624
opec,2.382832
oil,6.151136
