In [241]:
# Import libraries
# Standard library
import os
import time

# Third-party
import pandas as pd
import praw
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


In [3]:
# Build the Reddit API client from credentials supplied through the environment.
# SECURITY: the client id/secret that used to be hardcoded in this cell must be
# treated as compromised - revoke them in the Reddit app settings and export fresh
# values as REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET before running the notebook.
# A missing variable raises KeyError here, before any request is attempted.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='reddit_ webscrape',
)

In [4]:
# Scrape the newest posts from the Apple and Samsung subreddits.
# One record per submission keeps every field name next to its value.
# `submission.subreddit` is a PRAW model object, so it is converted with str() to
# store the plain subreddit name; downstream encoding then works on ordinary,
# sortable strings instead of API objects.
post_columns = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created']
post_records = [
    {
        'title': submission.title,
        'score': submission.score,
        'id': submission.id,
        'subreddit': str(submission.subreddit),
        'url': submission.url,
        'num_comments': submission.num_comments,
        'body': submission.selftext,
        'created': submission.created,
    }
    for submission in reddit.subreddit('apple+samsung').new(limit=None)
]
# Passing `columns` keeps the expected schema even if the listing comes back empty.
posts_apple_samsung = pd.DataFrame(post_records, columns=post_columns)
# Show head of new dataframe
posts_apple_samsung.head()

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,Luna Display Introduces Mac-to-Mac Mode Allowi...,1,dj9b2z,apple,https://www.macrumors.com/2019/10/17/luna-disp...,0,,1571361000.0
1,Which one to choose: S10+ or N10+? (Specifics ...,1,dj8t63,samsung,https://www.reddit.com/r/samsung/comments/dj8t...,2,"Ok, so I'm finding it hard to decide over thes...",1571359000.0
2,Samsung Admits Major Security Flaw in Galaxy S...,1,dj8sxk,samsung,https://www.reddit.com/r/samsung/comments/dj8s...,1,[News Post](http://www.macrumors.com/2019/10/1...,1571358000.0
3,Samsung DeX is the most useful it's ever been ...,2,dj8dxe,samsung,https://www.reddit.com/r/samsung/comments/dj8d...,4,I just got on the OneUI Beta a couple days ago...,1571357000.0
4,Major Galaxy S10/Note 10 fingerprint flaw will...,4,dj8dw5,samsung,https://www.sammobile.com/news/major-galaxy-s1...,3,,1571357000.0


In [5]:
# Convert the subreddit label into a binary indicator column (`subreddit_samsung`).
# - The label is cast to str first so the dummy categories are plain strings;
#   NOTE(review): pandas sorts categories when the values are sortable, which makes
#   `drop_first` consistently drop 'apple' and keep 'samsung' - confirm on your version.
# - `dtype=int` pins the indicator to 0/1; later cells map {0: 'apple', 1: 'samsung'},
#   and newer pandas releases would otherwise return booleans here.
posts_apple_samsung = pd.get_dummies(
    posts_apple_samsung.astype({'subreddit': str}),
    columns=['subreddit'],
    drop_first=True,
    dtype=int,
)
posts_apple_samsung

Unnamed: 0,title,score,id,url,num_comments,body,created,subreddit_samsung
0,Luna Display Introduces Mac-to-Mac Mode Allowi...,1,dj9b2z,https://www.macrumors.com/2019/10/17/luna-disp...,0,,1.571361e+09,0
1,Which one to choose: S10+ or N10+? (Specifics ...,1,dj8t63,https://www.reddit.com/r/samsung/comments/dj8t...,2,"Ok, so I'm finding it hard to decide over thes...",1.571359e+09,1
2,Samsung Admits Major Security Flaw in Galaxy S...,1,dj8sxk,https://www.reddit.com/r/samsung/comments/dj8s...,1,[News Post](http://www.macrumors.com/2019/10/1...,1.571358e+09,1
3,Samsung DeX is the most useful it's ever been ...,2,dj8dxe,https://www.reddit.com/r/samsung/comments/dj8d...,4,I just got on the OneUI Beta a couple days ago...,1.571357e+09,1
4,Major Galaxy S10/Note 10 fingerprint flaw will...,4,dj8dw5,https://www.sammobile.com/news/major-galaxy-s1...,3,,1.571357e+09,1
...,...,...,...,...,...,...,...,...
1848,"John Gruber: ""Apple has shipped hardware that ...",444,d3yspi,https://daringfireball.net/linked/2019/09/13/d...,197,,1.568453e+09,0
1849,Apple Fifth Ave. Store Reopening Animation,334,d3yql2,https://www.youtube.com/watch?v=1NKw1C2_VDU,19,,1.568453e+09,0
1850,"The new iPhones don’t launch till next week, b...",1262,d3yofv,https://twitter.com/markgurman/status/11725708...,124,,1.568453e+09,0
1851,iPhone 11 Pro cinematic tests — Apple,143,d3xqgf,https://youtu.be/7krzWNOXrFY,51,,1.568448e+09,0


In [263]:
# Save collected data to csv file.
# to_csv does not create missing parent directories, so make sure data/ exists
# first; otherwise this cell fails on a fresh checkout.
os.makedirs('data', exist_ok=True)
posts_apple_samsung.to_csv('data/apple_samsung_posts.csv', index=False)

In [7]:
# Features are the raw post titles; the target is the samsung indicator column
X, y = posts_apple_samsung['title'], posts_apple_samsung['subreddit_samsung']

In [315]:
# Logistic regression on bag-of-words counts, tuned with a grid search

# Pipeline: CountVectorizer features feeding a LogisticRegression classifier
pipe_cvec = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression()),
])

# Hyperparameter grid for the vectorizer step (keys carry the 'cvec__' step prefix)
pipe_cvec_params = dict(
    cvec__max_features=[100, 300, 500, 1000, 5000, 10000],
    cvec__stop_words=['english', None],
    cvec__ngram_range=[(1, 5), (1, 2), (1, 1), (1, 10), (1, 20)],
)

# 5-fold cross-validated search over the grid above
gs_cvec = GridSearchCV(estimator=pipe_cvec, param_grid=pipe_cvec_params, cv=5)

# Naive Bayes grid searches: Bernoulli on raw counts, Multinomial on TF-IDF weights

# --- Bernoulli NB on CountVectorizer features ---
pipe_cvec_nb_bn = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('bn', BernoulliNB()),
])

# Vectorizer settings to search for the Bernoulli pipeline
pipe_cvec_nb_bn_params = dict(
    cvec__max_features=[100, 500, 1000, 5000, 10000],
    cvec__stop_words=['english', None],
    cvec__ngram_range=[(1, 2), (1, 1), (1, 5), (1, 10), (1, 20)],
)

# 5-fold cross-validated search for the Bernoulli / CountVectorizer pipeline
gs_cvec_nb_bn = GridSearchCV(
    estimator=pipe_cvec_nb_bn,
    param_grid=pipe_cvec_nb_bn_params,
    cv=5,
)

# --- Multinomial NB on TfidfVectorizer features ---
# The classifier step keeps the label 'bn' even though it holds MultinomialNB.
pipe_tvec_nb_mn = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('bn', MultinomialNB()),
])

# Vectorizer settings to search for the Multinomial pipeline
pipe_tvec_nb_mn_params = dict(
    tvec__max_features=[100, 500, 1000, 5000, 10000],
    tvec__stop_words=['english', None],
    tvec__ngram_range=[(1, 2), (1, 1), (1, 20)],
)

# 5-fold cross-validated search for the Multinomial / TfidfVectorizer pipeline
gs_tvec_nb_mn = GridSearchCV(
    estimator=pipe_tvec_nb_mn,
    param_grid=pipe_tvec_nb_mn_params,
    cv=5,
)


# Support vector classifier on TF-IDF features (regularisation fixed at C=750)
pipe_tvec_svc = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('svc', SVC(C=750)),
])

# Vectorizer settings to search for the SVC pipeline
pipe_tvec_svc_params = dict(
    tvec__max_features=[100, 500, 1000, 5000, 10000],
    tvec__stop_words=['english', None],
    tvec__ngram_range=[(1, 2), (1, 1), (1, 20), (1, 5), (1, 3)],
)

# 5-fold cross-validated search for the SVC / TfidfVectorizer pipeline
gs_tvec_svc = GridSearchCV(
    estimator=pipe_tvec_svc,
    param_grid=pipe_tvec_svc_params,
    cv=5,
)
    

In [292]:
# Hold out a test set, stratified on y so both splits keep the class balance;
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [320]:
# Fit the LogisticRegression / CountVectorizer grid search on the training split
gs_cvec.fit(X_train, y_train)

















GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [321]:
# Get the best cross-validation score found by the grid search
gs_cvec.best_score_

0.9344852411807055

In [322]:
# Get the parameter combination that produced the best cross-validation score
gs_cvec.best_params_

{'cvec__max_features': 1000,
 'cvec__ngram_range': (1, 5),
 'cvec__stop_words': 'english'}

In [323]:
# Score the grid search's best model on the training split
gs_cvec.score(X_train, y_train)

0.9740820734341252

In [324]:
# Score the grid search's best model on the held-out test split
gs_cvec.score(X_test, y_test)

0.9353448275862069

In [39]:
# Generate predictions for the test split using logistic regression and CountVectorizer
y_pred = gs_cvec.predict(X_test)

In [91]:
# Start from the observed test labels (column `subreddit_samsung`) and add the
# logistic-regression predictions next to them as `predicted`
results = y_test.to_frame().assign(predicted=y_pred)

In [98]:
# Show the distribution of the observed labels in the test split
results['subreddit_samsung'].value_counts()

1    240
0    224
Name: subreddit_samsung, dtype: int64

In [99]:
# Show the distribution of the predicted labels for comparison with the observed ones
results['predicted'].value_counts()

1    260
0    204
Name: predicted, dtype: int64

In [100]:
# Display observed vs. predicted labels side by side
results

Unnamed: 0,subreddit_samsung,predicted
274,1,1
1118,1,1
1521,0,0
258,1,1
1464,0,0
...,...,...
907,1,1
1062,1,1
969,1,1
50,1,1


In [102]:
# Attach the post titles of the test split (assignment aligns on the index labels)
results['title'] = X_test

In [111]:
# Export the logistic-regression results to a spreadsheet.
# NOTE(review): this writes the legacy .xls format, which needs an .xls writer
# engine that newer pandas releases may no longer support - confirm, or switch
# the export to .xlsx/.csv.
results.to_excel('data/results.xls')

In [104]:
# Index labels of the test rows where the prediction disagrees with the observed label
is_misclassified = results['predicted'].ne(results['subreddit_samsung'])
row_ids = results.loc[is_misclassified].index
print(row_ids)

Int64Index([ 129, 1417,  501, 1461, 1195, 1414, 1434, 1499,  758,  349, 1566,
              97,  776, 1807, 1808,  396, 1706, 1067, 1606, 1463,  144,   70,
            1193,   95, 1532,  784,  674,   15,  760, 1430],
           dtype='int64')


In [109]:
# Inspect the index labels of the results frame (carried over from the test split)
results.index

Int64Index([ 274, 1118, 1521,  258, 1464,  188,  385, 1360, 1448,  952,
            ...
            1515, 1716,  750,  446,  134,  907, 1062,  969,   50,  992],
           dtype='int64', length=464)

In [325]:
# Fit the Bernoulli Naive Bayes / CountVectorizer grid search on the training split
gs_cvec_nb_bn.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [326]:
# Best cross-validation score for the Bernoulli Naive Bayes grid search
gs_cvec_nb_bn.best_score_

0.9388048956083513

In [327]:
# Parameter combination that produced the best cross-validation score
gs_cvec_nb_bn.best_params_

{'cvec__max_features': 1000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [328]:
# Score the best Bernoulli Naive Bayes model on the training split
gs_cvec_nb_bn.score(X_train, y_train)

0.9690424766018718

In [329]:
# Score the best Bernoulli Naive Bayes model on the held-out test split
gs_cvec_nb_bn.score(X_test, y_test)

0.9504310344827587

In [154]:
# Generate predictions for the test split using Bernoulli Naive Bayes and CountVectorizer
y_pred_bn = gs_cvec_nb_bn.predict(X_test)

In [178]:
# Start from the observed test labels (column `subreddit_samsung`) and add the
# Bernoulli Naive Bayes predictions next to them as `predicted`
results_bn = y_test.to_frame().assign(predicted=y_pred_bn)

In [179]:
# Display observed vs. predicted labels for the Bernoulli Naive Bayes model
results_bn

Unnamed: 0,subreddit_samsung,predicted
274,1,1
1118,1,1
1521,0,0
258,1,1
1464,0,1
...,...,...
907,1,1
1062,1,1
969,1,1
50,1,1


In [182]:
# Remap the binary labels to subreddit names for the exported results file.
# Both the observed and the predicted column are remapped: the misclassification
# check further down compares the two columns directly, so they must share one
# encoding. Mapping only `predicted` would leave 0/1 on one side and names on the
# other, and every row would be reported as a mismatch on a clean re-run.
label_names = {0: 'apple', 1: 'samsung'}
for label_col in ['subreddit_samsung', 'predicted']:
    results_bn[label_col] = results_bn[label_col].map(label_names)

In [184]:
# Adding in title text from reddit 
results_bn['subission_title'] = X

In [185]:
# Display the results with subreddit names and post titles attached
results_bn

Unnamed: 0,subreddit_samsung,predicted,subission_title
274,samsung,samsung,Fixing small scratch?
1118,samsung,samsung,Wireless headphones Advice
1521,apple,apple,Wall Street is underestimating how much money ...
258,samsung,samsung,Samsung members app broken
1464,apple,samsung,Just posting my review here as well.
...,...,...,...
907,samsung,samsung,Thought the cityscape feature on the photo opt...
1062,samsung,samsung,Samsung Galaxy S11: official presentation on F...
969,samsung,samsung,"Case that covers the ""chin"" on the a20?"
50,samsung,samsung,Buying Samsung product (Galaxy Watch Active 2)...


In [186]:
# Save the Bernoulli Naive Bayes results to a spreadsheet.
# NOTE(review): this writes the legacy .xls format, which needs an .xls writer
# engine that newer pandas releases may no longer support - confirm, or switch
# the export to .xlsx/.csv.
results_bn.to_excel('data/results_bn.xls')

In [334]:
# Index labels of the test rows where the Naive Bayes prediction disagrees with the observed label
is_misclassified_bn = results_bn['predicted'].ne(results_bn['subreddit_samsung'])
row_ids = results_bn.loc[is_misclassified_bn].index
print(row_ids)

Int64Index([1464,  129, 1417,  501, 1195, 1434,  857,  101, 1566,  405,   97,
            1807, 1808, 1706, 1463,  144, 1649,   95, 1451,  674, 1833,  760,
            1430],
           dtype='int64')


In [316]:
# Fit the SVC / TfidfVectorizer grid search on the training split
gs_tvec_svc.fit(X_train, y_train)





















GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [330]:
# Evaluate the SVC grid search: parameter combination with the best cross-validation score
gs_tvec_svc.best_params_

{'tvec__max_features': 5000,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

In [331]:
# Best cross-validation score for the SVC grid search
gs_tvec_svc.best_score_

0.9380849532037437

In [332]:
# Score the best SVC model on the training split
gs_tvec_svc.score(X_train, y_train)

0.9820014398848093

In [333]:
# Score the best SVC model on the held-out test split
gs_tvec_svc.score(X_test, y_test)

0.9504310344827587