# Webscraping Reddit
## Sentiment Analysis in Product Subreddits

### Packages used:
* PRAW (Python Reddit API Wrapper)
* VADER (Valence Aware Dictionary and sEntiment Reasoner)
* NLTK (Natural Language Toolkit)

### Links used:
* [PRAW tutorial](https://towardsdatascience.com/scraping-reddit-with-praw-76efc1d1e1d9)
* [PRAW doc](https://praw.readthedocs.io/en/latest/index.html)
* [VADER github](https://github.com/cjhutto/vaderSentiment)

In [None]:
import praw
import pandas as pd
import time
from datetime import datetime as dt

In [None]:
# Create the PRAW client. All credentials are stored in the
# praw.ini file, so none are passed here.
reddit = praw.Reddit()

# Print the result of reddit.user.me() as a quick sanity check
# (presumably the logged-in account -- confirm against the PRAW docs).
print(reddit.user.me())


In [None]:
# Subreddit handles for the big phone/tablet manufacturers whose
# sentiment is investigated first.
samsung, apple, google = (
    reddit.subreddit(sub_name)
    for sub_name in ('samsung', 'iphone', 'googlepixel')
)
            

In [None]:
# Top submissions of the last year for each subreddit.
# All three requests share the same listing options.
top_opts = {'time_filter': 'year', 'limit': 1000}
s_top = samsung.top(**top_opts)
a_top = apple.top(**top_opts)
g_top = google.top(**top_opts)

In [None]:
# PRAW hands back each post as a submission object exposing title, score
# (upvotes), creation time, author, etc. Gather those fields column by
# column, plus a company label, so the dict maps straight onto a DataFrame.
manuf = {
    column: []
    for column in (
        "company", "score", "datetime", "author",
        "title", "selftext", "permalink",
    )
}

# Walk the three listings in step with their company labels.
gener_list = [s_top, a_top, g_top]
for name, subreddit in zip(('samsung', 'apple', 'google'), gener_list):
    for submission in subreddit:
        manuf['company'].append(name)
        manuf["title"].append(submission.title)
        manuf["score"].append(submission.score)
        # created_utc is converted to a naive datetime expressed in UTC.
        manuf["datetime"].append(dt.utcfromtimestamp(submission.created_utc))
        manuf["author"].append(submission.author)
        # Posts without a body are stored as a single space, not "".
        body = submission.selftext
        manuf["selftext"].append(" " if body == "" else body)
        manuf["permalink"].append("https://www.reddit.com" + submission.permalink)
        # Short pause between submissions.
        time.sleep(0.1)

    # Longer pause before moving on to the next subreddit listing.
    time.sleep(5)
        
    

In [None]:
# Turn the collected columns into a DataFrame and save it to CSV;
# index=False keeps the row index out of the file.
manuf_df = pd.DataFrame(manuf)
manuf_df.to_csv("./three_companies.csv", index=False)

In [None]:
# Reload the scraped posts from disk.
# By default read_csv converts text such as "NA", "N/A" or "null" into a
# float NaN. A post whose title or body is exactly one of those strings
# would then break the later `title + " " + selftext` concatenation.
# Turn the default NA strings off and only treat an empty author field
# (how a missing author is written to the CSV) as missing.
manuf_df = pd.read_csv(
    "./three_companies.csv",
    keep_default_na=False,
    na_values={"author": [""]},
)

# display head
manuf_df.head()

In [None]:
# VADER scores one string per post. Submissions do not always have a text
# body, so join title and selftext into a single string for each post.
title_text_zip = zip(manuf_df.title, manuf_df.selftext)
title_text = [heading + " " + body for heading, body in title_text_zip]


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create the VADER analyzer that conducts the sentiment analysis.
# polarity_scores() returns a dictionary with a positive, negative,
# neutral, and compound score for each text.
vder = SentimentIntensityAnalyzer()
resp = [vder.polarity_scores(text) for text in title_text]

# Preview the first results. Slicing instead of indexing resp[0..9] avoids
# an IndexError when fewer than 10 posts were scored.
for analysis in resp[:10]:
    print(analysis)

In [None]:
# VADER returns four scores per text:
# neg(ative), neu(tral), pos(itive), and compound.
# compound is the net sentiment of a post:
# positive if compound >= .05, negative if compound <= -.05, neutral otherwise.

# Collect the label and the compound score for every post.
vader_anlys = {
    "sentiment" : [],
    "compound"  : []
}

for scores in resp:
    compound = scores['compound']
    if compound >= 0.05:
        label = "positive"
    elif compound <= -0.05:
        label = "negative"
    else:
        label = "neutral"
    vader_anlys["sentiment"].append(label)
    vader_anlys["compound"].append(compound)

# Attach both columns to the posts DataFrame.
manuf_df["sentiment"] = vader_anlys["sentiment"]
manuf_df["compound"] = vader_anlys["compound"]

manuf_df.head(5)

In [None]:
import matplotlib.pyplot as plt

# Count the positive/negative/neutral posts per phone manufacturer.
# groupby + count gives one count per (company, sentiment) pair; unstack()
# pivots the sentiment level into columns, leaving one row per company.
manuf_sentiment = manuf_df.groupby(['company', 'sentiment']).sentiment.count().unstack()  
manuf_sentiment.head()

In [None]:
# Bar chart of the per-company sentiment counts.
manuf_sentiment.plot(kind='bar')  

In [None]:
# The same approach can be used for a specific analysis of one subreddit.

# Analyze DirecTVNow using the top and controversial posts of the last
# year plus the hot posts, for more data points.
dtvnow = reddit.subreddit('DirecTVNow')
year_opts = {'time_filter': 'year', 'limit': 1000}
dtv_top = dtvnow.top(**year_opts)
dtv_cont = dtvnow.controversial(**year_opts)
dtv_hot = dtvnow.hot(limit=1000)

In [None]:
# PRAW hands back each post as a submission object exposing title, score
# (upvotes), creation time, author, etc. Gather those fields column by
# column so the dict maps straight onto a DataFrame.
dtvdata = {
    column: []
    for column in (
        "score", "datetime", "author",
        "title", "selftext", "permalink",
    )
}
# The three listings can overlap: remember the ids already stored and
# record every repeat that gets skipped.
id_set = set()
dup_id = []

for filt in (dtv_top, dtv_cont, dtv_hot):
    # Pause before starting each listing.
    time.sleep(5)
    for submission in filt:
        post_id = submission.id
        if post_id in id_set:
            dup_id.append(post_id)
        else:
            id_set.add(post_id)
            dtvdata["title"].append(submission.title)
            dtvdata["score"].append(submission.score)
            # created_utc is converted to a naive datetime expressed in UTC.
            dtvdata["datetime"].append(dt.utcfromtimestamp(submission.created_utc))
            dtvdata["author"].append(submission.author)
            # Posts without a body are stored as a single space, not "".
            body = submission.selftext
            dtvdata["selftext"].append(" " if body == "" else body)
            dtvdata["permalink"].append("https://www.reddit.com" + submission.permalink)
        # Short pause after every submission, duplicate or not.
        time.sleep(0.1)

In [None]:
# Convert the collected columns into a pandas DataFrame.
dtv_df = pd.DataFrame(dtvdata)

# for key in dtvdata.keys():
#     print(key, len(dtvdata[key]))
# Number of submissions skipped because their id had already been stored.
print(len(dup_id))


In [None]:
# dtv_df
# Save the DataFrame to a CSV file; index=False keeps the row index
# out of the file.
dtv_df.to_csv("./dtvnow.csv", index=False)

In [None]:
# Reload the scraped posts from disk.
# By default read_csv converts text such as "NA", "N/A" or "null" into a
# float NaN. A post whose title or body is exactly one of those strings
# would then break the later `title + " " + selftext` concatenations.
# Turn the default NA strings off and only treat an empty author field
# (how a missing author is written to the CSV) as missing.
dtv_df = pd.read_csv(
    "./dtvnow.csv",
    keep_default_na=False,
    na_values={"author": [""]},
)

# Display some of the results we got

dtv_df.head(5)


In [None]:
# VADER scores one string per post. Submissions do not always have a text
# body, so join title and selftext into a single string for each post.
title_text_zip = zip(dtv_df.title, dtv_df.selftext)
title_text = [heading + " " + body for heading, body in title_text_zip]

# title_text

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create the VADER analyzer and run the sentiment analysis on every
# combined title/body text, keeping one score dictionary per post.
vder = SentimentIntensityAnalyzer()
resp = [vder.polarity_scores(text) for text in title_text]


In [None]:
# Sort the VADER results into sentiment labels using the compound score:
# positive if compound >= .05, negative if compound <= -.05, neutral otherwise.

# Collect the label and the compound score for every post.
vader_anlys = {
    "sentiment" : [],
    "compound"  : [],
}

for scores in resp:
    compound = scores['compound']
    if compound >= 0.05:
        label = "positive"
    elif compound <= -0.05:
        label = "negative"
    else:
        label = "neutral"
    vader_anlys["sentiment"].append(label)
    vader_anlys["compound"].append(compound)

# Attach both columns to the posts DataFrame.
dtv_df["sentiment"] = vader_anlys["sentiment"]
dtv_df["compound"] = vader_anlys["compound"]


In [None]:
# Show the first 10 rows, now including the sentiment and compound columns.
dtv_df.head(10)

In [None]:
# Export the labelled data: the full table first, then the negative-only
# and positive-only subsets, each to its own CSV file.
dtv_df.to_csv("./dtvnow_with_sentiment.csv", index=False)

negative_rows = dtv_df[dtv_df['sentiment'] == 'negative']
negative_rows.to_csv("./dtvnow_negative_sentiment.csv", index=False)

positive_rows = dtv_df[dtv_df['sentiment'] == 'positive']
positive_rows.to_csv("./dtvnow_positive_sentiment.csv", index=False)

In [None]:
import matplotlib.pyplot as plt
# Count the posts per sentiment label, print the counts, then draw the
# same counts as a pie chart labelled with whole-number percentages.
sentiment_counts = dtv_df['sentiment'].value_counts()
print(sentiment_counts)

fig, ax = plt.subplots()
sentiment_counts.plot(ax=ax, kind='pie', autopct='%1.0f%%')

In [None]:
# Let's now take a look at the positives and negative
# posts and see if we can find any specific key words

# we can use nltk for this
import collections
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
# from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import nltk

# Keep only the posts labelled negative, and only those labelled positive.
is_negative = dtv_df['sentiment'] == 'negative'
is_positive = dtv_df['sentiment'] == 'positive'
neg_df = dtv_df[is_negative]
pos_df = dtv_df[is_positive]

# Join each (title, text) pair into one "title text" string.
def title_text(zip_gen):
    """Return a list with one ``title + " " + text`` string per pair.

    *zip_gen* is any iterable of (title, text) string pairs, for example
    ``zip(df.title, df.selftext)``.
    """
    return [ti + " " + tx for ti, tx in zip_gen]

# Build the combined "title selftext" strings for each sentiment group.
neg_txt = title_text(zip(neg_df.title, neg_df.selftext))
pos_txt = title_text(zip(pos_df.title, pos_df.selftext))

# Print the first negative text as a spot check.
print(neg_txt[0])

In [None]:
# Use nltk to strip stop words so the word frequencies are clearer.
def remove_stop_wds(text_list):
    """Tokenize every string in *text_list* in place, dropping stop words.

    Each entry is lower-cased, split into ``\\w+`` tokens and filtered
    against nltk's English stop-word list. The string at each position is
    replaced by its list of remaining tokens. Nothing is returned.
    """
    # regex found here: https://gist.github.com/ameyavilankar/10347201
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    for position, raw_text in enumerate(text_list):
        tokens = tokenizer.tokenize(raw_text.lower())
        text_list[position] = [tok for tok in tokens if tok not in stop_words]
    
# Tokenize both text lists in place: remove_stop_wds replaces every string
# with its list of lower-cased, stop-word-free tokens.
remove_stop_wds(neg_txt)
remove_stop_wds(pos_txt)

# Print the first negative entry, now a token list, as a spot check.
print(neg_txt[0])

In [None]:
# Count how often each word occurs across all token lists and return the
# result as a pandas DataFrame sorted from most to least frequent.
def word_freq_df(text_list):
    """Return a word-frequency DataFrame for *text_list*.

    *text_list* is an iterable of token lists (one list of words per post).
    The result has the columns ``word`` and ``frequency``, one row per
    distinct word, sorted by ``frequency`` in descending order. An empty
    *text_list* yields an empty DataFrame with the same two columns.
    """
    # Counter.update() adds the occurrences from each token list to the
    # running totals, replacing the hand-written dictionary merge.
    frq = collections.Counter()
    for txt in text_list:
        frq.update(txt)

    w_f = {
        'word'      : list(frq.keys()),
        'frequency' : list(frq.values())
    }

    # mergesort is stable, so words with equal counts keep a deterministic
    # order instead of depending on the default sort implementation.
    frequency_df = pd.DataFrame(w_f).sort_values(
        by=['frequency'], ascending=False, kind='mergesort'
    )
    return frequency_df
    
# Word frequencies for the negative and the positive posts.
neg_df = word_freq_df(neg_txt)
pos_df = word_freq_df(pos_txt)

# Bar charts of the 20 most frequent words in each group; both charts
# share the same plot options and differ only in their title.
bar_opts = dict(x='word', y='frequency', legend=False, rot=50)

ngx = neg_df.head(20).plot.bar(title='Negative Sentiment Word Frequency', **bar_opts)
ngx.set_ylabel('frequency')

psx = pos_df.head(20).plot.bar(title='Positive Sentiment Word Frequency', **bar_opts)
psx.set_ylabel('frequency')

In [None]:
# Train our own model on this data: an SVM fitted on an 80-20 split of
# the positive and negative posts.

# Prepare the data: per-label totals and one combined text column.
totals = dtv_df['sentiment'].value_counts()
dtv_df['titxt'] = dtv_df['title'] + " " + dtv_df['selftext']

# Shuffle the rows. train_test_split is deliberately not used, so that
# positive and negative posts are each split 80/20 on their own.
dtv_df = dtv_df.sample(frac = 1.0)

# Cut each label's shuffled rows at 80% of that label's total.
positives = dtv_df[dtv_df['sentiment'] == 'positive']
negatives = dtv_df[dtv_df['sentiment'] == 'negative']
pos_cut = int(totals['positive']*0.80)
neg_cut = int(totals['negative']*0.80)

postrain, postest = positives[:pos_cut], positives[pos_cut:]
negtrain, negtest = negatives[:neg_cut], negatives[neg_cut:]

# Neutral posts end up in neither set.
train = pd.concat([postrain, negtrain])
test = pd.concat([postest, negtest])

dtv_df.head(5)

In [None]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Create feature vectors with a tf-idf vectorizer: features are built from
# 1- to 3-word n-grams, with English stop words removed.
stop_words = set(stopwords.words('english'))
# scikit-learn documents stop_words as 'english', a list, or None, and
# recent releases reject other types such as a set. Pass a list instead
# (sorted so the order is stable between runs).
vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3),
                             stop_words=sorted(stop_words))

# tf-idf builds the feature vectors from term frequency, but weights
# terms down to lessen bias/skew.

# Fit the vocabulary on the training text only, then reuse that fitted
# vectorizer to transform the test text.
train_vectors = vectorizer.fit_transform(train['titxt'])
test_vectors = vectorizer.transform(test['titxt'])

# Perform classification with a linear-kernel SVM,
# trying several C levels depending on label noise.
for c in [1, 1.01, 1.05, 1.1, 1.15, 1.5, 2, 5, 10]:
    classifier_linear = svm.SVC(kernel='linear', C=c)

    # Time the fit and the prediction separately.
    t0 = time.time()
    classifier_linear.fit(train_vectors, train['sentiment'])
    t1 = time.time()
    prediction_linear = classifier_linear.predict(test_vectors)
    t2 = time.time()
    time_linear_train = t1 - t0
    time_linear_predict = t2 - t1

    # results
    print(f"Training time: {time_linear_train}s; Prediction time: {time_linear_predict}s; C: {c}")

    # output_dict=True returns the report as a dict keyed by class label.
    report = classification_report(test['sentiment'], prediction_linear, output_dict=True)
    print('positive: ', report['positive'])
    print('negative: ', report['negative'])