In [13]:
import pandas as pd 
from collections import Counter
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from nltk.sentiment import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andywu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/andywu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
scrape_output = pd.read_csv("scrape_output.csv")

In [4]:
posts = scrape_output['Content']
authors = scrape_output['Author']
urls = scrape_output['URL']

In [21]:
print("len(posts)", len(posts))

len(posts) 187641


In [8]:
stop_words = set(stopwords.words('english'))

 
# Use this to read file content as a stream:
count_words = []
for word in " ".join(map(str, posts)).split():
    word = word.lower()
    if word not in stop_words:
        count_words.append(word)
Counter(count_words).most_common(10)

[(';', 50936),
 ('like', 22418),
 ('people', 21298),
 ('get', 20257),
 ('would', 20227),
 ('one', 19375),
 ('good', 17831),
 ('top', 14850),
 ('think', 14588),
 ('even', 13779)]

In [9]:
count_authors = []
for author in " ".join(map(str, authors)).split():
    author = author.lower()
    count_authors.append(author)
Counter(count_authors).most_common(10)

[('nan', 475),
 ('marginal', 327),
 ('revolution', 327),
 ('ejmrbear', 190),
 ('karl', 172),
 ('kirk', 134),
 ('eurovision', 130),
 ('1c08', 113),
 ('2c5d', 91),
 ('30e6', 76)]

In [12]:
# https://realpython.com/python-nltk-sentiment-analysis/
fd = nltk.FreqDist(count_words)
fd.most_common(5)
fd.tabulate(5)

     ;   like people    get  would 
 50936  22418  21298  20257  20227 


In [11]:
fd = nltk.FreqDist(count_authors)
fd.most_common(5)
fd.tabulate(5)

       nan   marginal revolution   ejmrbear       karl 
       475        327        327        190        172 


In [25]:
fd["anyone"]

14

In [14]:
'''
Vader Sentiment Analysis 
'''
sia = SentimentIntensityAnalyzer()

In [18]:
def is_positive(word: str) -> bool:
    """True if word has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(word)["compound"] > .5

def is_negative(word: str) -> bool:
    """True if word has negative compound sentiment, False otherwise."""
    return sia.polarity_scores(word)["compound"] < -.5

positive_words = []
negative_words = []

for word in " ".join(map(str, posts)).split():
    word = word.lower()

    if word not in stop_words:
        if is_positive(word):
            positive_words.append(word)
        elif is_negative(word):
            negative_words.append(word)
print("negative words: ", Counter(negative_words).most_common(10))
print("positive words: ", Counter(positive_words).most_common(10))

negative words:  [('bad', 4585), ('failed', 1087), ('hate', 979), ('worst', 949), ('rejected', 935), ('stupid', 792), ('negative', 740), ('bad.', 696), ('fail', 681), ('fraud', 665)]
positive words:  [('best', 5431), ('great', 4611), ('kind', 2476), ('strong', 2247), ('love', 1993), ('free', 1700), ('happy', 1185), ('super', 988), ('confidence', 697), ('trust', 685)]


In [22]:
def is_positive_sentiment_sentence(sentence):
    return sia.polarity_scores(sentence)["compound"] > .5
    # print("{:-<40} {}".format(sentence, str(snt)))

def is_negative_sentiment_sentence(sentence):
    return sia.polarity_scores(sentence)["compound"] < -.5


positive_authors = []
negative_authors = []
for index, post in enumerate(posts):
    if is_positive_sentiment_sentence(str(post)):
        positive_authors.append(authors[index])
    elif is_negative_sentiment_sentence(str(post)):
        negative_authors.append(authors[index])
print("negative authors: ", Counter(negative_authors).most_common(10))
print("positive authors: ", Counter(positive_authors).most_common(10))

negative authors:  [(nan, 63), ('Marginal Revolution', 57), ('e4c8', 21), ('Kirk', 17), ('5093', 16), ('f24e', 15), ('5c03', 15), ('ded9', 15), ('c7ab', 14), ('480c', 14)]
positive authors:  [('Marginal Revolution', 159), (nan, 126), ('3f50', 48), ('Kirk', 38), ('54de', 38), ('Karl', 31), ('EJMRBear', 24), ('1f11', 24), ('42c7', 22), ('da9d', 21)]
