In [33]:
import pandas as pd
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

"""
All the news: https://www.kaggle.com/snapcrack/all-the-news
"""
nltk.download('punkt')
nltk.download('vader_lexicon')

tokenizer = PunktSentenceTokenizer()
classifier = SentimentIntensityAnalyzer()

SEED=42
# load content from csv
articles = (pd.read_csv('all-the-news/articles1.csv').append(pd.read_csv('all-the-news/articles2.csv'))).sample(2000, random_state=SEED)
content = articles['content']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [35]:
import nltk
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

warnings.filterwarnings('ignore')
nltk.download('punkt') # for tokenizing the text to sentences
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
import re
from nltk.tokenize.punkt import PunktSentenceTokenizer
tokenized_sentences = []
for ind in content.index: 
    letter_only = re.sub("[\d-]",  # Search for all numbers
                          " ",          # Replace all non-letters with spaces
                          str(content[ind]))
    for s in sent_tokenize(letter_only):
        tokenized_sentences.append(s)
print(tokenized_sentences[0])
print(len(tokenized_sentences))

The unspeakable atrocity in Orlando last Sunday has opened the floodgates to a torrent of heated discussion about the reasons behind America’s deadliest mass shooting.
64528


In [37]:
from random import sample
rand_sentences = sample(tokenized_sentences, 15000)

In [38]:
from nltk.tokenize import word_tokenize
print(len(rand_sentences))
### remove noise ###
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
lem = WordNetLemmatizer()
filtered_sentences = []
i = 0
for sent in rand_sentences:
  curr_sent = []
  t_word = word_tokenize(sent)
  for w in t_word:
    if w not in stop_words and w.isalpha(): # remove the common words
        # sentences.append(ps.stem(w)) # reduce words to their roots (stemming) the result may not be an english word
        curr_sent.append(lem.lemmatize(w,pos="v")) # reduce words to their roots (lemmatization) the result is an english word
  if len(curr_sent) > 0:
    filtered_sentences.append(curr_sent)      

filtered_sentences = list(map(' '.join, filtered_sentences))       
print(len(filtered_sentences))

15000
14750


In [39]:
print(filtered_sentences[0])
warnings.filterwarnings('ignore')
nltk.download('vader_lexicon') # for sentiment analyzer
  
sia = SentimentIntensityAnalyzer()

It start organically


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [40]:
sentiment = sia.polarity_scores(filtered_sentences[0])
for k in sorted(sentiment):
    print('{0}: {1}, '.format(k, sentiment[k]), end='')

compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 

In [41]:
sentiment = [sia.polarity_scores(s) for s in filtered_sentences]
print(len(sentiment))

14750


In [42]:
from statistics import mean

print("Average of the neutral scores:", mean([sent['neu'] for sent in sentiment]))
print("Average of the positive scores:", mean([sent['pos'] for sent in sentiment]))
print("Average of the negative scores:", mean([sent['neg'] for sent in sentiment]))
print("Average of the compound scores:", mean([sent['compound'] for sent in sentiment]))

Average of the neutral scores: 0.7592887457627119
Average of the positive scores: 0.13163077966101694
Average of the negative scores: 0.10860284745762712
Average of the compound scores: 0.03515086779661017


In [43]:
articles = []
sum = 0
for article in filtered_sentences:
    sentiment = sia.polarity_scores(article)
    if sentiment['compound'] < 0.0:
        sum += 1
    if sentiment['pos'] > sentiment['neg']:
        articles.append((article, 'pos'))
    else:
        articles.append((article, 'neg'))    
print(sum)

4573


In [44]:
sum = 0
for (article, cat) in articles:
  if cat == 'pos':
    sum += 1
print(sum)

5701


In [45]:
from nltk.tokenize import word_tokenize 

all_words_set = set(word.lower() for passage in articles for word in word_tokenize(passage[0]))
all_words_list = list(word.lower() for passage in articles for word in word_tokenize(passage[0]))

In [46]:
print(len(all_words_set), len(all_words_list))
filtered_words_list = []
for w in all_words_list:
    if w not in stop_words: # remove the common words
        filtered_words_list.append(lem.lemmatize(w,pos="v"))
print(len(all_words_set), len(filtered_words_list))

19137 178928
19137 166692


In [47]:
word_features = [word for (word,_) in (nltk.FreqDist(filtered_words_list)).most_common(10000)]
word_list = (nltk.FreqDist(filtered_words_list)).most_common(10000)
fdist = nltk.FreqDist(filtered_words_list)
print(word_list[-30:])
# import matplotlib.pyplot as plt
# fdist.plot(10,cumulative=False)
# plt.show()

[('scopes', 2), ('townhall', 2), ('antiquities', 2), ('singh', 2), ('fresco', 2), ('micromanage', 2), ('booty', 2), ('enrollment', 2), ('shawn', 2), ('stilettos', 2), ('espionage', 2), ('mo', 2), ('auckland', 2), ('civility', 2), ('hoax', 2), ('gays', 2), ('lecturer', 2), ('ivanka', 2), ('onions', 2), ('legion', 2), ('inversion', 2), ('spokesmen', 2), ('superpowers', 2), ('cloture', 2), ('legend', 2), ('aroldis', 2), ('chapman', 2), ('kemp', 2), ('liberalize', 2), ('adjustments', 2)]


In [48]:
from random import shuffle
shuffle(articles)
training_articles = articles[:int(0.8*len(articles))]
test_articles = articles[int(0.8*len(articles)):]
print(len(articles), len(training_articles), len(test_articles))

14750 11800 2950


In [49]:
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

sa = SentimentAnalyzer()
unigram_features = sa.unigram_word_feats(word_features)
sa.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)

training_set = sa.apply_features(training_articles)
test_set = sa.apply_features(test_articles)

In [60]:
from nltk.classify import NaiveBayesClassifier as NBC
classifier = sa.train(NBC.train, training_set)

Training classifier


In [46]:
for key, value in sorted(sa.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.595391392748221
F-measure [neg]: 0.6602162777461582
F-measure [pos]: 0.5000000000000001
Precision [neg]: 0.6819517930629042
Precision [pos]: 0.4776
Recall [neg]: 0.6398234969663541
Recall [pos]: 0.5246045694200352


# Benchmark

In [28]:
import pandas as pd
# pd.read_csv('stanfordSentimentTreebank/datasetSentences.txt', sep='\t')
phrases = pd.read_csv('stanfordSentimentTreebank/dictionary.txt', sep='|', names=['phrase', 'phrase ids'])
sentiment = pd.read_csv('stanfordSentimentTreebank/sentiment_labels.txt', sep='|', usecols=['sentiment values']).astype(float)
sentiment['labels'] = sentiment['sentiment values'].apply(lambda x: 'pos' if x >= 0.50000 else 'neg')
benchmark = phrases.join(sentiment, on='phrase ids', lsuffix="l")[['phrase', 'labels']]
#benchmark['sentiment values'] = benchmark['sentiment values'].apply(sst_threshold)
benchmark.columns = ['sentences', 'labels']
#loader = package_data(benchmark, tokenizer, max_input_length, BATCH_SIZE)

In [53]:
tokenized_benchmark = []
for ind in (benchmark['sentences']).index: 
    letter_only = re.sub("[\d-]",  # Search for all numbers
                          " ",          # Replace all non-letters with spaces
                          str((benchmark['sentences'])[ind]))
    #for s in sent_tokenize(letter_only):
    tokenized_benchmark.append(letter_only)
print(tokenized_benchmark[0])
print(len(tokenized_benchmark))

!
239232


In [54]:
### remove noise ###
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
lem = WordNetLemmatizer()
filtered_benchmark = []
i = 0
for sent in tokenized_benchmark:
  curr_sent = []
  t_word = word_tokenize(sent)
  for w in t_word:
    if w not in stop_words and w.isalpha(): # remove the common words
        # sentences.append(ps.stem(w)) # reduce words to their roots (stemming) the result may not be an english word
        curr_sent.append(lem.lemmatize(w,pos="v")) # reduce words to their roots (lemmatization) the result is an english word
  if len(curr_sent) > 0:
    filtered_benchmark.append(curr_sent)      

filtered_benchmark = list(map(' '.join, filtered_benchmark))       
print(len(filtered_benchmark))

237141


In [58]:
articles = []
sum = 0
for article in filtered_benchmark:
    sentiment = sia.polarity_scores(article)
    if sentiment['compound'] < 0.0:
        sum += 1
    if sentiment['pos'] > sentiment['neg']:
        articles.append((article, 'pos'))
    else:
        articles.append((article, 'neg'))    
print(sum)

43063


In [62]:
test_set_benchmark = sa.apply_features(articles[:10000])

In [63]:
for key, value in sorted(sa.evaluate(test_set_benchmark).items()):
    print('{0}: {1}'.format(key, value))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.6094
F-measure [neg]: 0.731509485839978
F-measure [pos]: 0.2835656639765224
Precision [neg]: 0.6130890655605484
Precision [pos]: 0.5851627554882665
Recall [neg]: 0.9066280456636565
Recall [pos]: 0.18712176228516098
