In [2]:
import sys
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.corpora import WikiCorpus

in_f = "enwiki-latest-pages-articles1.xml-p1p30303.bz2"
print("Working on", in_f)
wiki = WikiCorpus(in_f)
print("Got the text corpus")
tokenized_sentences=list(wiki.get_texts())

Working on enwiki-latest-pages-articles1.xml-p1p30303.bz2
Got the text corpus


In [4]:
import nltk
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

warnings.filterwarnings('ignore')
nltk.download('punkt') # for tokenizing the text to sentences
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [10]:
print(len(tokenized_sentences))

### remove noise ###
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
lem = WordNetLemmatizer()
filtered_sentences = []
i = 0
for sent in tokenized_sentences[:2000]:
  curr_sent = []
  for w in sent:
    if w not in stop_words: # remove the common words
        # sentences.append(ps.stem(w)) # reduce words to their roots (stemming) the result may not be an english word
        curr_sent.append(lem.lemmatize(w,pos="v")) # reduce words to their roots (lemmatization) the result is an english word
  if len(curr_sent) > 0:
    filtered_sentences.append(sent)      

filtered_sentences = list(map(' '.join, filtered_sentences))       
print(len(filtered_sentences))

14872
5000


In [6]:
warnings.filterwarnings('ignore')
nltk.download('vader_lexicon') # for sentiment analyzer
  
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\atefe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
#print(filtered_sentences[0])
tokenized_sent=sent_tokenize(filtered_sentences[0])
print(len(tokenized_sent))
sentiment = sia.polarity_scores(filtered_sentences[0])
for k in sorted(sentiment):
    print('{0}: {1}, '.format(k, sentiment[k]), end='')

1
compound: 0.996, neg: 0.099, neu: 0.791, pos: 0.11, 

In [17]:
sentiment = [sia.polarity_scores(s) for s in filtered_sentences]
print(len(sentiment))
#for k in sorted(sentiment):
#    print('{0}: {1}, '.format(k, sentiment[k]), end='')

5000


In [19]:
s = sentiment[10]
for k in sorted(s):
    print('{0}: {1}, '.format(k, s[k]), end='')

compound: 1.0, neg: 0.04, neu: 0.755, pos: 0.204, 

In [28]:
#for ind, sent in enumerate(sentiment):
#    print('Sentence {0}: {1},'.format(ind, sent['neu']), end='\n')
from statistics import mean

print("Average of the neutral scores:", mean([sent['neu'] for sent in sentiment]))
print("Average of the positive scores:", mean([sent['pos'] for sent in sentiment]))
print("Average of the negative scores:", mean([sent['neg'] for sent in sentiment]))
print("Average of the compound scores:", mean([sent['compound'] for sent in sentiment]))

Average of the neutral scores: 0.8721908
Average of the positive scores: 0.0757996
Average of the negative scores: 0.0520086
Average of the compound scores: 0.43968298


In [66]:
#articles = [(article, sia.polarity_scores(article)) for article in filtered_sentences]
articles = []
for article in filtered_sentences:
    sentiment = sia.polarity_scores(article)
    if sentiment['pos'] > sentiment['neg']:
        articles.append((article, 'pos'))
    else:
        articles.append((article, 'neg'))    

In [69]:
from nltk.tokenize import word_tokenize 

all_words_set = set(word.lower() for passage in articles for word in word_tokenize(passage[0]))
all_words_list = list(word.lower() for passage in articles for word in word_tokenize(passage[0]))
#t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]

In [70]:
print(len(all_words_set), len(all_words_list))
filtered_words_list = []
for w in all_words_list:
    if w not in stop_words: # remove the common words
        filtered_words_list.append(lem.lemmatize(w,pos="v"))
print(len(all_words_set), len(filtered_words_list))


290316 16235263
290316 10066509


In [71]:
word_features = [word for (word,_) in (nltk.FreqDist(filtered_words_list)).most_common(5000)]
word_list = (nltk.FreqDist(filtered_words_list)).most_common(5000)
fdist = nltk.FreqDist(filtered_words_list)
print(word_list[:30])
# import matplotlib.pyplot as plt
# fdist.plot(10,cumulative=False)
# plt.show()

[('use', 51374), ('also', 43960), ('one', 36934), ('first', 32916), ('state', 31753), ('include', 28975), ('time', 27156), ('new', 27062), ('two', 26357), ('make', 23654), ('american', 21972), ('may', 21893), ('th', 20592), ('many', 20166), ('world', 19165), ('become', 18814), ('work', 18795), ('number', 18244), ('know', 17755), ('would', 17596), ('form', 17395), ('name', 17247), ('call', 16745), ('war', 15788), ('years', 15726), ('city', 15556), ('write', 15234), ('part', 15120), ('take', 15020), ('unite', 14672)]


In [72]:
from random import shuffle
shuffle(articles)
training_articles = articles[:4000]
test_articles = articles[4000:]

In [73]:
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

sa = SentimentAnalyzer()
unigram_features = sa.unigram_word_feats(word_features)
sa.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)

training_set = sa.apply_features(training_articles)
test_set = sa.apply_features(test_articles)

In [None]:
from nltk.classify import NaiveBayesClassifier as NBC
classfier = sa.train(NBC.train, training_set)

Training classifier


In [68]:
print((articles[100])[1])

neg
