In [4]:
import re
import nltk

In [5]:
# Preparatory steps - must be executed only once
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acohal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\acohal\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:
FILE_PATH = "input\\miracle_in_the_andes.txt"

# Load the book

In [7]:
with open(FILE_PATH, "r", encoding='utf-8') as file:
    book = file.read()

# The most used words (non-articles)

In [4]:
pattern = re.compile("[a-zA-z]+")
findings = re.findall(pattern, book.lower())
dictionary_words = {}
for word in findings:
    if word in dictionary_words:
        dictionary_words[word] = dictionary_words[word] + 1
    else:
        dictionary_words[word] = 1
list_words = [(value, key) for (key, value) in dictionary_words.items()]
list_words = sorted(list_words, reverse=True)        
list_words[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

In [8]:
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")
english_stopwords[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [9]:
filtered_words = []
for count, word in list_words:
    if word not in english_stopwords:
        filtered_words.append(word)
filtered_words[:10]

['would',
 'us',
 'said',
 'roberto',
 'could',
 'one',
 'snow',
 'mountain',
 'time',
 'like']

# Sentiment analysis: How positive and how negative is each chapter

In [16]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [17]:
pattern = re.compile("Chapter [0-9]+")
chapters = re.split(pattern, book)
chapters = chapters[1:]

In [18]:
analyzer = SentimentIntensityAnalyzer()

In [19]:
for number, chapter in enumerate(chapters):
    scores = analyzer.polarity_scores(chapter)
    print("Chapter", number+1)
    print("\tNegative score:", scores["neg"])
    print("\tNeutral score:", scores["neg"])
    print("\tPositive score:", scores["neg"])

    if scores["compound"] > 0.05:
        print("\tOverall sentiment: Positive")
    elif scores["compound"] < -0.05:
        print("\tOverall sentiment: Negative")
    else:
        print("\tOverall sentiment: Neutral")

Chapter 1
	Negative score: 0.061
	Neutral score: 0.061
	Positive score: 0.061
	Overall sentiment: Positive
Chapter 2
	Negative score: 0.12
	Neutral score: 0.12
	Positive score: 0.12
	Overall sentiment: Positive
Chapter 3
	Negative score: 0.145
	Neutral score: 0.145
	Positive score: 0.145
	Overall sentiment: Negative
Chapter 4
	Negative score: 0.141
	Neutral score: 0.141
	Positive score: 0.141
	Overall sentiment: Negative
Chapter 5
	Negative score: 0.118
	Neutral score: 0.118
	Positive score: 0.118
	Overall sentiment: Positive
Chapter 6
	Negative score: 0.124
	Neutral score: 0.124
	Positive score: 0.124
	Overall sentiment: Negative
Chapter 7
	Negative score: 0.136
	Neutral score: 0.136
	Positive score: 0.136
	Overall sentiment: Negative
Chapter 8
	Negative score: 0.12
	Neutral score: 0.12
	Positive score: 0.12
	Overall sentiment: Negative
Chapter 9
	Negative score: 0.097
	Neutral score: 0.097
	Positive score: 0.097
	Overall sentiment: Negative
Chapter 10
	Negative score: 0.086
	Neutral 