# Load the book

In [7]:
with open("book.txt", "r") as file:
    book = file.read()

# How many chapters?

### With string methods

In [5]:
book.count("Chapter")

11

### With regex

In [6]:
import re
pattern = re.compile("Chapter [0-9]+")
findings = re.findall(pattern, book)
len(findings)

10

# Which are the sentences where "love" is used? 

In [7]:
import re
pattern = re.compile("[A-Z]{1}[^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*.")
findings = re.findall(pattern, book)
findings[:10]

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.',
 'Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.',
 'Under the guidance of the Christian Brothers, both of us grew to love the game of rugby with a consuming passion.',
 'That rowdiness came to an abrupt end for Guido in 1969, when he met and fell in love with the beautiful daughter of a Chilean diplomat.',
 'I believe he had a great hunger for the love and comforts of a family that was happy and whole.',
 'He shared, with my father and me, a love for cars and driving, and he loved going with us to auto races.',
 'The house had a beautiful view of the sea, and this more than anything made my mother love it.',
 'She was a true tower of strength

# What are the most used words? 

In [8]:
import re
pattern = re.compile("[a-zA-Z]+")
findings = re.findall(pattern, book.lower())

In [9]:
d = {}
for word in findings:
    if word in d.keys():
        d[word] = d[word] + 1
    else:
        d[word] = 1

In [10]:
d_list = [(value, key) for (key, value) in d.items()]
sorted(d_list, reverse=True)[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

# What are the most used words? (non-articles)

In [11]:
import re
pattern = re.compile("[a-zA-Z]+")
findings = re.findall(pattern, book.lower())

In [12]:
d = {}
for word in findings:
    if word in d.keys():
        d[word] = d[word] + 1
    else:
        d[word] = 1

In [26]:
d_list = [(value, key) for (key, value) in d.items()]
d_list = sorted(d_list, reverse=True)

In [27]:
import nltk
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")

In [23]:
filtered_words = []
for count, word in d_list:
    if word not in english_stopwords:
        filtered_words.append({word, count})

In [25]:
filtered_words[:10]

[{575, 'would'},
 {519, 'us'},
 {292, 'said'},
 {284, 'roberto'},
 {252, 'could'},
 {249, 'one'},
 {227, 'snow'},
 {183, 'mountain'},
 {182, 'time'},
 {165, 'like'}]

# Sentiment Analysis: What is the most positive and the most negative chapter? 

### An example

In [2]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
analyzer = SentimentIntensityAnalyzer()

In [9]:
scores = analyzer.polarity_scores("Hey, look how beautiful the trees are. I love them.")
scores

{'neg': 0.0, 'neu': 0.464, 'pos': 0.536, 'compound': 0.8442}

In [10]:
if scores["pos"] > scores["neg"]:
    print("It is a positive text")
else:
    print("It is a negative text")

It is a positive text


### Chapter sentiment analysis

In [24]:
import re
pattern = re.compile("Chapter [0-9]+")
chapters = re.split(pattern, book)

In [25]:
chapters = chapters[1:]
for nr, chapter in enumerate(chapters):
    scores = analyzer.polarity_scores(chapter)
    print(nr + 1, scores)

1 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
2 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
3 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
4 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
5 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
6 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
7 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
8 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
9 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
10 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
