# Read The Book


In [1]:
with open("datas/miracle_in_the_andes.txt", "r") as file:
    book = file.read()

## How Many Chapters Are There?


### Calculate Using String Methods


In [2]:
chapterCount = book.count("Chapter")
print(f"There are {chapterCount} chapters in the book.")

There are 11 chapters in the book.


### Calculate Using Regular Expressions (Regex)


In [3]:
import re

pattern = re.compile("Chapter [0-9]+")
foundPatterns = re.findall(pattern, book)
chapterCount = len(foundPatterns)
print(f"There are {chapterCount} chapters in the book.")
print("Found chapters are:")
for element in foundPatterns:
    print(element)

There are 10 chapters in the book.
Found chapters are:
Chapter 1
Chapter 2
Chapter 3
Chapter 4
Chapter 5
Chapter 6
Chapter 7
Chapter 8
Chapter 9
Chapter 10


## Every Sentence That Contains The Word "Love"


In [4]:
pattern = re.compile("[A-Z]{1}[^.]*[^A-Za-z]+love[^A-Za-z]+[^.]*.")
foundPatterns = re.findall(pattern, book)
sentenceCount = len(foundPatterns)
print(f"There are {sentenceCount} sentences with the word 'love' in them.")

There are 67 sentences with the word 'love' in them.


## What Are The Most Used Words


In [5]:
pattern = re.compile("[A-Za-z]+")
foundPatterns = re.findall(pattern, book.lower())
wordAmount = len(foundPatterns)
print(f"There are {wordAmount} words in the book.")

import collections

wordCounts = collections.Counter(foundPatterns)

print("Top five words are:")
for word, count in wordCounts.most_common(5):
    frequency = count / wordAmount
    perc = frequency * 100.0

    print(f"'{word}' with count of {count}")
    print(f"Frequency: {frequency:.8f}")
    print(f"Probability: {perc:.4f}%")
    print()

There are 86798 words in the book.
Top five words are:
'the' with count of 5346
Frequency: 0.06159128
Probability: 6.1591%

'and' with count of 2795
Frequency: 0.03220120
Probability: 3.2201%

'i' with count of 2729
Frequency: 0.03144082
Probability: 3.1441%

'to' with count of 2400
Frequency: 0.02765041
Probability: 2.7650%

'of' with count of 2060
Frequency: 0.02373327
Probability: 2.3733%



## Filter Out The Stopwords


In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
engStopWords = stopwords.words("english")

pattern = re.compile("[A-Za-z]+")
foundPatterns = re.findall(pattern, book.lower())

filteredPatterns = [element for element in foundPatterns if element not in engStopWords]

wordCounts = collections.Counter(filteredPatterns)

print("Top five words are:")
for word, count in wordCounts.most_common(5):
    frequency = count / wordAmount
    perc = frequency * 100.0

    print(f"'{word}' with count of {count}")
    print(f"Frequency: {frequency:.8f}")
    print(f"Probability: {perc:.4f}%")
    print()

Top five words are:
'would' with count of 575
Frequency: 0.00662458
Probability: 0.6625%

'us' with count of 519
Frequency: 0.00597940
Probability: 0.5979%

'said' with count of 292
Frequency: 0.00336413
Probability: 0.3364%

'roberto' with count of 284
Frequency: 0.00327196
Probability: 0.3272%

'could' with count of 252
Frequency: 0.00290329
Probability: 0.2903%



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gbpekalp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Most Positive And Negative Chapters


In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer as SIA

analyzer = SIA()

pattern = re.compile("Chapter [0-9]+")

chapterNumbers = re.findall(pattern, book)

chapters = re.split(pattern, book)
chapters = [chapter.replace("\n", " ").strip() for chapter in chapters]
chapters = [chapter for chapter in chapters if chapter.strip() != ""]

results = [analyzer.polarity_scores(chapter) for chapter in chapters]

chapterResults = list(zip(chapterNumbers, results))

positive = max(chapterResults, key=lambda x: x[1]["pos"])
neutral = max(chapterResults, key=lambda x: x[1]["neu"])
negative = max(chapterResults, key=lambda x: x[1]["neg"])

message = f"""\
Most Positive: {positive[0]}, Positivity Score: {positive[1]["pos"]}
Most Neutral: {neutral[0]}, Neutrality Score: {neutral[1]["neu"]}
Most Negative: {negative[0]}, Negativity Score: {negative[1]["neg"]}
    """

print(message)

for result in chapterResults:
    message = f"""\
{result[0]}
Positivity: {result[1]["pos"]}
Neutrality: {result[1]["neu"]}
Negativitiy: {result[1]["neg"]}
        """
    print(message)

Most Positive: Chapter 10, Positivity Score: 0.181
Most Neutral: Chapter 9, Neutrality Score: 0.824
Most Negative: Chapter 3, Negativity Score: 0.145
    
Chapter 1
Positivity: 0.16
Neutrality: 0.779
Negativitiy: 0.061
        
Chapter 2
Positivity: 0.154
Neutrality: 0.726
Negativitiy: 0.12
        
Chapter 3
Positivity: 0.105
Neutrality: 0.751
Negativitiy: 0.145
        
Chapter 4
Positivity: 0.138
Neutrality: 0.721
Negativitiy: 0.141
        
Chapter 5
Positivity: 0.141
Neutrality: 0.742
Negativitiy: 0.118
        
Chapter 6
Positivity: 0.115
Neutrality: 0.761
Negativitiy: 0.124
        
Chapter 7
Positivity: 0.103
Neutrality: 0.761
Negativitiy: 0.136
        
Chapter 8
Positivity: 0.094
Neutrality: 0.786
Negativitiy: 0.12
        
Chapter 9
Positivity: 0.079
Neutrality: 0.824
Negativitiy: 0.097
        
Chapter 10
Positivity: 0.181
Neutrality: 0.733
Negativitiy: 0.086
        
