# Load file

In [2]:
# Read the whole book into memory as a single string.
# NOTE(review): no encoding is passed, so the platform default applies — confirm it matches the file.
with open('miracle_in_the_andes.txt') as book_file:
    book = book_file.read()

# How many chapters?

## With string count

In [3]:
# Plain substring count: every occurrence of 'Chapter' is counted,
# whether or not it is followed by a chapter number.
book.count('Chapter')

11

## With regex

In [4]:
import re
# Only count 'Chapter' when it is followed by a space and a number.
chapter_heading = re.compile(r'Chapter \d+')
chapters = chapter_heading.findall(book)
len(chapters)

10

# Which sentences use the word 'love'?

In [19]:
# A "sentence" here starts at a capital letter and runs to the next '.', '!' or '?'.
# The lookarounds require 'love' to stand alone (no letter directly before or after)
# without consuming the neighbouring character, so a sentence that ends in "love."
# stops at its own period instead of running on into the next sentence.
# The terminator is an explicit character class; a bare '.' would match any character.
love_sentence = re.compile(
    r'[A-Z][^.!?]*(?<![a-zA-Z])love(?![a-zA-Z])[^.!?]*[.!?]'
)
findings = love_sentence.findall(book)
findings[:10]  # Display the first 10 sentences containing 'love'

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.',
 'Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.',
 'Under the guidance of the Christian Brothers, both of us grew to love the game of rugby with a consuming passion.',
 'That rowdiness came to an abrupt end for Guido in 1969, when he met and fell in love with the beautiful daughter of a Chilean diplomat.',
 'I believe he had a great hunger for the love and comforts of a family that was happy and whole.',
 'He shared, with my father and me, a love for cars and driving, and he loved going with us to auto races.',
 'The house had a beautiful view of the sea, and this more than anything made my mother love it.',
 'She was a true tower of strength

# What are the most used words?

In [24]:
# A word is any run of ASCII letters; the text is lowercased first so that
# 'The' and 'the' end up as the same token.
patterns = re.compile('[a-zA-Z]+')
findings = patterns.findall(book.lower())
len(findings)

86798

In [41]:
# Tally how often each word occurs.
counts = {}
for word in findings:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1
# Words ordered by frequency, highest first; keep the ten most frequent.
sorted(counts, key=counts.get, reverse=True)[:10]

['the', 'and', 'i', 'to', 'of', 'a', 'was', 'in', 'we', 'my']

In [42]:
# (count, word) pairs in descending order, so the most frequent words come first.
counts_list = sorted(((value, key) for key, value in counts.items()), reverse=True)
counts_list[:10]  # Ten highest counts together with their words

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

# Extract the paragraphs where 'love' was used

In [62]:
# A paragraph is treated as one line of text (a run of characters without '\n').
# '*' instead of '+' keeps paragraphs that begin or end with 'love'.
# This is a substring match, so 'loved', 'lovely', 'glove', ... also count.
patterns = re.compile(r'[^\n]*love[^\n]*')
paragraphs = re.findall(patterns, book)
len(paragraphs)  # Number of paragraphs containing 'love'

60

# Extract the chapter titles

In [102]:
# A title is matched as a run of letters, spaces and commas that is
# immediately followed by a blank line; surrounding whitespace is stripped.
patterns = re.compile('[a-zA-Z ,]+\n\n')
titles = [match.strip() for match in patterns.findall(book)]
titles

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# Function that counts the occurrences of any word

In [75]:
def find(word):
    """Print how many times ``word`` occurs in the book.

    The lookup uses the notebook-level ``counts`` mapping, which is built
    from the lowercased text. ``word`` is therefore lowercased before the
    lookup, so ``find('Love')`` reports the same total as ``find('love')``.

    Args:
        word: The word to look up, in any letter case.

    Returns:
        None. The result is printed.
    """
    occurrences = counts.get(word.lower(), 0)
    if occurrences == 0:
        print(f'The book does not contain the word "{word}".')
    else:
        print(occurrences, f'instances of the word "{word}" found in the book.')

# Call the function

In [60]:
# Report how often the word 'hate' occurs.
find('hate')

The book does not contain the word "hate".


In [61]:
# Report how often the word 'love' occurs.
find('love')

83 instances of the word "love" found in the book.


# What are the most used words (excluding stopwords)?

In [79]:
import nltk
from nltk.corpus import stopwords

In [80]:
# Build the stopword set once: calling stopwords.words() inside the loop repeats
# the lookup for every distinct word, and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# (count, word) pairs for every non-stopword, sorted in descending order
# (equal counts fall back to reverse alphabetical order of the word).
d_list = sorted(
    ((count, key) for key, count in counts.items() if key not in english_stopwords),
    reverse=True,
)

In [81]:
d_list[:10]  # Ten highest (count, word) pairs among the non-stopwords

[(575, 'would'),
 (519, 'us'),
 (292, 'said'),
 (284, 'roberto'),
 (252, 'could'),
 (249, 'one'),
 (227, 'snow'),
 (183, 'mountain'),
 (182, 'time'),
 (165, 'like')]

# Sentiment analysis: which chapters are the most positive and the most negative?

## An example

In [83]:
# Download the 'vader_lexicon' resource through NLTK's downloader
# (presumably the lexicon the sentiment analyzer below relies on — confirm against the NLTK docs).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\catas\AppData\Roaming\nltk_data...


True

In [96]:
from nltk.sentiment import SentimentIntensityAnalyzer
# One analyzer instance, reused by the cells below.
sia = SentimentIntensityAnalyzer()
# polarity_scores() returns a dict; the keys read from it in this notebook are
# 'neg', 'neu', 'pos' and 'compound'.
scores = sia.polarity_scores('hate, kill, die, love')

In [99]:
# Label the text by comparing the positive and negative proportions;
# a tie is reported as positive.
if scores['pos'] >= scores['neg']:
    print('This is a positive text.')
else:
    print('This is a negative text.')

This is a negative text.


## Chapters sentiment analysis

In [103]:
patterns = re.compile(r'Chapter [0-9]+')
# re.split() returns the text that precedes the first heading as element 0.
chapters = re.split(patterns, book)
# Skip that leading element so numbering starts at "Chapter 1". The loop variables
# have their own names, so the `chapters` list and the `scores` dict from the
# example above are not overwritten while iterating.
for number, chapter_text in enumerate(chapters[1:], start=1):
    chapter_scores = sia.polarity_scores(chapter_text)
    print(f'Chapter {number:2}: {chapter_scores}')

Chapter  1: {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
Chapter  2: {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
Chapter  3: {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
Chapter  4: {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
Chapter  5: {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
Chapter  6: {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
Chapter  7: {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
Chapter  8: {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
Chapter  9: {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
Chapter 10: {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
