In [1]:
import re

In [2]:
# Location of the book text, relative to the notebook's working directory.
# A forward slash is used because it is accepted on Windows, macOS and Linux,
# whereas a backslash separator only resolves on Windows.
FILE_PATH = "input/miracle_in_the_andes.txt"

# Load the book

In [3]:
# Read the entire book into memory as a single string; the context manager
# closes the file handle as soon as the read is done.
with open(FILE_PATH, mode="r", encoding="utf-8") as book_file:
    book = book_file.read()

# Number of chapters

In [4]:
# Every chapter heading has the form "Chapter <digits>"; collect them all.
pattern = re.compile("Chapter [0-9]+")
findings = pattern.findall(book)
findings

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [5]:
# Number of "Chapter <digits>" headings matched by the previous cell.
len(findings)

10

# Sentences where "love" was used

In [6]:
"""
Explanation of the pattern:
    [A-Z]{1} = exactly one capital letter
    [^.]* = zero or more non period symbols
    [^a-zA-Z]+ = one or more non lowercase letters nor capital letters
    love = exactly this sequence of letters
    . = one period
"""
pattern = re.compile("[A-Z]{1}[^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*.")
findings = re.findall(pattern, book)
findings[:10]

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.',
 'Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.',
 'Under the guidance of the Christian Brothers, both of us grew to love the game of rugby with a consuming passion.',
 'That rowdiness came to an abrupt end for Guido in 1969, when he met and fell in love with the beautiful daughter of a Chilean diplomat.',
 'I believe he had a great hunger for the love and comforts of a family that was happy and whole.',
 'He shared, with my father and me, a love for cars and driving, and he loved going with us to auto races.',
 'The house had a beautiful view of the sea, and this more than anything made my mother love it.',
 'She was a true tower of strength

In [7]:
# Number of sentences matched by the "love" pattern in the previous cell.
len(findings)

67

# The most used words

In [8]:
# Split the lower-cased book into words, where a word is a run of ASCII letters.
# The upper bound of the second range must be "Z": the range "A-z" would also
# admit the punctuation that sits between "Z" and "a" in ASCII ([ \ ] ^ _ `).
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
findings[:10]

['chapter',
 'before',
 'it',
 'was',
 'friday',
 'the',
 'thirteenth',
 'of',
 'october',
 'we']

In [9]:
# Total number of word tokens found in the book (repeats included).
len(findings)

86798

In [10]:
# Tally the words: maps each distinct word to the number of times it occurs.
dictionary_words = {}
for word in findings:
    # A word seen for the first time starts from an implicit count of 0.
    dictionary_words[word] = dictionary_words.get(word, 0) + 1

In [12]:
# A dictionary has no sort order of its own, so flip it into (count, word)
# tuples and order those from the most to the least frequent word.
list_words = [(count, word) for word, count in dictionary_words.items()]
list_words.sort(reverse=True)
list_words[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

# Paragraphs where "love" was used

In [13]:
# Each match is one unbroken line of text (no newline inside) that contains
# the letters "love" with at least one character on either side of them.
# Note this is a substring match, so "loved" or "lovely" qualify as well.
pattern = re.compile("[^\n]+love[^\n]+")
findings = pattern.findall(book)
findings[:3]

['To me, this is the essence of rugby. No other sport gives you such an intense sense of selflessness and unified purpose. I believe this is why rugby players all over the world feel such a passion for the game and such a feeling of brotherhood. As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives. For eight years we played our hearts out for the Christian Brothers—a brotherhood of young boys with Latin names, playing a game with deep Anglo roots under Uruguay’s sunny skies, and proudly wearing the bright green shamrock on our uniforms. The game became so much a part of our lives, in fact, that when we graduated from Stella Maris at the age of sixteen, many of us could not bear the thought that our playing days were over. Our salvation came in the form of

In [14]:
# Number of lines/paragraphs matched by the "love" pattern in the previous cell.
len(findings)

60

# Chapter titles

## Method 1

In [15]:
# A chapter title is a line made only of letters and spaces with a blank line
# before and after it. The match includes those surrounding newlines, so they
# are trimmed afterwards. str.strip takes a *set* of characters to remove, so
# a single "\n" is all that is needed.
pattern = re.compile("\n\n[a-zA-Z ]+\n\n")
findings = [title.strip("\n") for title in pattern.findall(book)]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

In [16]:
## Method 2

In [17]:
# Same title search, but with a capture group: when a pattern has exactly one
# group, findall returns only the group's text, so the surrounding newlines
# never make it into the results and no trimming is required.
pattern = re.compile("\n\n([a-zA-Z ]+)\n\n")
findings = pattern.findall(book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# Function that counts the occurrences of any word

In [18]:
def word_occurences(word, text=None):
    """Count how often ``word`` appears as a whole word, ignoring case.

    Parameters
    ----------
    word : str
        The word to look for. It is matched literally (regex metacharacters
        are escaped) and case-insensitively.
    text : str, optional
        The text to search. When omitted, the notebook-level ``book`` string
        is searched.

    Returns
    -------
    int or str
        The number of occurrences when there is at least one; otherwise a
        message saying that the book does not contain the word.
    """
    if text is None:
        # Default to the book loaded earlier in the notebook.
        text = book

    # Escape the word so characters such as "." or "+" are taken literally,
    # and lower-case it to match the lower-cased text.
    needle = re.escape(word.lower())

    # Lookarounds assert "no letter on either side" without consuming the
    # neighbouring character. That way a word at the very start or end of the
    # text is counted, and so are two occurrences separated by one character.
    pattern = re.compile(rf"(?<![a-z]){needle}(?![a-z])")
    count = len(pattern.findall(text.lower()))

    if count:
        return count
    return f"The book does not contain the word '{word}'"

In [19]:
# How many times does the word "love" occur in the book?
word_occurences("love")

83

In [21]:
# A word with no matches yields a message string instead of a count.
word_occurences("hate")

"The book does not contain the word 'hate'"