# Import libraries and define variables

In [1]:
import re
import os
import pandas as pd

In [2]:
# pattern to extract the quotes
# captures the text found between a pair of straight double quotes
quote_pattern = r'"([^"]*)"'

# folders
# the paths are built with os.path.join so the separator matches the operating
# system; a hard-coded backslash is only treated as a separator on Windows
## folder containing the raw txt files of the books
raw_text_folder = os.path.join('.', 'raw-text')

## folder we will export the book's character text (CT)
chara_text_folder = os.path.join('.', 'processed-text')

## folder we will export the dialogue of each character to
quotes_folder = os.path.join('.', 'character-quotes')

In [3]:
# book titles
# short-form titles of the books, listed in order of release date
book_titles = ["tlt", "som", "ttc", "bol", "tlo"]
# upper-case variant of every short-form title
book_titles_caps = list(map(str.upper, book_titles))

# book file names
# file name of each book's original text
book_file_names = [title + '_fulltext.txt' for title in book_titles]
# file name of each book's character text (CT)
ct_file_names = [title + '_ct.txt' for title in book_titles]

# book file paths
# each file name joined onto the folder it lives in
full_book_file_names = [
    os.path.join(raw_text_folder, name) for name in book_file_names
]
full_ct_file_names = [
    os.path.join(chara_text_folder, name) for name in ct_file_names
]

# Functions

In [4]:
# clear all files in folder
def clear_folder(folder_path):
    """Delete every file that sits directly inside ``folder_path``.

    Real sub-directories are skipped: ``os.remove`` cannot delete a directory,
    so trying would raise part-way through and leave the folder only partly
    cleared. Symbolic links are still removed, as ``os.remove`` handles them.

    Raises whatever ``os.listdir`` / ``os.remove`` raise, e.g. when
    ``folder_path`` does not exist.
    """
    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)

        # leave genuine directories alone, os.remove would fail on them
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            continue

        os.remove(entry_path)

In [5]:
# gets all the lines from the file and returns it as a list of strings
def open_book(book_file_path):
    """Read a UTF-8 text file and return its content split on newlines.

    The content is split on "\n", so a file ending with a newline yields a
    trailing empty string and an empty file yields [""].

    The file is opened in a ``with`` block so the handle is closed even if
    reading fails (for example on a decoding error).
    """
    with open(book_file_path, "r", encoding="utf-8") as f:
        contents = f.read()

    return contents.split("\n")

In [6]:
# exports the quotes list as a txt file
def export_text(file_name, quotes_list):
    """Write the strings of ``quotes_list`` to ``file_name``, one per line.

    Lines are separated by a single newline with no newline after the last
    one; an empty list produces an empty file. An existing file is
    overwritten. The file is encoded as UTF-8.

    The file is opened in a ``with`` block so the handle is closed even if
    writing fails.
    """
    with open(file_name, "w", encoding="utf-8") as f:
        # join puts the separator only between lines, never after the last
        f.write("\n".join(quotes_list))

In [7]:
# finds every quote in a single line and hands them back as a list of strings
def get_quotes(string):
    """Return all matches of ``quote_pattern`` found in ``string``.

    The result is the list produced by ``re.findall``; it is empty when the
    line holds no match.
    """
    return re.findall(quote_pattern, string)

In [8]:
# strips the quoted speech out of a line, leaving the dialogue tag
def get_who(string, quote_list):
    """Return ``string`` with every quote of ``quote_list`` removed.

    Each quote is removed together with its surrounding straight double
    quotes, and the remaining text is stripped of leading and trailing
    whitespace.
    """
    remainder = string

    for spoken in quote_list:
        quoted = '"' + spoken + '"'
        remainder = remainder.replace(quoted, "")

    return remainder.strip()

In [9]:
# tells whether the dialogue tag attributes the quote to Annabeth
def get_annabeth(dialogue_tag):
    """Return True when ``dialogue_tag`` points to Annabeth as the speaker.

    The tag must contain 'Annabeth' and must not contain a first-person
    marker ("I", "I'd" or "I’d" at a word start, followed by a space), since
    such a marker means the narrator is talking about Annabeth. An empty tag
    gives False because the speaker is unknown.
    """
    # nothing left around the quote: speaker unknown
    if len(dialogue_tag) == 0:
        return False

    # Annabeth is not mentioned at all
    if re.search('Annabeth', dialogue_tag) is None:
        return False

    # a first-person marker means the narrator is describing Annabeth
    first_person_markers = ["I", "I'd", "I’d"]
    narrator_present = any(
        re.search(r'\b' + marker + " ", dialogue_tag)
        for marker in first_person_markers
    )

    return not narrator_present

In [10]:
# tells whether the dialogue tag attributes the quote to Percy
def get_percy(dialogue_tag):
    """Return True when ``dialogue_tag`` points to Percy as the speaker.

    The tag must contain "I" at a word start followed by a space, and must
    not contain "I'd", "I’d" or "Annabeth" at a word start followed by a
    space, since those suggest Percy is relaying what someone else said. An
    empty tag gives False because the speaker is unknown.
    """
    # nothing left around the quote: speaker unknown
    if len(dialogue_tag) == 0:
        return False

    # no first-person "I" in the tag, so Percy is not the speaker
    if re.search(r'\b' + 'I ', dialogue_tag) is None:
        return False

    # these markers mean Percy is describing something someone else said
    other_speaker_markers = ["I'd", "I’d", "Annabeth"]
    describes_someone_else = any(
        re.search(r'\b' + marker + " ", dialogue_tag)
        for marker in other_speaker_markers
    )

    return not describes_someone_else

# Clear folders to make sure files are exported properly

In [11]:
# create chara text and quotes folders if they don't exist
# if they do exist, clear chara text and quotes folders to ensure outputted files are correct
# this is so we don't accidentally continue writing on old files which may be outdated

for folder in [chara_text_folder, quotes_folder]:
    # create folder if it doesn't exist (exist_ok makes this a no-op otherwise)
    os.makedirs(folder, exist_ok=True)

    # if folder has items, clear all items before starting
    for i in os.listdir(folder):
        item_path = os.path.join(folder, i)

        if os.path.isfile(item_path):
            # an output file left over from a previous run: delete it
            os.remove(item_path)
        else:
            # a sub-folder: empty it with the shared helper
            clear_folder(item_path)

# Filtering out character text from book

In [12]:
# using the file with the full text, will filter out only lines with dialogue
# then will export this into a separate txt file

for raw_file, cleaned_file in zip(full_book_file_names, full_ct_file_names):
    # first open the book and store the lines as a list
    book_lst = open_book(raw_file)

    all_book_quotes = []

    # keep every line that opens a quote, with either quote style
    for line in book_lst:
        if '"' in line or '“' in line:
            # always convert curly quotes to straight ones, including on lines
            # that mix both styles, so the quote pattern can match all of them
            line = line.replace('“', '"').replace('”', '"')
            all_book_quotes.append(line)

    # export this list of dialogues of the book into its own txt file
    export_text(cleaned_file, all_book_quotes)

# Filtering out the quotes by character

In [13]:
# Going through each book's CT file, collect what Annabeth and Percy say
# each list ends up with one inner list of quotes per book
annabeth_quotes = []
percy_quotes = []

for ct_path in full_ct_file_names:
    spoken_by_annabeth = []
    spoken_by_percy = []

    # walk through the lines of this book's character text
    for ct_line in open_book(ct_path):
        # every quote found on the line
        quote_list = get_quotes(ct_line)

        # what remains of the line once the quotes are removed
        dialogue_tag = get_who(ct_line, quote_list)

        # Annabeth: the tag mentions 'Annabeth' without a first-person marker
        if get_annabeth(dialogue_tag):
            spoken_by_annabeth.extend(quote_list)

        # Percy: the tag contains 'I' and none of "I'd", "I’d", 'Annabeth'
        if get_percy(dialogue_tag):
            spoken_by_percy.extend(quote_list)

    annabeth_quotes.append(spoken_by_annabeth)
    percy_quotes.append(spoken_by_percy)

# Export character quotes

In [14]:
# number of quotes per book, for each character
annabeth_num_quotes = list(map(len, annabeth_quotes))
percy_num_quotes = list(map(len, percy_quotes))

# one row per character, one column per book
count_df = pd.DataFrame(
    [annabeth_num_quotes, percy_num_quotes],
    index=['Annabeth', 'Percy'],
    columns=book_titles_caps,
)

In [15]:
# bare expression: in a notebook cell this displays the quote-count table
count_df

Unnamed: 0,TLT,SOM,TTC,BOL,TLO
Annabeth,271,243,59,392,207
Percy,769,547,749,775,961


In [16]:
# write Annabeth's quotes to disk, one text file per book
for book_index, book_quotes in enumerate(annabeth_quotes):
    # file is named after the book's short-form title
    annabeth_file_name = os.path.join(
        quotes_folder, "annabeth_" + book_titles[book_index] + ".txt"
    )

    # export as text file
    export_text(annabeth_file_name, book_quotes)

In [17]:
# write Percy's quotes to disk, one text file per book
for book_index, book_quotes in enumerate(percy_quotes):
    # file is named after the book's short-form title
    percy_file_name = os.path.join(
        quotes_folder, "percy_" + book_titles[book_index] + ".txt"
    )

    # export as text file
    export_text(percy_file_name, book_quotes)