In [653]:
import re
import spacy

# Load the 'en_core_web_sm' spaCy pipeline once; the splitting functions below call this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

# Step 1: Load in the Text as a Single String

In [654]:
# Read the sample file and flatten it into one string: the newline characters
# around each line are stripped and a single space is appended after every
# line (so the result ends with a trailing space).
with open('Hobbit_sample.txt', encoding="utf8") as f:
    lines = f.readlines()
# Build the string in one pass instead of repeated `+=` concatenation.
all_text = "".join(line.strip("\n") + " " for line in lines)

In [655]:
# Show the flattened text to confirm the file was loaded as one continuous string.
print(all_text)

“Good Morning!” said Bilbo, and he meant it. The sun was shining, and the grass was very green. But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat. “What do you mean?” he said. “Do you wish me a good morning, or mean that it is a good morning whether I want it or not; or that you feel good this morning; or that it is a morning to be good on?”  “All of them at once,” said Bilbo. “And a very fine morning for a pipe of tobacco out of doors, into the bargain. If you have a pipe about you, sit down and have a fill of mine! There’s no hurry, we have all the day before us!” Then Bilbo sat down on a seat by his door, crossed his legs, and blew out a beautiful grey ring of smoke that sailed up into the air without breaking and floated away over The Hill.  “Very pretty!” said Gandalf. “But I have no time to blow smoke-rings this morning. I am looking for someone to share in an adventure that I am arranging, and it’s very difficult to fi

# Step 2: Process the Text into Character Sections

In this step, we build a list in which each element holds the text spoken by a single voice. Each break marks a change of speaker (narration counts as a voice too). The list follows the order of the book. To do this, we need:

A function that detects the breaks between dialogue and narration. These breaks are marked by quotation marks.

In [656]:
# define a function for splitting at quotes
def split_sentence_with_quotation(sentence):
    """Cut one sentence into pieces, ending a piece after every closing quote (”).

    The sentence is tokenized with the shared `nlp` pipeline. Tokens are
    re-assembled with their original trailing whitespace; each closing-quote
    token finishes the current piece (the whitespace after the quote is not
    carried over). Whatever is left after the last closing quote is stripped
    of surrounding whitespace and returned as the final piece.

    Args:
        sentence: the sentence text to split.

    Returns:
        A list of strings in their original order.
    """
    segments = []
    buffer = []

    for tok in nlp(sentence):
        if tok.text != '”':
            buffer.append(tok.text + tok.whitespace_)
            continue
        # Closing quote: it belongs to the piece it terminates.
        buffer.append(tok.text)
        segments.append("".join(buffer))
        buffer = []

    leftover = "".join(buffer)
    if leftover:
        segments.append(leftover.strip())

    return segments

In [657]:
# define a function for breaking text according to changes in dialogue and narration
def break_text(text):
    """Split text into sentences, then split each sentence at closing quotes.

    Args:
        text: the full text to process.

    Returns:
        A flat list of the pieces returned by split_sentence_with_quotation
        for every sentence, in reading order.
    """
    sentence_texts = [span.text for span in nlp(text).sents]
    return [
        piece
        for sentence_text in sentence_texts
        for piece in split_sentence_with_quotation(sentence_text)
    ]

In [658]:
# Split the full text into sentence/quotation pieces and print them for inspection.
all_parts = break_text(all_text)
print(all_parts)

['“Good Morning!”', 'said Bilbo, and he meant it.', 'The sun was shining, and the grass was very green.', 'But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat.', '“What do you mean?”', 'he said.', '“Do you wish me a good morning, or mean that it is a good morning whether I want it or not; or that you feel good this morning; or that it is a morning to be good on?”', '', '“All of them at once,”', 'said Bilbo.', '“And a very fine morning for a pipe of tobacco out of doors, into the bargain.', 'If you have a pipe about you, sit down and have a fill of mine!', 'There’s no hurry, we have all the day before us!”', 'Then Bilbo sat down on a seat by his door, crossed his legs, and blew out a beautiful grey ring of smoke that sailed up into the air without breaking and floated away over The Hill.', '“Very pretty!”', 'said Gandalf.', '“But I have no time to blow smoke-rings this morning.', 'I am looking for someone to share in an adventur

In [659]:
# define a function that splits the text into groups according to who is speaking or narrator sections
# this takes the output of the break_text function
def break_by_character(broken_up_parts):
    """Group consecutive text parts into sections spoken by one voice.

    A part containing an opening quote (“) starts a new section and a part
    containing a closing quote (”) ends the current one. Parts without any
    quote mark are added to the section currently being collected, which is
    either narration or the continuation of a quotation that is still open.

    Args:
        broken_up_parts: list of strings in reading order (the output of
            break_text).

    Returns:
        A list of sections in reading order, each section being a list of
        strings. Sections equal to [''] are left out.
    """
    all_globs = []
    micro = []  # parts collected for the section currently being built
    for part in broken_up_parts:
        opens = '“' in part
        closes = '”' in part

        if opens:
            # A new quotation begins: whatever was collected so far is complete.
            if micro:
                all_globs.append(micro)
                micro = []
            if closes:
                # Quotation opens and closes within this one part.
                all_globs.append([part])
            else:
                micro.append(part)
        elif closes:
            # End of the open section. If nothing is open, the part is kept as
            # its own section rather than being dropped from the text.
            micro.append(part)
            all_globs.append(micro)
            micro = []
        else:
            micro.append(part)

    # Flush the last section; otherwise text after the final quotation
    # (or an unterminated quotation) would be lost.
    if micro:
        all_globs.append(micro)

    return [item for item in all_globs if item != ['']]

In [660]:
# Group the pieces into per-voice sections, then print each section under a divider line.
character_voice_sections = break_by_character(all_parts)

for part in character_voice_sections:
    print("-----------------")
    print(part)

-----------------
['“Good Morning!”']
-----------------
['said Bilbo, and he meant it.', 'The sun was shining, and the grass was very green.', 'But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat.']
-----------------
['“What do you mean?”']
-----------------
['he said.']
-----------------
['“Do you wish me a good morning, or mean that it is a good morning whether I want it or not; or that you feel good this morning; or that it is a morning to be good on?”']
-----------------
['“All of them at once,”']
-----------------
['said Bilbo.']
-----------------
['“And a very fine morning for a pipe of tobacco out of doors, into the bargain.', 'If you have a pipe about you, sit down and have a fill of mine!', 'There’s no hurry, we have all the day before us!”']
-----------------
['Then Bilbo sat down on a seat by his door, crossed his legs, and blew out a beautiful grey ring of smoke that sailed up into the air without breaking and float

# Step 3: Determine Which Character is Speaking Per Section

Now that the text is split at every change of speaker, we need to determine which character is speaking in each section and label it. This will involve Named Entity Recognition.

In instances where Named Entity Recognition isn't working well, ask the user for a list of characters with dialogue lines in the story. This is implemented below.

In [661]:
# ask the user to input a list of characters with lines in the text
# Names searched for in the narration next to each quotation (hard-coded here rather than prompted for).
characters = ['Bilbo', 'Gandalf', 'Dwalin', 'Balin', 'Kili', 'Fili', 'Thorin', 'Bifur', 'Bofur', 'Bombur']
# Label attached to every section that is not dialogue.
narrator = 'Tolkien'

In [662]:
# define a function to turn the character sections into a dictionary for easier processing
def make_dictionary(voice_sections):
    """Index the sections by position, joining each section's parts with spaces.

    Args:
        voice_sections: list of sections, each a list of strings.

    Returns:
        A dict mapping 0, 1, 2, ... to the section's parts joined by " ".
    """
    return {
        position: " ".join(pieces)
        for position, pieces in enumerate(voice_sections)
    }

In [663]:
# Index the sections by position so that a section's neighbours can be looked up by key.
voice_sections = make_dictionary(character_voice_sections)

In [671]:
# define a function that labels the dialogue based on neighboring Names and return as a dictionary
def label_dialogue(sections, characters_of_story, do_print):
    """Attribute every dialogue section to a speaker.

    A section is dialogue when it contains both an opening (“) and a closing
    (”) quote. The speaker is chosen as follows, first match wins:

    1. The character named earliest in the narration section directly after
       the dialogue (e.g. '“Hi,” said Bilbo.').
    2. The character named latest in the narration section directly before
       the dialogue.
    3. If there is narration before the dialogue but it names nobody
       (e.g. 'he said.'), the previous speaker continues.
    4. Otherwise (dialogue directly follows dialogue) the other party of the
       conversation, i.e. the speaker before the last change of speaker.

    Names are matched as plain substrings of the narration text.

    Args:
        sections: dict mapping consecutive integer positions to section text
            (the output of make_dictionary).
        characters_of_story: list of character names to look for.
        do_print: when true, print the neighbouring narration and the
            speaker-tracking state for every dialogue section.

    Returns:
        A dict mapping each dialogue text to the chosen speaker. The result
        is keyed by the text itself, so identical dialogue lines share one
        entry and the last occurrence wins. The speaker is '' when nothing
        could be inferred yet.
    """
    def narration_at(index):
        # Text of the section at `index` if it exists and holds no quote marks, else None.
        neighbour = sections.get(index)
        if neighbour is None or '“' in neighbour or '”' in neighbour:
            return None
        return neighbour

    def first_named(text):
        # Character whose name appears earliest in `text`, or None.
        hits = [(text.find(name), name) for name in characters_of_story if name in text]
        return min(hits, key=lambda hit: hit[0])[1] if hits else None

    def last_named(text):
        # Character whose name appears latest in `text`, or None.
        hits = [(text.rfind(name), name) for name in characters_of_story if name in text]
        return max(hits, key=lambda hit: hit[0])[1] if hits else None

    last_speaker = ''
    dialogue_partner = ''
    dialogue_dictionary = {}
    for key, value in sections.items():
        if not ('“' in value and '”' in value):
            continue

        # Section 0 is a valid "behind" neighbour for key 1.
        in_front = narration_at(key + 1)
        behind = narration_at(key - 1)

        current_dialogue = value

        current_voice = first_named(in_front) if in_front is not None else None
        if current_voice is None and behind is not None:
            current_voice = last_named(behind)
        if current_voice is None:
            # Unnamed narration before the quote -> same speaker goes on;
            # back-to-back quotes -> the conversation partner answers.
            current_voice = last_speaker if behind is not None else dialogue_partner

        if do_print:
            # A missing neighbour is None and therefore prints as "None".
            print(behind)
            print(key, current_dialogue)
            print(in_front)
            print("")
            print('current voice:', current_voice)
            print('last speaker:', last_speaker)
            print('dialogue partner:', dialogue_partner)
            print('-----------')

        if current_voice != last_speaker:
            dialogue_partner = last_speaker

        last_speaker = current_voice

        dialogue_dictionary[current_dialogue] = current_voice

    return dialogue_dictionary

In [672]:
# Label every dialogue section; do_print=True prints the neighbouring narration and speaker-tracking state for each one.
my_dialogue_dictionary = label_dialogue(voice_sections, characters, True)

None
0 “Good Morning!”
said Bilbo, and he meant it. The sun was shining, and the grass was very green. But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat.

current voice: Bilbo
last speaker: 
dialogue partner: 
-----------
said Bilbo, and he meant it. The sun was shining, and the grass was very green. But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat.
2 “What do you mean?”
he said.

current voice: Gandalf
last speaker: Bilbo
dialogue partner: 
-----------
he said.
4 “Do you wish me a good morning, or mean that it is a good morning whether I want it or not; or that you feel good this morning; or that it is a morning to be good on?”
None

current voice: Gandalf
last speaker: Gandalf
dialogue partner: Bilbo
-----------
None
5 “All of them at once,”
said Bilbo.

current voice: Bilbo
last speaker: Gandalf
dialogue partner: Bilbo
-----------
said Bilbo.
7 “And a very fine 

In [666]:
# define a function to combine narrated parts with labelled dialogue
def label_all_text(sections, dialogue_dict, narrator_of_all):
    """Pair every section with the voice that should read it.

    Args:
        sections: dict of section texts; iterated in insertion order.
        dialogue_dict: dict mapping dialogue text to its speaker (the output
            of label_dialogue for the same sections).
        narrator_of_all: label used for every section that is not dialogue.

    Returns:
        A list of (text, voice) tuples in section order.

    Raises:
        KeyError: if a dialogue section has no entry in dialogue_dict.
    """
    final_tuple = []
    for value in sections.values():
        # Dialogue sections contain both an opening and a closing quote.
        if '“' in value and '”' in value:
            # Use the arguments, not notebook globals, so the function works
            # with any dictionary/narrator that is passed in.
            final_tuple.append((value, dialogue_dict[value]))
        else:
            final_tuple.append((value, narrator_of_all))
    return final_tuple

In [667]:
# Build the final (text, voice) list; as the last expression of the cell it is displayed as the output.
label_all_text(voice_sections, my_dialogue_dictionary, narrator)

[('“Good Morning!”', 'Bilbo'),
 ('said Bilbo, and he meant it. The sun was shining, and the grass was very green. But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat.',
  'Tolkien'),
 ('“What do you mean?”', 'Gandalf'),
 ('he said.', 'Tolkien'),
 ('“Do you wish me a good morning, or mean that it is a good morning whether I want it or not; or that you feel good this morning; or that it is a morning to be good on?”',
  'Gandalf'),
 ('“All of them at once,”', 'Bilbo'),
 ('said Bilbo.', 'Tolkien'),
 ('“And a very fine morning for a pipe of tobacco out of doors, into the bargain. If you have a pipe about you, sit down and have a fill of mine! There’s no hurry, we have all the day before us!”',
  'Bilbo'),
 ('Then Bilbo sat down on a seat by his door, crossed his legs, and blew out a beautiful grey ring of smoke that sailed up into the air without breaking and floated away over The Hill.',
  'Tolkien'),
 ('“Very pretty!”', 'Gandalf')

The final product given above is the necessary input for the Text-to-Speech audio part of this project. Below I've defined a function that combines all these parts in order to call this from the Text-to-Speech python script for a running project model. This allows for the text file to be processed and automatically generated into audio.

# One Function Performing all Necessary Text Processing:

In [668]:
# the following must be provided for the entire_process function
characters = ['Bilbo', 'Gandalf']  # NOTE: rebinds `characters`, replacing the longer list defined earlier
narrator = 'Tolkien'  # label given to every non-dialogue section
text_file = 'Hobbit_sample.txt'  # text file to process (opened as UTF-8)

In [677]:
# function containing the entire process
def entire_process(txt_file_name, characters_in_story, narrator_of_story):
    """Run the whole text-processing pipeline on one file.

    The file is read as UTF-8 and flattened to a single string (each line's
    surrounding newlines are stripped and one space is appended per line),
    split into per-voice sections, and every section is labelled with a
    speaker or with the narrator.

    Args:
        txt_file_name: path of the text file to process.
        characters_in_story: list of character names to look for when
            attributing dialogue.
        narrator_of_story: label used for every non-dialogue section.

    Returns:
        A list of (text, voice) tuples in reading order.
    """
    # Open the file that was passed in, not the notebook-level `text_file`.
    with open(txt_file_name, encoding="utf8") as f:
        my_text = "".join(line.strip("\n") + " " for line in f)

    my_voice_sections = break_by_character(break_text(my_text))
    my_voice_dic = make_dictionary(my_voice_sections)
    my_dialogue_dictionary = label_dialogue(my_voice_dic, characters_in_story, False)

    # Pass the narrator argument through instead of the global `narrator`.
    my_labelled_text = label_all_text(my_voice_dic, my_dialogue_dictionary, narrator_of_story)

    return my_labelled_text

In [678]:
# Run the whole pipeline on the sample file and print one (text, voice) tuple per line.
my_final_product = entire_process(text_file, characters, narrator)
for part in my_final_product:
    print(part)

('“Good Morning!”', 'Bilbo')
('said Bilbo, and he meant it. The sun was shining, and the grass was very green. But Gandalf looked at him from under long bushy eyebrows that stuck out further than the brim of his shady hat.', 'Tolkien')
('“What do you mean?”', 'Gandalf')
('he said.', 'Tolkien')
('“Do you wish me a good morning, or mean that it is a good morning whether I want it or not; or that you feel good this morning; or that it is a morning to be good on?”', 'Gandalf')
('“All of them at once,”', 'Bilbo')
('said Bilbo.', 'Tolkien')
('“And a very fine morning for a pipe of tobacco out of doors, into the bargain. If you have a pipe about you, sit down and have a fill of mine! There’s no hurry, we have all the day before us!”', 'Bilbo')
('Then Bilbo sat down on a seat by his door, crossed his legs, and blew out a beautiful grey ring of smoke that sailed up into the air without breaking and floated away over The Hill.', 'Tolkien')
('“Very pretty!”', 'Gandalf')
('said Gandalf.', 'Tolkien