### Step 1: extracting dialogs and explicit speakers from raw text (deterministic method) => done in Preprocessing.py

In [1]:
import pickle
from src.character import characters

# Load the pre-annotated lines written by the preprocessing step.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("corpus/dataset.pkl", "rb") as pickle_file:
    annotated_lines = pickle.load(pickle_file)

# Show the first annotated line as a sample of the record layout.
annotated_lines[0]

{'only_utterance_article': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
 'only_utterance_us': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
 'parts': [{'text': 'My dear Mr_Bennet,', 'utterance': True},
  {'speaker_function': 'lady',
   'speaker_gender': 'F',
   'speaker_name': None,
   'text': ' said his lady to him one day, ',
   'utterance': False},
  {'text': 'have you heard that Netherfield Park is let at last?',
   'utterance': True}],
 'source': "``My dear Mr_Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
 'target': 'Mrs_Bennet'}

### Step 2: extracting features

In [2]:
import pandas as pd

In [3]:
# Feature families; each one yields one column per character,
# named "<feature>_<character name>".
features = [
    "character_freq",
    "character_dialog_mention",
    "character_vocal_mention",
    "character_narrator_mention",
    "gender_as_supposed",
    "character_spoke_last",
    "character_is_target",
]

# Column layout of the dataset: the two identifying columns first, then the
# per-character feature columns in feature-major order.
columns = ["dialog", "speaker"]
columns.extend(
    feature + "_" + character.name
    for feature in features
    for character in characters
)

In [4]:
# Names of all known characters, in the same order as `characters`.
names = [person.name for person in characters]

In [5]:
import traceback, sys

In [6]:
import string
def split_words(text):
    """Remove punctuation from ``text`` and split the result on single spaces.

    Underscores are kept on purpose: character names in this corpus are
    written with underscores (e.g. ``Mr_Bennet``), and stripping them would
    turn the token into ``MrBennet``, which can never match an entry of
    ``names``.

    Args:
        text: the string to tokenize.

    Returns:
        A list of tokens. Because the split is on a single space, leading,
        trailing or repeated spaces produce empty-string tokens.
    """
    # string.punctuation contains "_"; exclude it so names survive intact.
    removable = string.punctuation.replace("_", "")
    table = str.maketrans("", "", removable)
    return text.translate(table).split(" ")

In [7]:
# Build one feature row per curated dialog line.
# Each line of curated_dialogs.txt has the form "<dialog id>\t<speaker>\t<text>"
# and is paired positionally with the matching entry of `annotated_lines`.
with open("./corpus/curated_dialogs.txt", "r") as dialog_file:
    dialogs = dialog_file.read().split('\n')
# Only discard the empty string produced by a trailing newline; a file without
# a trailing newline keeps its last real line.
if dialogs and dialogs[-1] == "":
    dialogs.pop()

records = []          # one dict per successfully processed line
dialog_index = None   # id of the dialog currently being processed
character_dialog_mentions = {name: False for name in names}
last_speaker = ""     # speaker of the previous line within the current dialog
name_set = set(names)

# NOTE: zip() stops at the shorter input, so the two sources are assumed to be aligned.
for count, (phrase, annotated_line) in enumerate(zip(dialogs, annotated_lines)):
    try:
        # maxsplit=2 keeps any tab inside the utterance text intact.
        dialog, speaker, text = phrase.split("\t", 2)
        dialog_id = int(dialog)

        if dialog_id != dialog_index:
            # New dialog: forget who was mentioned and who spoke last, so that no
            # state carries over from one dialog to the next.
            character_dialog_mentions = {name: False for name in names}
            last_speaker = ""
            dialog_index = dialog_id

        line = {"dialog": dialog_id, "speaker": speaker}

        # Characters named in the whole line.
        vocal_mentions = name_set.intersection(split_words(text))

        # Characters named in the narrator (non-utterance) parts only.
        narrator_words = {word
                          for part in annotated_line['parts'] if not part['utterance']
                          for word in split_words(part['text'])}
        narrator_mentions = name_set.intersection(narrator_words)

        # Gender hinted by the annotation; the last part carrying one wins.
        # This does not depend on the character, so compute it once per line.
        supposed_gender = None
        for part in annotated_line["parts"]:
            if part.get("speaker_gender") is not None:
                supposed_gender = part["speaker_gender"]

        target = annotated_line["target"]

        for character in characters:
            cname = character.name

            # Placeholder; the real frequency is filled in from the training split later.
            line["character_freq_" + cname] = 0.0

            line["character_vocal_mention_" + cname] = cname in vocal_mentions
            line["character_narrator_mention_" + cname] = cname in narrator_mentions
            # Mentioned in an *earlier* line of this dialog (current line not included).
            line["character_dialog_mention_" + cname] = character_dialog_mentions[cname]

            if supposed_gender is None:
                gender_as_supposed = 0.5   # no hint available
            elif supposed_gender == character.gender:
                gender_as_supposed = 1
            else:
                gender_as_supposed = 0
            line["gender_as_supposed_" + cname] = gender_as_supposed

            line["character_spoke_last_" + cname] = last_speaker == cname
            line["character_is_target_" + cname] = target is not None and target == cname

        # Commit the row and the running state only once the row was built successfully.
        records.append(line)
        for mentioned in vocal_mentions | narrator_mentions:
            character_dialog_mentions[mentioned] = True
        last_speaker = speaker
    except Exception as e:
        # Best effort: report the faulty line and keep going.
        traceback.print_exception(type(e), e, e.__traceback__, file=sys.stderr)
        sys.stderr.flush()
        print("line {}, caused a problem: {}".format(count, e))

# Build the frame in one go instead of growing it row by row.
dataset = pd.DataFrame(records, columns=columns)
dataset["dialog"] = dataset.dialog.astype(int)

In [8]:
# Preview the first rows of the feature table.
dataset.head()

Unnamed: 0,dialog,speaker,character_freq_Mrs_Annesley,character_freq_Elizabeth_Bennet,character_freq_Jane_Bennet,character_freq_Lydia_Bennet,character_freq_Kitty_Bennet,character_freq_Mary_Bennet,character_freq_Mrs_Bennet,character_freq_Caroline_Bingley,...,character_is_target_Mr_Jones,character_is_target_Mr_Hurst,character_is_target_Mr_Morris,character_is_target_Mr_Philips,character_is_target_Mr_Pratt,character_is_target_Mr_Robinson,character_is_target_Mr_Stone,character_is_target_Old_Mr_Wickham,character_is_target_Sir_William,character_is_target_Mr_Wickham
0,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,1,Mr_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


### Step 3: Train / validation / test split

In [9]:
import numpy as np
# Fixed seed so the split is reproducible.
np.random.seed(9295)
# Shuffle the dialog ids that actually occur in the dataset.
# np.arange(dataset.dialog.max()) would yield 0..max-1, which drops the last
# dialog and includes ids that do not exist when numbering starts at 1.
# Sorting first makes the shuffle independent of row order.
dialogs = np.sort(dataset.dialog.unique())
np.random.shuffle(dialogs)

In [10]:
# 80% / 10% / 10% split of the shuffled dialog ids (integer boundaries).
n_dialogs = len(dialogs)
b1 = n_dialogs * 8 // 10
b2 = n_dialogs * 9 // 10

train_dialogs, valid_dialogs, test_dialogs = dialogs[:b1], dialogs[b1:b2], dialogs[b2:]

# Report the size of each split.
for label, subset in (("Train", train_dialogs),
                      ("Valid", valid_dialogs),
                      ("Test", test_dialogs)):
    print("{} set contains {} dialogs.".format(label, len(subset)))

Train set contains 48 dialogs.
Valid set contains 6 dialogs.
Test set contains 7 dialogs.


In [11]:
# Select the rows of each split by dialog id.
# Explicit copies: these frames are modified afterwards, and writing into a
# boolean-mask selection of `dataset` is what raises SettingWithCopyWarning.
train_dataset = dataset[dataset.dialog.isin(train_dialogs)].copy()
valid_dataset = dataset[dataset.dialog.isin(valid_dialogs)].copy()
test_dataset  = dataset[dataset.dialog.isin(test_dialogs)].copy()

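Compute each character's speaking frequency (share of lines spoken) from the training set only, then use that same value in the training, validation and test sets.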

In [12]:
# Silence pandas' chained-assignment warning for the column writes below.
pd.options.mode.chained_assignment = None  # default='warn'

# The share of training lines spoken by each character is written to all three
# splits, so validation and test rows only see training statistics.
train_size = len(train_dataset.index)
for name in names:
    freq = len(train_dataset[train_dataset["speaker"] == name]) / train_size
    for subset in (train_dataset, valid_dataset, test_dataset):
        subset["character_freq_" + name] = freq

### Step 4: Training

### Step 5: Results