### Step 0: Initialize knowledge about the characters

In [1]:
from src.character import characters

In [3]:
import re
import nltk
from nltk.parse import stanford

# Add the jar and model via their path (instead of setting environment variables).
# The paths are raw strings: in a normal string literal "\s" is an invalid
# escape sequence, which Python reports as a warning. The resulting values are
# unchanged.
#jar = r'.\stanford-parser-full-2015-01-30\stanford-parser.jar'
#model = r'.\stanford-parser-full-2015-01-30\stanford-parser.jar'

jar = r'.\stanford-parser-full-2017-06-09\stanford-parser-3.8.0.jar'
# NOTE(review): `model` points at the same jar as `jar`; confirm whether a
# separate models jar is expected here.
model = r'.\stanford-parser-full-2017-06-09\stanford-parser-3.8.0.jar'

# NOTE(review): the parsers are left disabled, but Step 1 passes `parser` and
# `dep_parser` to get_annotated_lines; re-enable these two lines before running
# Step 1, or skip it and load the pre-parsed data in Step 1 bis.
#parser = stanford.StanfordParser(model, jar, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8')
#dep_parser = stanford.StanfordDependencyParser(model, jar, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8')


In [4]:
from src.curation import curation
# Run the project's curation step on the character list.
# NOTE(review): presumably this is what produces ./corpus/curated_dialogs.txt,
# which Step 2 reads — confirm in src/curation.py.
curation(characters)

### Step 1: extracting dialogs and explicit speakers from raw text (deterministic method)

In [4]:
from src.explicit import get_annotated_lines

# Build the annotated lines with the two Stanford parsers.
# NOTE(review): `parser` and `dep_parser` are not defined in any cell above
# (their construction is commented out in the setup cell), so on a fresh kernel
# this call fails with NameError unless they are created first. Step 1 bis
# loads a pickled `annotated_lines` instead.
annotated_lines = get_annotated_lines(parser=parser, dep_parser=dep_parser)


``Why will you think so?  It must be his own doing. -- He is his own master.  But you do not know _all_.  I _will_ read you the passage which particularly hurts me.  I will have no reserves from _you_. [X] Darcy is impatient to see his sister, and to confess the truth, we are scarcely less eager to meet her again.  I really do not think Georgiana Darcy has her equal for beauty, elegance, and accomplishments; and the affection she inspires in Louisa and myself is heightened into something still more interesting, from the hope we dare to entertain of her being hereafter our sister.  I do not know whether I ever before mentioned to you my feelings on this subject, but I will not leave the country without confiding them, and I trust you will not esteem them unreasonable.  My brother admires her greatly already, he will have frequent opportunity now of seeing her on the most intimate footing, her relations all wish the connection as much as his own, and a sister's partiality is not mislead

### Step 1 bis: import pre-parsed data

In [27]:
import pickle

# Read the pickled annotated lines back from disk; this replaces the value
# produced by Step 1.
with open("corpus/dataset.pkl", 'rb') as pickle_handle:
    annotated_lines = pickle.load(pickle_handle)

# Display the first record to show the structure of one annotated line.
annotated_lines[0]

{'only_utterance_article': 'My dear Bennet, [X] have you heard that Netherfield Park is let at last?',
 'only_utterance_us': 'My dear Bennet, [X] have you heard that Netherfield Park is let at last?',
 'parts': [{'text': 'My dear Bennet,', 'utterance': True},
  {'speaker_gender': 'F',
   'speaker_name': None,
   'text': ' said his lady to him one day, ',
   'utterance': False},
  {'text': 'have you heard that Netherfield Park is let at last?',
   'utterance': True}],
 'source': "``My dear Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
 'target': 'MrsBennet'}

### Step 2: extracting features

In [69]:
import pandas as pd

In [75]:
# Feature families; each one is expanded into one column per character.
features = [
    "character_freq",
    "character_dialog_mention",
    "character_vocal_mention",
    "character_narrator_mention",
    "gender_as_supposed",
    "character_spoke_last",
    "character_is_target",
]

# Column layout: the two identifying columns first, then every feature family
# in turn, each expanded over `characters` in order ("<feature>_<name>").
columns = ["dialog", "speaker"]
columns += ["_".join((feature, character.name))
            for feature in features
            for character in characters]

In [76]:
# Character names, in the same order as `characters`.
names = [person.name for person in characters]

In [77]:
import traceback, sys

In [78]:
def split_words(text):
    """Split ``text`` into word tokens with punctuation stripped.

    Tokens are maximal runs of letters/digits, so a name followed by
    punctuation or wrapped in ``_emphasis_`` markers (``"Bennet,"``,
    ``"_Darcy_"``, ``"Darcy's"``) still yields the bare name and can be
    matched against the character names. Splitting on a single space left the
    punctuation attached (``"Bennet,"`` never equals ``"Bennet"``) and
    produced empty strings for leading, trailing or repeated spaces.

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: The tokens in order of appearance; an empty list for empty
        or punctuation-only input.
    """
    # [^\W_] matches any word character except the underscore.
    return re.findall(r"[^\W_]+", text)

In [85]:
# Build the feature table: one row per dialog line, holding the "dialog" id,
# the true "speaker", and one column per (feature, character) pair.
#
# Each entry of curated_dialogs.txt is "<dialog id>\t<speaker>\t<text>" and is
# paired positionally with the matching record of `annotated_lines`.
with open("./corpus/curated_dialogs.txt", "r") as dialog_file:
    # Drop the final split element (empty when the file ends with a newline).
    dialogs = dialog_file.read().split('\n')[:-1]

name_set = set(names)
rows = []  # row dicts; the DataFrame is built once, after the loop

dialog_index = 0
# Speaker of the previous line in the current dialog ("" at the start of a dialog).
last_speaker = ""
# Whether each character was already mentioned earlier in the current dialog.
# Initialised here so the first line works even if its dialog id is not > 0.
character_dialog_mentions = {name: False for name in names}

for count, (phrase, annotated_line) in enumerate(zip(dialogs, annotated_lines)):
    try:
        dialog, speaker, text = phrase.split("\t")
        dialog_id = int(dialog)

        if dialog_id > dialog_index:
            # A new dialog starts: forget earlier mentions and the previous speaker.
            character_dialog_mentions = {name: False for name in names}
            dialog_index = dialog_id
            last_speaker = ""

        parts = annotated_line['parts']

        # Names occurring in the line's text.
        vocal_mentions = name_set.intersection(split_words(text))

        # Names occurring in the non-utterance (narrator) parts of the line.
        narrator_words = {word
                          for part in parts if not part['utterance']
                          for word in split_words(part['text'])}
        narrator_mentions = name_set.intersection(narrator_words)

        # Gender attached to the line by the annotation, if any; when several
        # parts carry one, the last wins. It does not depend on the character,
        # so it is computed once per line.
        supposed_gender = None
        for part in parts:
            if part.get("speaker_gender") is not None:
                supposed_gender = part["speaker_gender"]

        target = annotated_line["target"]

        line = {"dialog": dialog_id, "speaker": speaker}
        for character in characters:
            char_name = character.name

            # Placeholder; the real frequency is filled in from the train split later.
            line["character_freq_" + char_name] = 0.0

            line["character_vocal_mention_" + char_name] = char_name in vocal_mentions
            line["character_narrator_mention_" + char_name] = char_name in narrator_mentions
            # Mentioned on an *earlier* line of this dialog (state before this line).
            line["character_dialog_mention_" + char_name] = character_dialog_mentions[char_name]

            character_dialog_mentions[char_name] = (character_dialog_mentions[char_name]
                                                    or char_name in vocal_mentions
                                                    or char_name in narrator_mentions)

            # 0.5 = no gender information, 1 = matches the character, 0 = contradicts it.
            if supposed_gender is None:
                gender_as_supposed = 0.5
            elif supposed_gender == character.gender:
                gender_as_supposed = 1
            else:
                gender_as_supposed = 0
            line["gender_as_supposed_" + char_name] = gender_as_supposed

            line["character_spoke_last_" + char_name] = last_speaker == char_name
            line["character_is_target_" + char_name] = (target is not None
                                                        and target == char_name)

        rows.append(line)
        # Remember who spoke, so the next line's "spoke_last" feature is meaningful.
        last_speaker = speaker
    except Exception as e:
        # Best effort: report the offending line and carry on with the rest.
        print("".join(traceback.format_exception(type(e), e, e.__traceback__)),
              file=sys.stderr, flush=True)
        print("line {}, caused a problem: {}".format(count, e))

dataset = pd.DataFrame(rows, columns=columns)
dataset["dialog"] = dataset.dialog.astype(int)

In [87]:
# Preview the first five rows of the feature table.
dataset.head(n=5)

Unnamed: 0,dialog,speaker,character_freq_MrsBennet,character_freq_Charlotte,character_freq_MrCollins,character_freq_Mary,character_freq_Lydia,character_freq_MissBingley,character_freq_Bingley,character_freq_Darcy,...,character_is_target_MrCollins,character_is_target_Mary,character_is_target_Lydia,character_is_target_MissBingley,character_is_target_Bingley,character_is_target_Darcy,character_is_target_Kitty,character_is_target_Jane,character_is_target_Bennet,character_is_target_Elizabeth
0,1,MrsBennet,0.109737,0.017002,0.030139,0.00541,0.027048,0.042504,0.027048,0.099691,...,False,False,False,False,False,False,False,False,False,False
1,1,MrsBennet,0.109737,0.017002,0.030139,0.00541,0.027048,0.042504,0.027048,0.099691,...,False,False,False,False,False,False,False,False,False,False
2,1,MrsBennet,0.109737,0.017002,0.030139,0.00541,0.027048,0.042504,0.027048,0.099691,...,False,False,False,False,False,False,False,False,False,False
3,1,Bennet,0.109737,0.017002,0.030139,0.00541,0.027048,0.042504,0.027048,0.099691,...,False,False,False,False,False,False,False,False,True,False
4,1,MrsBennet,0.109737,0.017002,0.030139,0.00541,0.027048,0.042504,0.027048,0.099691,...,False,False,False,False,False,False,False,False,False,False


### Step 3: Train / validation / test split

In [106]:
import numpy as np
# Fix the seed so the shuffled split is reproducible.
np.random.seed(9295)
# Shuffle the dialog ids that actually occur in the dataset.
# np.arange(dataset.dialog.max()) produced 0..max-1; since the ids in the table
# start at 1, that included an id with no rows and left the last dialog out of
# every split.
dialogs = np.sort(dataset.dialog.unique())
np.random.shuffle(dialogs)

In [107]:
# Cut the shuffled dialog ids 80% / 10% / 10% into train / validation / test.
n_dialogs = len(dialogs)
b1 = (n_dialogs * 8) // 10  # end of the train slice
b2 = (n_dialogs * 9) // 10  # end of the validation slice
train_dialogs, valid_dialogs, test_dialogs = dialogs[:b1], dialogs[b1:b2], dialogs[b2:]

In [111]:
# Select the rows of each split by dialog id, so all lines of a dialog land in
# the same split.
# .copy() makes each split an independent frame: columns are assigned on these
# frames afterwards, and doing that on a bare selection of `dataset` is what
# raises pandas' SettingWithCopyWarning.
train_dataset = dataset[dataset.dialog.isin(train_dialogs)].copy()
valid_dataset = dataset[dataset.dialog.isin(valid_dialogs)].copy()
test_dataset  = dataset[dataset.dialog.isin(test_dialogs)].copy()

Compute each character's speaker frequency on the training set only, then apply the same values to the training, validation and test sets.

In [114]:
# Silence pandas' chained-assignment warning for the column writes below.
pd.options.mode.chained_assignment = None  # default='warn'

# Share of training lines spoken by each character, written as a constant
# column into all three splits.
n_train_lines = len(train_dataset.index)
for name in names:
    n_spoken = int((train_dataset["speaker"] == name).sum())
    freq = n_spoken / n_train_lines
    freq_column = "character_freq_" + name
    for split_frame in (train_dataset, valid_dataset, test_dataset):
        split_frame[freq_column] = freq