### Step 1: extracting dialogs and explicit speakers from raw text (deterministic method) => done in Preprocessing.py

In [157]:
import pickle
from src.character import characters

with open("corpus/dataset.pkl", 'rb') as file:
    annotated_lines = pickle.load(file)
    
annotated_lines[0]

{'only_utterance_article': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
 'only_utterance_us': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
 'parts': [{'text': 'My dear Mr_Bennet,', 'utterance': True},
  {'speaker_function': 'lady',
   'speaker_gender': 'F',
   'speaker_name': None,
   'text': ' said his lady to him one day, ',
   'utterance': False},
  {'text': 'have you heard that Netherfield Park is let at last?',
   'utterance': True}],
 'source': "``My dear Mr_Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
 'target': 'Mrs_Bennet'}

### Step 2: extracting features

In [109]:
import pandas as pd

In [127]:
features = ["character_freq", 
            "character_previous_mention",
            "character_dialog_mention",
            "character_vocal_mention",
            "character_narrator_mention",
            "gender_as_supposed",
            "character_spoke_last",
            "character_spoke_before_last",
            "character_is_target",
            "character_last_target",
            "character_already_spoke"]

columns = ["dialog", "speaker"] + [feature + "_" + character.name for feature in features for character in characters]

In [128]:
names = [character.name for character in characters]

In [129]:
import traceback, sys

In [130]:
import string
def split_words(text):
    "remove all punctuation and split by spaces"
    table = str.maketrans({key: None for key in string.punctuation})
    return text.translate(table) .split(" ")

In [131]:
dialogs = open("./corpus/curated_dialogs.txt", "r").read().split('\n')[:-1]
dataset = pd.DataFrame([], columns=columns)

dialog_index = 0
line_idx = 0
last_speaker = ""
before_last_speaker = ""
last_target = ""
previous_mentions = set()
for count, (phrase, annotated_line) in enumerate(zip(dialogs, annotated_lines)):
    try:
        dialog, speaker, text = phrase.split("\t")

        if int(dialog) > dialog_index:
            character_dialog_mentions = {name: False for name in names}
            spokers = set()
            last_speaker = ""
            before_last_speaker = ""
            last_target = ""
            dialog_index = int(dialog)
            previous_mentions = set()
        
        line = {"dialog": int(dialog), "speaker": speaker}
        previous_mentions = vocal_mentions
        vocal_mentions = set(names).intersection(set(split_words(text)))
        
        words = set([word for part in annotated_line['parts'] for word in
                     split_words(part['text']) if not part['utterance']])
        narrator_mentions = set(names).intersection(words)
            
        for idx, character in enumerate(characters):
            line["character_already_spoke_" + character.name] = character.name in spokers
            line["character_freq_" + character.name] = 0.0
            
            line["character_previous_mention_" + character.name] = character.name in previous_mentions
            line["character_vocal_mention_" + character.name] = character.name in vocal_mentions
            line["character_narrator_mention_" + character.name] = character.name in narrator_mentions
            line["character_dialog_mention_" + character.name] = character_dialog_mentions[character.name]
                
            character_dialog_mentions[character.name] = character_dialog_mentions[character.name] or\
                                                        character.name in vocal_mentions          or\
                                                        character.name in narrator_mentions
            supposed_gender = None
            for part in annotated_line["parts"]:
                if("speaker_gender" in part and part["speaker_gender"] is not None):
                    supposed_gender = part["speaker_gender"]
            
            if(supposed_gender is None):
                gender_as_supposed = 0.5
            elif(supposed_gender == character.gender):
                gender_as_supposed = 1
            else:
                gender_as_supposed = 0
            line["gender_as_supposed_" + character.name] = gender_as_supposed
                
            line["character_last_target_" + character.name] = True if last_target == character.name else False
            line["character_spoke_last_" + character.name] = True if last_speaker == character.name else False
            line["character_spoke_before_last_" + character.name] = True if before_last_speaker == character.name else False
            
            if(annotated_line["target"] is not None and annotated_line["target"] == character.name):
                character_is_target = True
            else:
                character_is_target = False
            line["character_is_target_" + character.name] = character_is_target
        
        spokers.add(speaker)
        before_last_speaker = last_speaker
        last_speaker = speaker
        dataset.loc[line_idx] = line
        line_idx += 1
    except Exception as e:
        print(traceback.format_exception(None, # <- type(e) by docs, but ignored 
                                     e, e.__traceback__),
          file=sys.stderr, flush=True)
        print("line {}, caused a problem: {}".format(count, e))
dataset["dialog"] = dataset.dialog.astype(int)

In [132]:
dataset.head()

Unnamed: 0,dialog,speaker,character_freq_Mrs_Annesley,character_freq_Elizabeth_Bennet,character_freq_Jane_Bennet,character_freq_Lydia_Bennet,character_freq_Kitty_Bennet,character_freq_Mary_Bennet,character_freq_Mrs_Bennet,character_freq_Caroline_Bingley,...,character_already_spoke_Mr_Jones,character_already_spoke_Mr_Hurst,character_already_spoke_Mr_Morris,character_already_spoke_Mr_Philips,character_already_spoke_Mr_Pratt,character_already_spoke_Mr_Robinson,character_already_spoke_Mr_Stone,character_already_spoke_Old_Mr_Wickham,character_already_spoke_Sir_William,character_already_spoke_Mr_Wickham
0,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,1,Mr_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


### Step 3: Train Valid Test split

In [133]:
import numpy as np
np.random.seed(9295)
dialogs = np.arange(dataset.dialog.max())
np.random.shuffle(dialogs)

In [134]:
b1 = len(dialogs) * 8 // 10
b2 = len(dialogs) * 9 // 10
train_dialogs = dialogs[:b1]
valid_dialogs = dialogs[b1:b2]
test_dialogs  = dialogs[b2:]
print("Train set contains {} dialogs.".format(len(train_dialogs)))
print("Valid set contains {} dialogs.".format(len(valid_dialogs)))
print("Test set contains {} dialogs.".format(len(test_dialogs)))

Train set contains 48 dialogs.
Valid set contains 6 dialogs.
Test set contains 7 dialogs.


In [135]:
train_dataset = dataset[dataset.dialog.isin(train_dialogs)]
valid_dataset = dataset[dataset.dialog.isin(valid_dialogs)]
test_dataset  = dataset[dataset.dialog.isin(test_dialogs)]

compute the frequencies based on the train set

In [136]:
pd.options.mode.chained_assignment = None  # default='warn'
for name in names:
    freq = len(train_dataset[train_dataset["speaker"] == name]) / len(train_dataset.index)
    train_dataset["character_freq_" + name] = freq
    valid_dataset["character_freq_" + name] = freq
    test_dataset["character_freq_" + name]  = freq

### Step 4: Training

In [137]:
from sklearn.ensemble import GradientBoostingClassifier

In [146]:
GB = GradientBoostingClassifier(learning_rate=0.02, n_estimators=500, subsample=0.95, min_samples_split=2,
                                min_samples_leaf=2, max_depth=4, verbose=2)

In [147]:
y = train_dataset.speaker.values
X = train_dataset.drop(["dialog", "speaker"], axis=1).values.astype(np.float32)

In [148]:
GB.fit(X, y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        2807.1687          26.4646           44.53s
         2        2502.4773          15.3370           44.19s
         3        2281.2072          11.9679           45.43s
         4        2102.6851           9.5275           46.81s
         5        1953.7560           7.9367           48.34s
         6        1825.9792           6.8071           50.77s
         7        1713.3813           5.7712           52.01s
         8        1612.4555           5.2153           53.12s
         9        1524.7259           4.8053           54.15s
        10        1441.2118           4.1829           54.81s
        11        1369.2039           3.9792           55.49s
        12        1297.0401           3.4490           55.98s
        13        1237.1015           3.3175           56.55s
        14        1179.1204           3.1400           57.05s
        15        1124.3536           2.8732           57.61s
       

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.02, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=2,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=None,
              subsample=0.95, verbose=2, warm_start=False)

### Step 5: Results

In [175]:
pred = []
truth = []

dialogs_done = set()
last_speaker = ""
spokers = set()
for idx in valid_dataset.index:
    speaker_name = None
    for part in annotated_lines[idx]["parts"]:
        if 'speaker_name' in part and part['speaker_name'] is not None:
            speaker_name = part['speaker_name']
            break
    if speaker_name is not None:
        speaker = speaker_name
    else:
        line = valid_dataset.loc[idx]
        if line.dialog not in dialogs_done:
            spokers = set()
            last_speaker = ""
            dialogs_done.add(line.dialog)

        for character in characters:
            line["character_already_spoke_" + character.name] = character.name in spokers
            line["character_spoke_last_" + character.name] = True if last_speaker == character.name else False

        truth.append(speaker)

        speaker = GB.predict(line.values[2:].astype(np.float32).reshape(1,-1))[0]
    pred.append(speaker)
    last_speaker = speaker
    spokers.add(speaker)

In [176]:
print("Precision: {:.02f}%".format(100*sum([p == t for p, t in zip(pred, truth)]) / len(truth)))

Precision: 31.73%


### Explicit mention of speaker

In [170]:
dialogs = open("./corpus/curated_dialogs.txt", "r").read().split('\n')[:-1]
found = 0
correct = 0
for count, (phrase, annotated_line) in enumerate(zip(dialogs, annotated_lines)):
    dialog, speaker, text = phrase.split("\t")
    speaker_name = None
    for part in annotated_line["parts"]:
        if 'speaker_name' in part and part['speaker_name'] is not None:
            speaker_name = part['speaker_name']
            break
    if speaker_name is not None:
        found += 1
        if speaker_name == speaker:
            correct += 1
print("identified speaker: {:.02f}".format(100*found/len(dialogs)))
print("correct identification: {:.02f}".format(100*correct/found))
print("total precision: {:.02f}".format(100*correct/len(dialogs)))

identified speaker: 25.19
correct identification: 87.73
total precision: 22.10
