In [1]:
import json
import pickle as pkl
import re
from nltk.translate import bleu_score
from tqdm import tqdm
import jiwer
from matplotlib import pyplot as plt

#### Load Source Data for Both the Episode and the Subtitles

In [2]:
# Load Open Subtitle
# Reads the two pickled subtitle dumps into memory; presumably 'en' is the
# English corpus and 'zh' the Chinese one (inferred from the file names — confirm).
# en_subtitle is sliced by integer position further down, so it is expected to
# be a sequence of subtitle lines.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only run this cell on pickle files from a trusted source.
with open('en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [3]:
# Load Memor Dataset
# The file is decoded explicitly as UTF-8: without an `encoding` argument,
# open() falls back to the platform locale, which can mis-decode or reject
# non-ASCII characters on some systems.
# `data` is iterated by key and indexed as data[key]['sentences'] further down.
with open('memor/data.json', encoding='utf-8') as f:
    data = json.load(f)

### Pre-Process Data for one episode

In [6]:
# Define sentence transformation
# Normalizes a sentence before it is compared by word error rate.
# The order of the steps matters:
#   - contractions are expanded while their apostrophes are still present,
#     i.e. before punctuation is removed;
#   - repeated spaces are collapsed only AFTER punctuation is removed, so the
#     gaps left behind by dropped punctuation (e.g. "a - b") do not survive
#     as double spaces in the normalized text.
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip()
])

In [7]:
# Process episode
# Step 1: gather the sentences of every dataset entry whose key, once stripped,
# has 'S01E01' as the text before its first underscore.
utterances = [
    sentence
    for key in data
    if key.strip().split('_')[0] == 'S01E01'
    for sentence in data[key]['sentences']
]

# Step 2: cut each utterance into clause-like segments. Leading/trailing
# ' ,.-!' characters are trimmed first, then the text is split on . , ? ! ;
# and every piece is normalized. Pieces that normalize to the empty string
# are discarded.
pattern = r'\.|\,|\?|\!|\;'
segments = []
for utterance in utterances:
    for piece in re.split(pattern, utterance.strip(' ,.-!')):
        normalized = transformation(piece)
        if normalized == "":
            continue
        segments.append(normalized)

In [8]:
# Process Subtitle Subset
# Only a window of the English subtitle corpus is searched: positions
# [base - bias, base + bias).
# NOTE(review): `base` is presumably the position in the corpus where this
# episode's subtitles are located — confirm how the value was obtained.
base = 4424331
bias = 400

subtitle_window = en_subtitle[base - bias: base + bias]

# Normalize every line in the window and keep the non-empty results.
subtitles = [
    normalized
    for normalized in map(transformation, subtitle_window)
    if normalized != ""
]

### Perform Alignment with Word Error Rate (WER)

In [9]:
def get_optimal_wer(ground_truth, hypothesis_pool):
    """Find the hypothesis with the lowest word error rate against a reference.

    Args:
        ground_truth: Reference sentence; passed as the first argument to
            ``jiwer.compute_measures``.
        hypothesis_pool: Non-empty, indexable collection of candidate
            sentences; each is passed as the second argument.

    Returns:
        A 4-tuple ``(best_wer, best_hypothesis, ground_truth, best_index)``
        where ``best_index`` is the position of ``best_hypothesis`` in
        ``hypothesis_pool``. When several candidates share the lowest WER,
        the earliest one is returned.

    Raises:
        ValueError: If ``hypothesis_pool`` is empty.
    """
    scores = [
        jiwer.compute_measures(ground_truth, hypothesis)['wer']
        for hypothesis in hypothesis_pool
    ]
    if not scores:
        raise ValueError("hypothesis_pool must contain at least one hypothesis")

    # min() over the indices returns the first index holding the lowest score.
    best_index = min(range(len(scores)), key=scores.__getitem__)

    # Return the reference that was actually passed in, not a variable from
    # the surrounding notebook namespace.
    return scores[best_index], hypothesis_pool[best_index], ground_truth, best_index

In [10]:
# Align every episode segment with its closest subtitle line (lowest WER)
# and report the matches that pass the filter below.
indexs = []  # subtitle index of each accepted match, in segment order
count = 0    # number of accepted matches
for utt in segments:
    result = get_optimal_wer(utt, subtitles)
    best_wer, best_index = result[0], result[-1]
    # Keep only matches whose WER lies strictly between 0 and 1.
    # NOTE(review): this also drops exact matches (WER == 0) — confirm that
    # excluding them is intended.
    if 0 < best_wer < 1:
        indexs.append(best_index)
        # Count once per matched segment (not once per printed field).
        count += 1
        for field in result:
            print(field)
        print('=='*50)
print(count)

0.08
if a photon is directed through a plane with two slits in it and either is observed it will not go through both
if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits
397
0.3333333333333333
if unobserved it will
if it is unobserved it will
398
0.125
if it is observed after it left the plane before it hits its target
if it is observed after it is left the plane but before it hits its target
399
0.25
agreed what is your point
what is your point
401
0.75
is this agreed
there is no point
7
0.36363636363636365
there is no point i just think it is a good idea for a tshirt
i just think it is a good idea for a tshirt
402
0.5
twentysix across is mcm
one across is aegean
404
0.75
eemeli what is it
eight down is nabakov
47
0.6666666666666666
nobody is
fourteen down is
61
0.6666666666666666
quit your yapping
move your finger
3
0.4
phylum which makes 14 across portauprince
which makes fourteen across portauprince
406
0.8
is th