In [1]:
from utility import *
from constants import *
import constants
from tqdm import tqdm
from unidecode import unidecode
from prefixspan import PrefixSpan
import re
import copy
from collections import defaultdict

In [2]:
# Load every split, run the pattern-based clean-up, then filter instance by instance.
# `discarded` starts at the count returned by extract_text and keeps growing below.
instances, discarded = extract_text(splits, data_path, non_ascii_exceptions, bad_ascii)
instances = clean(instances, splits, patterns_remove)
discard_symbol = 0  # instances dropped because of a disallowed symbol in BookText
for split in constants.splits:
    new_insts_split = []
    for ins in tqdm(instances[split]):
        match_result, ratio = align_strings(ins["BookText"], ins["ASRTranscript"])
        # Drop pairs whose alignment ratio is below the configured threshold.
        if ratio < constants.threshold_ratio:
            discarded += 1
            continue
        # Drop instances whose written text contains a character that is neither
        # alphanumeric, a space, nor listed in one of the two exception collections.
        # A generator (instead of a list) lets any() stop at the first offender.
        if any(not x.isalnum()
               and x not in constants.ascii_exceptions
               and x not in constants.non_ascii_exceptions
               for x in ins["BookText"].strip().replace(" ", "")):
            discard_symbol += 1
            discarded += 1
            continue

        ins = clean_mismatched(ins, match_result, constants.ascii_exceptions, constants.non_ascii_exceptions)
        # clean_mismatched may have changed the texts, so re-align and store the
        # fresh alignment on the instance.
        ins["match_result"], ins["ratio"] = align_strings(ins["BookText"], ins["ASRTranscript"])
        new_insts_split.append(ins)
    instances[split] = new_insts_split

# Compute the totals once and reuse them in every summary line.
kept = sum(len(insts) for insts in instances.values())
total = discarded + kept
print("original number of instances: ", total)
print("After cleaning: ", kept)
print("dropped instances: ", discarded)
print("dropped instances because of nonexception ascii symbols:", discard_symbol)
# The value printed is a fraction in [0, 1]; guard against an empty corpus.
print("dropped percentage: ", discarded / total if total else 0.0)

Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_small.jsonl
number of instances for the small split: 114581
Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_medium.jsonl
number of instances for the medium split: 1047513
Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_dev.jsonl
number of instances for the dev split: 5129
Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_test_clean.jsonl
number of instances for the test_clean split: 2433
Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_test_clean_large.jsonl
number of instances for the test_clean_large split: 24960
Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_test_other.jsonl
number of instances for the test_other split: 2718
Extracting text from: /home/mazhang/pt/libriheavy/libriheavy_cuts_test_other_large.jsonl
number of instances for the test_other_large split: 23833
total number of instances before discarding selected instances:

100%|██████████| 114581/114581 [00:22<00:00, 5116.57it/s]
100%|██████████| 1047513/1047513 [03:53<00:00, 4490.76it/s]
100%|██████████| 5129/5129 [00:01<00:00, 4932.69it/s]
100%|██████████| 2433/2433 [00:00<00:00, 5143.69it/s]
100%|██████████| 24960/24960 [00:04<00:00, 5171.58it/s]
100%|██████████| 2718/2718 [00:00<00:00, 4976.59it/s]
100%|██████████| 23833/23833 [00:04<00:00, 4976.91it/s]

original number of instances:  1285094
After cleaning:  1203004
dropped instances:  82090
dropped instances because of nonexception ascii symbols: 12424
dropped percentage:  0.06387859565136869





In [3]:
# Collect the mismatch patterns from the cleaned instances and preview the first 100.
# NOTE(review): judging by the stored output, each pattern is a token list with "|"
# separating the two sides of a mismatch — confirm against get_frequent_patterns.
patterns = get_frequent_patterns(instances)
print(len(patterns))
preview = patterns[:100]
for shown in preview:
    print(shown)

1407088
['an', "'", '|', 'and']
['lived', '|', 'live']
['child', '|', 'girl']
['two', '|', 'too']
["'", '|']
['an', "'", '|', 'and']
["'", '|']
["'", '|']
['an', "'", 'git', '|', 'and', 'get']
["'", '|']
['frame', '|', 'framed']
['re', '|', 'rie']
['an', "'", '|', 'and']
['beautiful', 'creatures', '|', 'beautifullest', 'creeturs']
['|', 'in']
['an', "'", '|', 'and']
["'", '|']
["'", 's', '|', 'is']
['an', "'", '|', 'and']
['an', "'", '|', 'and']
['down', '|', 'adown']
['|', 'had']
['host', '|', 'hostst']
["'", '|']
['jes', "'", 'natcherly', '|', 'just', 'naturally']
["'", '|']
['an', "'", '|', 'and']
["'", '|']
["'", '|']
['an', "'", '|', 'and']
['an', "'", '|', 'and']
['dive', '|', 'dives']
['an', "'", '|', 'and']
['knew', '|', 'know']
['knew', '|', 'know']
['diff', "'", 'rence', '|', 'difference']
['an', "'", '|', 'and']
['an', "'", '|', 'and']
['divers', '|', 'a', 'verse']
["'", '|']
['acrost', 'a', '|', 'across', 'the']
['marvelous', '|', 'marvellous']
['mayre', '|', 'mary']
["'", 

In [4]:
# Mine sub-sequences that occur at least 20 times, most frequent first.
solver = PrefixSpan(patterns)
frequent_patterns = sorted(
    solver.frequent(20),
    key=lambda entry: entry[0],
    reverse=True,
)
print(len(frequent_patterns))

75898


In [5]:
# Keep only mined patterns that contain the "|" separator and at least one more token.
filtered_frequent_patterns = []
for entry in frequent_patterns:
    tokens = entry[1]
    if len(tokens) > 1 and "|" in tokens:
        filtered_frequent_patterns.append(entry)
print(len(filtered_frequent_patterns))

38301


In [6]:
# Spot check: list every filtered pattern that has the token "can" on either side.
patterns_with_can = [entry for entry in filtered_frequent_patterns if "can" in entry[1]]
for entry in patterns_with_can:
    print(entry)

(1337, ['|', 'can'])
(1207, ['can', '|'])
(669, ['can', '|', 'cannot'])
(634, ['can', 'not', '|'])
(634, ['can', 'not', '|', 'cannot'])
(217, ['can', '|', 'could'])
(165, ['could', '|', 'can'])
(161, ['kin', '|', 'can'])
(129, ['|', 'can', "'"])
(126, ['|', 'can', "'", 't'])
(126, ['|', 'can', 't'])
(77, ['can', "'", '|'])
(63, ['can', "'", 't', '|'])
(63, ['can', 't', '|'])
(61, ['cannot', '|', 'can'])
(59, ['c', '|', 'can'])
(59, ['cant', '|', 'can'])
(58, ["'", '|', 'can'])
(56, ['cant', '|', 'can', "'"])
(56, ['cant', '|', 'can', "'", 't'])
(56, ['cant', '|', 'can', 't'])
(45, ['|', 'can', 'never'])
(40, ['|', 'i', 'can'])
(38, ['ken', '|', 'can'])
(33, ['|', 'you', 'can'])
(27, ['|', 'can', 'tell'])
(26, ['|', 'can', 'be'])
(26, ['|', 'can', 'you'])
(25, ['can', "'", 't', '|', 'cannot'])
(25, ['can', "'", '|', 'cannot'])
(25, ['can', 't', '|', 'cannot'])
(23, ['cannot', '|', 'can', "'"])
(23, ['cannot', '|', 'can', "'", 't'])
(23, ['cannot', '|', 'can', 't'])
(22, ['|', 'can', 'i'

In [7]:
# Group the filtered patterns by the tokens that appear left of the "|" separator.
# Key: the left-hand tokens joined with single spaces ("" when nothing precedes "|").
# Value: every (count, tokens) pattern sharing that left-hand side.
sequence_patterns_dict = defaultdict(list)
for pattern in filtered_frequent_patterns:
    sep_idx = pattern[1].index("|")
    left = pattern[1][:sep_idx]
    sequence_patterns_dict[" ".join(left)].append(pattern)


In [8]:
# Number of distinct left-hand sides (keys) in the grouping.
len(sequence_patterns_dict)

11523

In [9]:
# Classify each left-hand side by how its patterns are distributed.
#  * keys_empty_rhs: the group holds at most one pattern.
#    NOTE(review): presumably that single pattern is the bare "<left> |" entry — confirm.
#  * keys_high_confidence: the second most frequent pattern has more than 90% of the
#    count of the most frequent one.
# Side effect: every group list is sorted in place by descending count.
keys_high_confidence = []
keys_empty_rhs = []
for key, value in sequence_patterns_dict.items():
    value.sort(key=lambda entry: entry[0], reverse=True)
    if len(value) <= 1:
        keys_empty_rhs.append(key)
        continue
    top_entry, second_entry = value[0], value[1]
    highest_confidence = second_entry[0] / top_entry[0]
    if highest_confidence > 0.9:
        keys_high_confidence.append(key)
    

In [10]:
# Remove the groups flagged by the previous cell from the dictionary (in place).
keys_to_pop = keys_empty_rhs+keys_high_confidence
for key in keys_to_pop:
    # A default makes the cell safe to re-run: keys already removed are skipped
    # instead of raising KeyError.
    sequence_patterns_dict.pop(key, None)
print(len(sequence_patterns_dict))

3613


In [12]:
# How many left-hand sides were flagged as keys_empty_rhs.
len(keys_empty_rhs)

4869

In [11]:
# Display the grouping (left-hand side -> list of (count, tokens) patterns).
# NOTE(review): this renders the whole dictionary; consider showing a slice instead.
sequence_patterns_dict

defaultdict(<function __main__.<lambda>()>,
            {"'": [(230983, ["'", '|']),
              (6177, ["'", '|', 'and']),
              (2038, ["'", '|', 'is']),
              (1853, ["'", '|', 'of']),
              (1742, ["'", '|', 'the']),
              (1482, ["'", '|', 'am']),
              (1461, ["'", '|', 'you']),
              (1350, ["'", '|', 'have']),
              (1344, ["'", '|', 'to']),
              (1060, ["'", '|', 'him']),
              (1044, ["'", '|', 'are']),
              (869, ["'", '|', 'a']),
              (861, ["'", '|', 'mister']),
              (858, ["'", '|', 'in']),
              (856, ["'", '|', 'going']),
              (852, ["'", '|', 'will']),
              (629, ["'", '|', 'it']),
              (618, ["'", '|', 'monsieur']),
              (609, ["'", '|', 'he']),
              (593, ["'", '|', 'do']),
              (493, ["'", '|', 'had']),
              (483, ["'", '|', 'missus']),
              (481, ["'", '|', 'i']),
              (466, ["

In [5]:
# Frequent patterns whose first "|" is the final token, i.e. nothing follows the separator.
right_empty_frequent_patterns = []
for entry in frequent_patterns:
    tokens = entry[1]
    if "|" not in tokens or len(tokens) <= 1:
        continue
    if tokens.index("|") == len(tokens) - 1:
        right_empty_frequent_patterns.append(entry)
print(len(right_empty_frequent_patterns))

1718


In [8]:
# Display up to the first 10000 patterns that have nothing after the "|" separator.
right_empty_frequent_patterns[:10000]

[(230983, ["'", '|']),
 (52433, ['mr', '|']),
 (26320, ['mrs', '|']),
 (24164, ['the', '|']),
 (20376, ['a', '|']),
 (17196, ["'", "'", '|']),
 (14748, ['s', '|']),
 (13668, ['and', '|']),
 (12949, ["'", 's', '|']),
 (12918, ['in', '|']),
 (9758, ['d', '|']),
 (8760, ['an', '|']),
 (8247, ['to', '|']),
 (7811, ["'", 'd', '|']),
 (7406, ['of', '|']),
 (6250, ['is', '|']),
 (5557, ['an', "'", '|']),
 (5379, ['that', '|']),
 (5277, ['st', '|']),
 (5195, ['o', '|']),
 (5040, ['dr', '|']),
 (4663, ['i', '|']),
 (4642, ['it', '|']),
 (4042, ['m', '|']),
 (3600, ['on', '|']),
 (3568, ['at', '|']),
 (3451, ['t', '|']),
 (3401, ['he', '|']),
 (3112, ['had', '|']),
 (3038, ['one', '|']),
 (2981, ['as', '|']),
 (2920, ['for', '|']),
 (2912, ['this', '|']),
 (2877, ['o', "'", '|']),
 (2868, ['are', '|']),
 (2748, ['re', '|']),
 (2736, ['1', '|']),
 (2656, ['his', '|']),
 (2582, ['you', '|']),
 (2567, ['de', '|']),
 (2471, ['have', '|']),
 (2436, ['was', '|']),
 (2376, ['2', '|']),
 (2339, ['am', '

In [6]:
def is_subsequence(current_pattern, pattern_to_check):
    """Check whether ``pattern_to_check`` is an ordered subsequence of ``current_pattern``.

    Tokens of ``pattern_to_check`` must appear in ``current_pattern`` in the same
    order; other tokens may sit in between. Matching is greedy from the left, which
    finds a match whenever one exists, so a single pass is sufficient.

    Args:
        current_pattern: sequence of tokens to search in (may contain the "|" separator).
        pattern_to_check: sequence of tokens to look for.

    Returns:
        A 3-tuple ``(found, left_exclude, right_exclude)``:
          * on success, ``found`` is True, ``left_exclude`` lists the skipped tokens
            seen before a "|" was matched, and ``right_exclude`` lists the skipped
            tokens seen after a "|" was matched plus every token left over once the
            last token of ``pattern_to_check`` has been matched;
          * on failure, ``(False, None, None)``.
        An empty ``pattern_to_check`` always succeeds; since nothing is matched, all of
        ``current_pattern`` is reported in ``right_exclude``. (This case previously
        returned a bare ``True``, which broke callers that unpack three values.)
    """
    if len(pattern_to_check) == 0:
        return True, [], list(current_pattern)

    left_exclude = []
    right_exclude = []
    before_separator = True  # flips once a "|" has been matched in both patterns
    matched = 0              # number of tokens of pattern_to_check matched so far

    for position, token in enumerate(current_pattern):
        if token == pattern_to_check[matched]:
            if token == "|":
                before_separator = False
            matched += 1
            if matched == len(pattern_to_check):
                # Everything after the final match is unmatched by definition.
                right_exclude.extend(current_pattern[position + 1:])
                return True, left_exclude, right_exclude
        elif before_separator:
            left_exclude.append(token)
        else:
            right_exclude.append(token)

    return False, None, None

In [40]:
# Mutable [count, tokens] copies of the filtered patterns, longest token list first,
# so that counts can be adjusted without touching filtered_frequent_patterns.
processed_frequent_patterns = sorted(
    ([pattern[0], copy.deepcopy(pattern[1])] for pattern in filtered_frequent_patterns),
    key=lambda x: len(x[1]),
    reverse=True,
)
processed_frequent_patterns

In [41]:
# For every pattern (longest first), subtract its current count from each strictly
# shorter pattern that is a subsequence of it. Counts are modified in place, so a
# pattern's count may already have been reduced by the time it is used to subtract.
# NOTE(review): the stored results contain negative counts (e.g. -25), which suggests
# shorter patterns are being subtracted more than once — verify the intended semantics.
n_patterns = len(processed_frequent_patterns)
for idx, current_pattern in enumerate(processed_frequent_patterns):
    for i in range(idx + 1, n_patterns):
        shorter_pattern = processed_frequent_patterns[i]
        # Patterns of equal length cannot be proper subsequences of each other.
        if len(shorter_pattern[1]) == len(current_pattern[1]):
            continue
        issub, left, right = is_subsequence(current_pattern[1], shorter_pattern[1])
        if not issub:
            continue
        # Flag the cases where the subtraction will drive the count below zero.
        prefix = "Warning: " if shorter_pattern[0] < current_pattern[0] else ""
        print(prefix + f'subtracting {shorter_pattern[0]} by {current_pattern[0]} because \"{" ".join(shorter_pattern[1])}\" is a subsequence of \"{" ".join(current_pattern[1])}\"')
        shorter_pattern[0] -= current_pattern[0]

subtracting 213 by 206 because "obj 1 | to objection" is a subsequence of "obj 1 | to objection one"
subtracting 717 by 206 because "obj | to objection" is a subsequence of "obj 1 | to objection one"
subtracting 215 by 206 because "obj 1 | to" is a subsequence of "obj 1 | to objection one"
subtracting 212 by 206 because "1 | to one" is a subsequence of "obj 1 | to objection one"
subtracting 208 by 206 because "| to objection one" is a subsequence of "obj 1 | to objection one"
subtracting 2153 by 206 because "1 | one" is a subsequence of "obj 1 | to objection one"
subtracting 1134 by 206 because "obj | objection" is a subsequence of "obj 1 | to objection one"
subtracting 724 by 206 because "| to objection" is a subsequence of "obj 1 | to objection one"
subtracting 724 by 206 because "obj | to" is a subsequence of "obj 1 | to objection one"
subtracting 282 by 206 because "| to one" is a subsequence of "obj 1 | to objection one"
subtracting 229 by 206 because "1 | to" is a subsequence of 

In [30]:
# Display the [count, tokens] entries of processed_frequent_patterns.
processed_frequent_patterns

[[206, ['obj', '1', '|', 'to', 'objection', 'one']],
 [198, ['obj', '2', '|', 'to', 'objection', 'two']],
 [181, ['obj', '3', '|', 'to', 'objection', 'three']],
 [129, ['d', "'", 'ye', '|', 'do', 'you']],
 [125, ['mr', 'crisparkle', '|', 'mister', 'cris', 'sparkle']],
 [114, ['mrs', 'van', 'warmelo', '|', 'missus', 'von']],
 [101, ['mrs', 'delvile', '|', 'missus', 'd', "'"]],
 [414, ["'", 'an', "'", '|', 'and']],
 [-25, ['obj', '2', '|', 'objection', 'two']],
 [353, ['mr', 'grimm', '|', 'mister', 'grim']],
 [-26, ['obj', '3', '|', 'objection', 'three']],
 [314, ['we', "'", 'll', '|', 'will']],
 [297, ['an', "'", "'", '|', 'and']],
 [286, ['mrs', 'hurtle', '|', 'missus', 'hurdle']],
 [237, ['m', "'", 'sieur', '|', 'monsieur']],
 [236, ['s', "'", 'pose', '|', 'suppose']],
 [223, ['its', '|', 'it', "'", 's']],
 [223, ['asad', '|', 'as', "'", 'ad']],
 [217, ['mrs', 'assingham', '|', 'missus', 'asingham']],
 [214, ['heav', "'", 'n', '|', 'heaven']],
 [-182, ['obj', '2', '|', 'to', 'objectio

In [50]:
# Per left-hand side: subtract the count of each longer pattern from every shorter
# pattern whose right-hand side (the tokens after "|") is a subsequence of the longer
# pattern's right-hand side. Works on a deep copy, so sequence_patterns_dict is untouched.
sequence_patterns_dict_processed = copy.deepcopy(sequence_patterns_dict)
for group, items in sequence_patterns_dict_processed.items():
    # Entries may be tuples; turn them into lists so counts can be edited in place.
    for idx in range(len(items)):
        items[idx] = [items[idx][0], items[idx][1]]
    items.sort(key=lambda x: len(x[1]), reverse=True)
    # Right-hand sides are loop-invariant, so slice them once per group.
    right_sides = [item[1][item[1].index("|")+1:] for item in items]
    for idx in range(len(items)):
        current_pattern = items[idx]
        for i in range(idx+1, len(items)):
            result = is_subsequence(right_sides[idx], right_sides[i])
            # is_subsequence reports failure as (False, None, None), which is truthy
            # as a tuple; look at the flag itself. A bare bool is accepted as well.
            issub = result[0] if isinstance(result, tuple) else result
            if issub:
                items[i][0] -= current_pattern[0]

In [51]:
# Display the per-left-hand-side groups after the count subtraction.
sequence_patterns_dict_processed

{"'": [[153, ["'", '|', 'do', 'you']],
  [6177, ["'", '|', 'and']],
  [2038, ["'", '|', 'is']],
  [1853, ["'", '|', 'of']],
  [1742, ["'", '|', 'the']],
  [1482, ["'", '|', 'am']],
  [1308, ["'", '|', 'you']],
  [1350, ["'", '|', 'have']],
  [1344, ["'", '|', 'to']],
  [1060, ["'", '|', 'him']],
  [1044, ["'", '|', 'are']],
  [869, ["'", '|', 'a']],
  [861, ["'", '|', 'mister']],
  [858, ["'", '|', 'in']],
  [856, ["'", '|', 'going']],
  [852, ["'", '|', 'will']],
  [629, ["'", '|', 'it']],
  [618, ["'", '|', 'monsieur']],
  [609, ["'", '|', 'he']],
  [440, ["'", '|', 'do']],
  [493, ["'", '|', 'had']],
  [483, ["'", '|', 'missus']],
  [481, ["'", '|', 'i']],
  [466, ["'", '|', 'with']],
  [379, ["'", '|', 'your']],
  [365, ["'", '|', 'them']],
  [346, ["'", '|', 'ye']],
  [341, ["'", '|', 'that']],
  [327, ["'", '|', 'nothing']],
  [325, ["'", '|', 'for']],
  [303, ["'", '|', 'his']],
  [295, ["'", '|', 'not']],
  [286, ["'", '|', 'all']],
  [275, ["'", '|', 'or']],
  [262, ["'", '|',

In [5]:
def find_instances_with(instances, symbols, exclude, num_print=1000, split_names=None):
    """Print instances whose BookText contains ``symbols`` but not ``exclude``.

    Args:
        instances: mapping from split name to a list of instance dicts; each dict
            is read through its "BookText" and "ASRTranscript" keys.
        symbols: substring that must occur in the instance's BookText.
        exclude: substring that must NOT occur in BookText; a falsy value
            (e.g. "") disables this filter.
        num_print: maximum number of matching instances to print (default 1000).
        split_names: iterable of split names to scan, in order. Defaults to the
            notebook-level ``splits`` when omitted.

    Returns:
        None. Matches are written to stdout as a "BookText:" line followed by an
        "ASRTranscript:" line.
    """
    if split_names is None:
        split_names = splits
    count = 0
    for split in split_names:
        for ins in instances[split]:
            # Stop scanning as soon as the print budget is used up.
            if count >= num_print:
                return
            text = ins["BookText"]
            if symbols in text and (not exclude or exclude not in text):
                count += 1
                print("BookText:", text)
                print("ASRTranscript:", ins["ASRTranscript"])

In [6]:
# Symbols to check: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# Normal symbols: !"$&',.:;?
# Just remove: ()*<anywords>[illustration] [num] [FN#] [] _ ` ~
# Discard the instance when it contains: +=@\|
# Dealing with $: use r'(?<!\$)\b[^$\s]+\$\D[^$\s]*\b' to detect corrupted strings
# Take note of mismatches at the start or end of a sentence.
# Mismatches containing symbols not specified above should be removed entirely.
find_instances_with(instances, "-", "")

In [None]:
# Strategy:
# Normal symbols, which will be kept: !"$&',.:;?
# 1. Discard instances containing any of: +=@\|
# 2. Remove the following from the written form: ()*<anywords>[illustration] [num] [FN#] [] _ ` ~
# 3. Remove substrings matching r'(?<!\$)\b[^$\s]+\$\D[^$\s]*\b'
# 4. Remove mismatches at the beginning and end of sentences
# 5. Remove mismatches where one side is empty
# 6. Remove mismatches containing symbols that are not in the normal set

In [None]:
# Sanity check of the "$"-corruption regex on a sample phrase that has a "$"
# embedded inside a word.
pattern = r'(?<!\$)\b[^$\s]+\$\D[^$\s]*\b'
sample_text = "as an aper$u of the"
match = re.compile(pattern).findall(sample_text)
print(match)