In [1]:
import pandas as pd
from collections import defaultdict
import re

In [2]:
# Names of the data splits handled in this notebook.
# For a quick trial run, replace the list with just ['small'].
splits = [
    'small',
    'medium',
    'large',
    'dev',
    'test_clean',
    'test_clean_large',
    'test_other',
    'test_other_large',
]

In [3]:
# Read each split's JSON-lines file and keep it column-wise:
# instances[split] maps every column name to the list of that column's values.
instances = {}
for split in splits:
    file_name = f"libriheavy_clean_{split}.jsonl"
    print("reading:", file_name)
    instances[split] = pd.read_json(file_name, lines=True).to_dict(orient='list')

reading: libriheavy_clean_small.jsonl
reading: libriheavy_clean_medium.jsonl
reading: libriheavy_clean_large.jsonl
reading: libriheavy_clean_dev.jsonl
reading: libriheavy_clean_test_clean.jsonl
reading: libriheavy_clean_test_clean_large.jsonl
reading: libriheavy_clean_test_other.jsonl
reading: libriheavy_clean_test_other_large.jsonl


In [4]:
# A (possibly empty) Roman numeral written as thousands/hundreds/tens/units,
# optionally followed by an ordinal suffix, delimited by word boundaries.
# Every part is optional, so the pattern also matches the empty string and
# bare suffixes; those are filtered out via _ROMAN_NUMERAL_IGNORED.
_ROMAN_NUMERAL_RE = re.compile(
    r"\bM{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})(?:th|st|nd|rd)?\b"
)

# Matches that do not count as a numeral: the empty match, the pronoun "I",
# and a suffix standing alone as a word.
_ROMAN_NUMERAL_IGNORED = frozenset({"", "I", "th", "st", "nd", "rd"})


def contains_roman_numerals(text):
    """Return True if ``text`` contains an uppercase Roman numeral token.

    A token is a whole word made of a Roman numeral optionally followed by
    one of the ordinal suffixes "th", "st", "nd" or "rd" (e.g. "IV",
    "XIVth", "IIIrd"). The lone word "I" is ignored so that the English
    pronoun does not count.

    Note that any other single numeral letter standing alone as a word
    ("V", "X", "L", "C", "D", "M") is reported as a numeral.

    Args:
        text: The string to scan.

    Returns:
        bool: True when at least one qualifying token is found, else False.
    """
    return any(
        found.group(0) not in _ROMAN_NUMERAL_IGNORED
        for found in _ROMAN_NUMERAL_RE.finditer(text)
    )

def contains_cap_letters(text):
    """Return True if ``text`` contains an all-uppercase token longer than one character.

    A token is a run of uppercase ASCII letters, optionally continued by
    further uppercase runs that are each introduced by a period (with at
    most one whitespace character after the period), e.g. "NASA" or "U.S".
    Single-letter matches such as "I" or "A" do not count.
    """
    pattern = r"\b[A-Z]+(?:\.(?:\s)?[A-Z]+)*\b"
    return any(len(token) > 1 for token in re.findall(pattern, text))

In [5]:
# For every split, record the row indices of "BookText" entries that fall
# into one of three categories. The checks are tried in order (digits, then
# Roman numerals, then capital-letter tokens) and a row is filed under the
# first one that applies, so the three index lists never overlap.
instances_with_digits = defaultdict(lambda:[])
instances_with_roman = defaultdict(lambda:[])
instances_with_letters = defaultdict(lambda:[])
count = 0  # total number of rows filed under any category, across all splits

for split in splits:
    for row, book_text in enumerate(instances[split]["BookText"]):
        if any(ch.isdigit() for ch in book_text):
            matched = instances_with_digits
        elif contains_roman_numerals(book_text):
            matched = instances_with_roman
        elif contains_cap_letters(book_text):
            matched = instances_with_letters
        else:
            # Row belongs to none of the categories.
            continue
        matched[split].append(row)
        count += 1

# Per split: the split name, then the digit / Roman / capital-letter counts,
# one value per line.
for split in splits:
    print(
        split,
        len(instances_with_digits[split]),
        len(instances_with_roman[split]),
        len(instances_with_letters[split]),
        sep="\n",
    )

print(count)

small
4511
1052
3166
medium
26950
11113
19916
large
310236
107084
179641
dev
132
41
163
test_clean
40
14
35
test_clean_large
413
194
416
test_other
79
25
137
test_other_large
672
225
1008
667263


In [11]:
# Running totals over the three training splits. The counters are never
# reset inside the loop, so the figures printed for a split include every
# split listed before it.
splits = ["small", "medium", "large"]
total_length = digit_length = roman_length = cap_length = 0
for split in splits:
    total_length += len(instances[split]["BookText"])
    digit_length += len(instances_with_digits[split])
    roman_length += len(instances_with_roman[split])
    cap_length += len(instances_with_letters[split])

    # Split name followed by the absolute running counts, one per line.
    print(split, total_length, digit_length, roman_length, cap_length, sep="\n")
    # Share of each category among all rows seen so far.
    print(
        digit_length/total_length,
        roman_length/total_length,
        cap_length/total_length,
        sep="\n",
    )


small
113587
4511
1052
3166
0.03971405178409501
0.009261623249139426
0.02787290799123139
medium
1144609
31461
12165
23082
0.02748624202675324
0.010628083476540897
0.02016583829063025
large
11367830
341697
119249
202723
0.030058243305890395
0.010490040755359642
0.017833042893850452


In [12]:
import json

In [14]:
def get_dict(instances_split:dict, idx):
    """Extract one row from a column-oriented table.

    Args:
        instances_split: Mapping from column name to an indexable sequence
            of that column's values.
        idx: Position of the row to extract from every column.

    Returns:
        dict: ``{column name: value at position idx}`` for every column.
    """
    row = {}
    for column, values in instances_split.items():
        row[column] = values[idx]
    return row

In [15]:
# save ITN instances
splits = ['small', 'medium', 'large', 'dev', 'test_clean', 'test_clean_large', 'test_other', 'test_other_large']
for split in splits:
    out_file = f"libriheavy_ITN_{split}.jsonl"
    with open(out_file, "w") as file:
        for idx in instances_with_digits[split]:
            file.write(json.dumps(get_dict(instances[split], idx)).strip()+"\n")
        for idx in instances_with_letters[split]:
            file.write(json.dumps(get_dict(instances[split], idx)).strip()+"\n")
        for idx in instances_with_roman[split]:
            file.write(json.dumps(get_dict(instances[split], idx)).strip()+"\n")
        

In [5]:
# Show the first 100 digit-containing rows of the "small" split, pairing the
# book text with the ASR transcript of the same row.
# instances_with_digits is keyed by split name, so the split must be selected
# before slicing; slicing the mapping itself does not yield row indices.
for i in instances_with_digits["small"][:100]:
    print("Written:", instances["small"]["BookText"][i])
    print("Spoken:", instances["small"]["ASRTranscript"][i])
    print()

Written: CHAPTER 2 THE MERMAIDS The next morning, as soon as Trot had helped wipe the breakfast dishes and put them away in the cupboard, the little girl and Cap'n Bill started out toward the bluff. The air was soft and warm and the sun turned the edges of the waves into sparkling diamonds.
Spoken: CHAPTER TWO THE MERMAIDS THE NEXT MORNING AS SOON AS TROT HAD HELPED WIPE THE BREAKFAST DISHES AND PUT THEM AWAY IN THE CUPBOARD THE LITTLE GIRL AND CAP'N BILL STARTED OUT TOWARD THE BLUFF THE AIR WAS SOFT AND WARM AND THE SUN TURNED THE EDGES OF THE WAVES INTO SPARKLING DIAMONDS

Written: CHAPTER 3 THE DEPTHS OF THE DEEP BLUE SEA Cap'n Bill stood up in the boat as if undecided what to do.
Spoken: CHAPTER THREE THE DEPTHS OF THE DEEP BLUE SEA CAP'N BILL STOOD UP IN THE BOAT AS IF UNDECIDED WHAT TO DO

Written: CHAPTER 22 TROT LIVES TO TELL THE TALE Aquareine was thoughtful for a time.
Spoken: CHAPTER TWENTY TWO TROT LIVES TO TELL THE TALE ALCORINE WAS THOUGHTFUL FOR A TIME

Written: Dust, to

In [6]:
# Semiotic class labels of interest; used to filter rows by their first field.
semiotic_classes = [
    "LETTERS",
    "CARDINAL",
    "VERBATIM",
    "ORDINAL",
    "DECIMAL",
    "ELECTRONIC",
    "DIGIT",
    "MONEY",
    "FRACTION",
    "TIME",
    "ADDRESS",
]

In [10]:
# Collect up to 100 examples per semiotic class from one tab-separated file.
# Only lines with exactly three tab-separated fields are considered: the
# first field is the class label and the second field is kept as the example.
GTN_file_path = "/home/mazhang/pt/inverse-text-normailzation/GTN_ini/output-00000-of-00100"
examples_of_each_semiotic_classes = defaultdict(lambda:[])
with open(GTN_file_path, "r") as file:
    for raw_line in file:
        fields = raw_line.strip().split("\t")
        if len(fields) != 3:
            continue
        label = fields[0]
        if label not in semiotic_classes:
            continue
        if len(examples_of_each_semiotic_classes[label])<100:
            examples_of_each_semiotic_classes[label].append(fields[1])

In [11]:
# Bare expression as the last statement of the cell: the notebook renders the
# mapping from semiotic class label to its collected example strings.
examples_of_each_semiotic_classes

defaultdict(<function __main__.<lambda>()>,
            {'LETTERS': ['IUCN',
              'BC',
              'ALCS',
              'C.',
              'J.',
              'G.',
              'U.S.',
              'LP',
              'Aceh',
              'Aceh',
              'NJ',
              'Lviv',
              'Ukh',
              'NGO',
              'SSP',
              'UK',
              'BSE',
              'WFE',
              'ZRs',
              'BBC',
              'W.',
              'W.F.',
              'D.',
              'WWE',
              'LC',
              'PMN',
              'TM-',
              'POMZ',
              'Boev',
              'Z.',
              'PDF',
              'CIAC',
              'GPS',
              'IASI',
              'POLARCAT',
              'ACSOE',
              'PDF',
              'T.',
              'M',
              'N.',
              'DoD',
              'Revd',
              'NRL',
              'E.',
              'LT'