In [100]:
import collections
import datetime
import gzip
import json
import re

import nltk
import nltk.data
import pyllist
from sortedcontainers import SortedList
#import heapq

In [70]:
# Read the gzip-compressed JSON sentence dump (opened in text mode, UTF-8) and
# decode it in one go. Later cells iterate over the result and read each
# item's 'text' field.
sentences_path = '../data/simplewiki/simplewiki-20171103.sentences.json.gz'
with gzip.open(sentences_path, mode='rt', encoding='utf8') as f:
    sentences = json.load(f)

In [71]:
# Count every token produced by nltk.word_tokenize over all sentence texts,
# then freeze the counts into a list of (token, count) pairs ordered from most
# to least frequent.
token_counts = collections.Counter(
    token
    for s in sentences
    for token in nltk.word_tokenize(s['text'])
)
word_freqs = token_counts.most_common()

In [72]:
# Keep a second reference (an alias, not a copy) to the current `word_freqs`
# so that `word_freqs` can be pointed at a smaller test input and restored
# afterwards.
orig_word_freqs = word_freqs

In [118]:
# Debug fixture: replace `word_freqs` with the token counts of a five-word toy
# sentence so the cells below can be exercised quickly.
# NOTE(review): running this overwrites the corpus frequencies in `word_freqs`.
word_freqs = collections.Counter(nltk.word_tokenize('the cat and the hat')).most_common()

In [115]:
# Point `word_freqs` back at the frequencies saved in `orig_word_freqs`.
word_freqs = orig_word_freqs

In [134]:
# Rebuild `word_freqs` from the saved frequencies.
#
# In this notebook `orig_word_freqs` is assigned from `word_freqs` *after*
# `Counter.most_common()` has already turned it into a list of (word, count)
# pairs, so calling `.most_common()` on it unconditionally raises
# AttributeError on a fresh "Restart & Run All". Accept both shapes: a Counter
# is converted as before, an already-ordered list is shallow-copied.
if isinstance(orig_word_freqs, collections.Counter):
    word_freqs = orig_word_freqs.most_common()
else:
    word_freqs = list(orig_word_freqs)

In [116]:
class Bigram:
    """Linked-list node describing one adjacent symbol pair ``(u0, u1)``.

    A node carries two independent pairs of neighbour links:

    * ``word_prev`` / ``word_next`` - neighbours inside one word's chain;
    * ``index_prev`` / ``index_next`` - neighbours inside the chain of nodes
      that share the same ``(u0, u1)`` pair.

    All four links start out as ``None``. ``exists`` is computed once, at
    construction time, as ``freq > 0``; nodes built with ``freq == 0`` are the
    head/tail sentinels and keep ``exists == False`` even if ``freq`` is
    changed later.
    """

    def __init__(self, u0, u1, freq):
        self.u0, self.u1 = u0, u1
        self.freq = freq
        # Chained assignments bind left to right, so attribute creation order
        # is word_next, word_prev, index_next, index_prev.
        self.word_next = self.word_prev = None
        self.index_next = self.index_prev = None
        # Sentinels are created with a zero frequency and therefore "do not exist".
        self.exists = freq > 0

    def __repr__(self):
        fields = (self.u0, self.u1, self.freq)
        return 'Bigram(u0=%s, u1=%s, freq=%d)' % fields

In [138]:
# word -> head sentinel of that word's chain of Bigram nodes (linked through
# word_next / word_prev).
words = {}

# (u0, u1) -> head sentinel of the chain of all Bigram nodes for that pair
# (linked through index_next / index_prev); the head's `freq` holds the summed
# frequency of the nodes in its chain.
index = {}

# merged-symbol id (int) -> the string that symbol stands for.
unigram_dict = {}
# ids of merged symbols that, for at least one word, ended up covering the
# whole word (the merged bigram had no neighbour on either side).
terminal_unigrams = set()

def remove_bigram(b):
    """Detach node ``b`` from its pair chain and its word chain.

    The node's four links are reset to ``None`` and its frequency is
    subtracted from the total kept on the index head for ``(b.u0, b.u1)``.
    When that total reaches zero the pair's entry is deleted from ``index``
    (after asserting that the chain really is empty).

    ``b`` must currently be linked on both chains, and its pair must be
    present in ``index``.
    """
    # Splice b out of the chain of nodes sharing its (u0, u1) pair.
    idx_before = b.index_prev
    idx_after = b.index_next
    idx_before.index_next = idx_after
    idx_after.index_prev = idx_before
    b.index_next = b.index_prev = None

    # Splice b out of its word's chain.
    word_before = b.word_prev
    word_after = b.word_next
    word_before.word_next = word_after
    word_after.word_prev = word_before
    b.word_next = b.word_prev = None

    # Keep the aggregate frequency on the index head in sync.
    pair = (b.u0, b.u1)
    head = index[pair]
    head.freq -= b.freq
    if head.freq != 0:
        return

    # Nothing left for this pair: the head must point straight at its tail.
    assert not head.index_next.exists
    del index[pair]

def insert_bigram(b, after):
    """Link node ``b`` into a word chain right behind ``after`` and index it.

    ``after`` must already be part of a word chain and have a successor (a
    real node or the tail sentinel). ``b`` is also pushed onto the front of
    the chain of nodes for ``(b.u0, b.u1)``; the head/tail sentinels for that
    chain are created and stored in ``index`` the first time the pair is seen.
    The head's ``freq`` is increased by ``b.freq``.
    """
    # Word chain: after -> b -> (former successor of after).
    b.word_prev = after
    successor = after.word_next
    b.word_next = successor
    successor.word_prev = b
    after.word_next = b

    # Find, or lazily create, the sentinel-delimited chain for this pair.
    pair = (b.u0, b.u1)
    head = index.get(pair)
    if head is None:
        head = Bigram(b.u0, b.u1, 0)
        index[pair] = head
        tail = Bigram(None, None, 0)
        head.index_next = tail
        tail.index_prev = head

    # Pair chain: head -> b -> (former first element).
    first = head.index_next
    b.index_prev = head
    b.index_next = first
    first.index_prev = b
    head.index_next = b

    head.freq += b.freq

def check_invariants(enabled=False):
    """Validate the bookkeeping of ``index`` and ``words``.

    The check walks every pair chain and every word chain, so each call costs
    a full scan of both structures. It is therefore switched off by default:
    calling ``check_invariants()`` returns immediately, exactly like the
    former hard-coded early ``return``. Pass ``enabled=True`` while debugging
    to actually run the assertions.

    Checked for every pair chain in ``index``:
      * the head is a sentinel (``exists`` is false);
      * every node carries the pair it is filed under;
      * both the index links and the word links of each node are mutually
        consistent with its neighbours;
      * the head's ``freq`` equals the sum of the node frequencies.

    Checked for every word chain in ``words``:
      * the head is a sentinel;
      * link consistency as above;
      * adjacent real nodes overlap (``prev.u1 == node.u0`` and
        ``next.u0 == node.u1``).

    Args:
        enabled: run the checks when true; do nothing when false (default).

    Raises:
        AssertionError: if ``enabled`` is true and an invariant is violated.
    """
    if not enabled:
        return

    for pair, head in index.items():
        assert not head.exists
        freq = 0
        n = head.index_next
        # The tail sentinel has exists == False and terminates the walk.
        while n.exists:
            assert n.u0 == pair[0]
            assert n.u1 == pair[1]
            assert n.index_next.index_prev == n
            assert n.index_prev.index_next == n
            assert n.word_next.word_prev == n
            assert n.word_prev.word_next == n
            freq += n.freq
            n = n.index_next
        assert head.freq == freq, '%s: head.freq (%d) != freq (%d)' % (pair, head.freq, freq)

    for head in words.values():
        assert not head.exists
        n = head.word_next
        while n.exists:
            assert n.index_next.index_prev == n
            assert n.index_prev.index_next == n
            assert n.word_next.word_prev == n
            assert n.word_prev.word_next == n
            # Only compare symbols against real neighbours, not sentinels.
            if n.word_prev.exists:
                assert n.word_prev.u1 == n.u0
            if n.word_next.exists:
                assert n.word_next.u0 == n.u1
            n = n.word_next

# Build one sentinel-delimited chain of character bigrams per word and file
# every bigram node in `index`, weighting it by the word's frequency.
total_words = len(word_freqs)

for i, (word, freq) in enumerate(word_freqs):
    if i % 10000 == 0:
        print('indexed: %d/%d' % (i, total_words))

    # Start with an empty chain: head <-> tail.
    word_head, word_tail = Bigram(None, None, 0), Bigram(None, None, 0)
    word_head.word_next = word_tail
    word_tail.word_prev = word_head
    words[word] = word_head

    # Append each pair of adjacent characters behind the previously added node.
    word_prev = word_head
    for u0, u1 in zip(word, word[1:]):
        b = Bigram(u0, u1, freq)
        insert_bigram(b, word_prev)
        word_prev = b

print('indexed: %d/%d' % (total_words, total_words))

check_invariants()

indexed: 0/459687
indexed: 10000/459687
indexed: 20000/459687
indexed: 30000/459687
indexed: 40000/459687
indexed: 50000/459687
indexed: 60000/459687
indexed: 70000/459687
indexed: 80000/459687
indexed: 90000/459687
indexed: 100000/459687
indexed: 110000/459687
indexed: 120000/459687
indexed: 130000/459687
indexed: 140000/459687
indexed: 150000/459687
indexed: 160000/459687
indexed: 170000/459687
indexed: 180000/459687
indexed: 190000/459687
indexed: 200000/459687
indexed: 210000/459687
indexed: 220000/459687
indexed: 230000/459687
indexed: 240000/459687
indexed: 250000/459687
indexed: 260000/459687
indexed: 270000/459687
indexed: 280000/459687
indexed: 290000/459687
indexed: 300000/459687
indexed: 310000/459687
indexed: 320000/459687
indexed: 330000/459687
indexed: 340000/459687
indexed: 350000/459687
indexed: 360000/459687
indexed: 370000/459687
indexed: 380000/459687
indexed: 390000/459687
indexed: 400000/459687
indexed: 410000/459687
indexed: 420000/459687
indexed: 430000/459687
in

In [142]:
# Greedy merge loop: repeatedly take the pair with the largest summed
# frequency, give it a fresh integer symbol id, and rewrite every occurrence
# (plus its left/right neighbours) to use that symbol.
#
# Requires `datetime` to be imported in the imports cell.

MAX_MERGES = 10000  # upper bound on merge iterations (one new symbol each)
LOG_EVERY = 1       # print progress/timing every N iterations; raise to cut log volume

start = datetime.datetime.now()

for i in range(MAX_MERGES):
    log_this = i % LOG_EVERY == 0
    if log_this:
        print('iteration:', i)

    if not index:
        break

    # Linear scan over all pair heads for the most frequent pair.
    start_max = datetime.datetime.now()
    index_head = max(index.values(), key=lambda x: x.freq)
    u0 = index_head.u0
    u1 = index_head.u1
    finish_max = finish = datetime.datetime.now()

    # Milliseconds since the cell started / spent in the max() scan.
    elapsed = (finish - start).total_seconds() * 1000.0
    elapsed_max = (finish_max - start_max).total_seconds() * 1000.0

    if log_this:
        print(elapsed, elapsed_max)

    new_unigram = len(unigram_dict)

    # Register the new symbol *before* touching the chains. If the cell is
    # interrupted mid-rewrite, every id already placed into `index` is then
    # known to `unigram_dict`, and a re-run cannot hand out the same id again.
    # Integer symbols are earlier merges; strings are raw characters.
    w0 = unigram_dict[u0] if isinstance(u0, int) else u0
    w1 = unigram_dict[u1] if isinstance(u1, int) else u1
    unigram_dict[new_unigram] = w0 + w1

    n = index_head.index_next
    while n.exists:
        check_invariants()
        if not n.word_prev.exists and not n.word_next.exists:
            # The pair was the whole remaining word.
            terminal_unigrams.add(new_unigram)
        if n.word_prev.exists:
            # Left neighbour (x, u0) becomes (x, new_unigram).
            left = n.word_prev
            insert_bigram(Bigram(left.u0, new_unigram, n.freq), left)
            remove_bigram(left)
        if n.word_next.exists:
            # Right neighbour (u1, y) becomes (new_unigram, y).
            right = n.word_next
            insert_bigram(Bigram(new_unigram, right.u1, n.freq), right)
            remove_bigram(right)
        # Read the successor only now: removing a neighbour that belongs to
        # the same pair chain may have relinked n.index_next.
        following = n.index_next
        remove_bigram(n)
        n = following

        check_invariants()

iteration: 0
13.417 12.775
iteration: 1
201.977 12.295
iteration: 2
727.078 11.688
iteration: 3
1264.613 10.678
iteration: 4
1307.399 10.174000000000001
iteration: 5
1770.612 10.784
iteration: 6
2132.851 10.793000000000001
iteration: 7
2383.963 10.53
iteration: 8
2622.9719999999998 11.086
iteration: 9
2864.913 10.964
iteration: 10
3158.1839999999997 11.098
iteration: 11
3442.307 13.527000000000001
iteration: 12
3586.536 11.67
iteration: 13
3769.175 11.811
iteration: 14
4105.285 11.898000000000001
iteration: 15
4414.758 11.753
iteration: 16
4656.611 11.690000000000001
iteration: 17
4690.0199999999995 11.052
iteration: 18
4870.33 12.283000000000001
iteration: 19
4946.566 14.058
iteration: 20
7038.111 11.934999999999999
iteration: 21
7224.634 11.991999999999999
iteration: 22
7305.959 13.924000000000001
iteration: 23
7447.674 12.468
iteration: 24
7582.246999999999 12.104000000000001
iteration: 25
7713.7880000000005 13.802999999999999
iteration: 26
7878.594999999999 13.189
iteration: 27
796

20715.762000000002 29.731
iteration: 228
20839.02 29.854
iteration: 229
20899.693 28.115000000000002
iteration: 230
20941.685 29.812
iteration: 231
20975.659 27.973000000000003
iteration: 232
21003.494 26.908
iteration: 233
21035.77 30.398999999999997
iteration: 234
21081.814000000002 33.249
iteration: 235
21230.733 33.236000000000004
iteration: 236
21283.963 29.813
iteration: 237
21333.945 31.196
iteration: 238
21384.732 32.025
iteration: 239
21421.098 32.194
iteration: 240
21466.075 28.934
iteration: 241
21503.709 28.587
iteration: 242
21595.365999999998 30.334
iteration: 243
21630.929 28.139999999999997
iteration: 244
21685.656000000003 33.062
iteration: 245
21730.41 29.295
iteration: 246
21859.438000000002 31.094
iteration: 247
21898.959000000003 30.962
iteration: 248
21944.355000000003 32.686
iteration: 249
21988.001 29.266000000000002
iteration: 250
22018.278 28.869
iteration: 251
22048.879 28.563000000000002
iteration: 252
22084.205 28.537
iteration: 253
22119.769 28.562
iterati

31396.132 39.488
iteration: 454
31449.862 41.7
iteration: 455
31500.975000000002 42.577999999999996
iteration: 456
31567.518 43.361999999999995
iteration: 457
31611.421 39.894
iteration: 458
31686.793999999998 42.077999999999996
iteration: 459
31732.512 41.439
iteration: 460
31782.561 43.824000000000005
iteration: 461
31823.488999999998 40.214
iteration: 462
31871.594 41.784
iteration: 463
31915.778 41.190999999999995
iteration: 464
31961.131 40.144
iteration: 465
32001.063000000002 39.660000000000004
iteration: 466
32044.915000000005 39.845
iteration: 467
32088.538999999997 41.304
iteration: 468
32133.888 41.781
iteration: 469
32185.442000000003 41.711999999999996
iteration: 470
32234.730999999996 43.43000000000001
iteration: 471
32277.526 42.069
iteration: 472
32322.689 44.586
iteration: 473
32371.772 48.550000000000004
iteration: 474
32434.092 50.698
iteration: 475
32489.427 50.031
iteration: 476
32543.340999999997 42.535999999999994
iteration: 477
32599.187 41.37
iteration: 478
326

iteration: 675
42981.158 53.923
iteration: 676
43037.373 50.840999999999994
iteration: 677
43087.354 49.058
iteration: 678
43138.487 49.105000000000004
iteration: 679
43190.441 49.086
iteration: 680
43259.97 53.614000000000004
iteration: 681
43317.252 51.913000000000004
iteration: 682
43374.756 52.161
iteration: 683
43425.460999999996 49.764
iteration: 684
43487.296 52.329
iteration: 685
43548.627 55.542
iteration: 686
43609.710999999996 51.116
iteration: 687
43661.142 49.53
iteration: 688
43712.342 49.778999999999996
iteration: 689
43764.085999999996 50.115
iteration: 690
43817.171 52.411
iteration: 691
43870.895 50.74
iteration: 692
43923.796 52.331999999999994
iteration: 693
44050.676 60.241
iteration: 694
44108.673 56.592000000000006
iteration: 695
44163.538 52.238
iteration: 696
44222.873 55.254
iteration: 697
44272.865000000005 49.658
iteration: 698
44327.685000000005 49.730000000000004
iteration: 699
44378.936 51.045
iteration: 700
44434.832 54.103
iteration: 701
44490.727 55.45

56168.939 57.803
iteration: 896
56240.454999999994 57.323
iteration: 897
56307.734000000004 59.745
iteration: 898
56378.039000000004 68.182
iteration: 899
56436.857 58.032000000000004
iteration: 900
56499.293000000005 58.973
iteration: 901
56563.999 60.158
iteration: 902
56626.055 60.773
iteration: 903
56686.231999999996 59.833999999999996
iteration: 904
56750.657 61.217
iteration: 905
56814.362 61.155
iteration: 906
56872.133 57.327999999999996
iteration: 907
56931.502 57.016999999999996
iteration: 908
56993.307 57.813
iteration: 909
57053.865000000005 59.113
iteration: 910
57112.923 57.378
iteration: 911
57172.74 57.858
iteration: 912
57236.439 59.62
iteration: 913
57298.024 59.19
iteration: 914
57357.944 59.765
iteration: 915
57418.183999999994 57.138
iteration: 916
57476.958000000006 56.644
iteration: 917
57540.441 57.695
iteration: 918
57598.439999999995 56.663
iteration: 919
57659.756 57.039
iteration: 920
57720.092000000004 58.482
iteration: 921
57783.213 59.462
iteration: 922
5

70756.17 65.3
iteration: 1116
70823.128 65.263
iteration: 1117
70895.04100000001 68.684
iteration: 1118
70970.794 74.741
iteration: 1119
71049.363 66.333
iteration: 1120
71119.75200000001 66.861
iteration: 1121
71188.44799999999 67.871
iteration: 1122
71263.139 69.622
iteration: 1123
71337.27 70.638
iteration: 1124
71406.508 68.649
iteration: 1125
71483.95700000001 69.51599999999999
iteration: 1126
71550.49699999999 64.374
iteration: 1127
71620.565 69.536
iteration: 1128
71695.436 71.659
iteration: 1129
71764.004 67.273
iteration: 1130
71832.212 65.60000000000001
iteration: 1131
71901.281 65.773
iteration: 1132
72020.292 72.354
iteration: 1133
72094.91 67.733
iteration: 1134
72160.871 63.282000000000004
iteration: 1135
72227.192 64.08999999999999
iteration: 1136
72292.066 63.417
iteration: 1137
72362.079 66.82100000000001
iteration: 1138
72428.638 63.531000000000006
iteration: 1139
72497.206 63.67399999999999
iteration: 1140
72566.03099999999 68.554
iteration: 1141
72644.717 73.364
ite

iteration: 1332
86559.297 69.589
iteration: 1333
86634.89300000001 72.333
iteration: 1334
86705.23400000001 69.773
iteration: 1335
86775.599 68.81
iteration: 1336
86846.148 69.59700000000001
iteration: 1337
86916.29299999999 69.598
iteration: 1338
86990.67599999999 71.952
iteration: 1339
87059.877 69.022
iteration: 1340
87129.661 68.899
iteration: 1341
87199.982 70.063
iteration: 1342
87275.373 72.369
iteration: 1343
87345.508 69.017
iteration: 1344
87415.383 69.107
iteration: 1345
87488.939 69.592
iteration: 1346
87563.552 72.02
iteration: 1347
87633.575 69.51899999999999
iteration: 1348
87705.21599999999 69.123
iteration: 1349
87778.74699999999 70.04599999999999
iteration: 1350
87852.666 72.316
iteration: 1351
87926.03300000001 70.073
iteration: 1352
87995.602 69.069
iteration: 1353
88066.942 70.08399999999999
iteration: 1354
88146.51999999999 72.44399999999999
iteration: 1355
88252.429 70.017
iteration: 1356
88325.16699999999 69.66
iteration: 1357
88395.75899999999 69.786
iteration:

iteration: 1548
104574.427 75.71700000000001
iteration: 1549
104656.364 78.818
iteration: 1550
104732.327 75.80099999999999
iteration: 1551
104810.315 75.321
iteration: 1552
104907.331 76.369
iteration: 1553
104986.84 78.818
iteration: 1554
105062.44900000001 75.09
iteration: 1555
105138.406 75.47
iteration: 1556
105215.696 76.02499999999999
iteration: 1557
105295.288 77.75999999999999
iteration: 1558
105376.852 77.10000000000001
iteration: 1559
105453.04299999999 75.161
iteration: 1560
105533.006 76.229
iteration: 1561
105613.253 78.602
iteration: 1562
105689.588 75.428
iteration: 1563
105792.489 75.828
iteration: 1564
105871.33 78.02199999999999
iteration: 1565
105948.732 76.683
iteration: 1566
106025.266 75.327
iteration: 1567
106102.184 76.369
iteration: 1568
106182.867 79.154
iteration: 1569
106260.914 75.58800000000001
iteration: 1570
106337.736 75.672
iteration: 1571
106417.056 78.41199999999999
iteration: 1572
106502.575 76.621
iteration: 1573
106581.327 75.48
iteration: 1574
1

123721.997 79.432
iteration: 1763
123804.053 80.41
iteration: 1764
123897.311 81.43599999999999
iteration: 1765
123979.027 79.597
iteration: 1766
124062.056 80.175
iteration: 1767
124145.427 83.073
iteration: 1768
124228.414 79.661
iteration: 1769
124307.98400000001 79.258
iteration: 1770
124388.761 80.421
iteration: 1771
124472.076 82.688
iteration: 1772
124558.12299999999 80.439
iteration: 1773
124638.993 79.887
iteration: 1774
124724.291 80.386
iteration: 1775
124808.381 82.716
iteration: 1776
124889.00200000001 79.15700000000001
iteration: 1777
124968.945 79.628
iteration: 1778
125053.235 82.331
iteration: 1779
125136.477 80.39399999999999
iteration: 1780
125216.41200000001 79.108
iteration: 1781
125298.365 80.26700000000001
iteration: 1782
125383.788 83.122
iteration: 1783
125464.943 79.85000000000001
iteration: 1784
125546.337 79.888
iteration: 1785
125628.814 81.54400000000001
iteration: 1786
125711.584 81.83600000000001
iteration: 1787
125791.937 79.727
iteration: 1788
125879.3

iteration: 1976
149891.244 84.138
iteration: 1977
149985.443 86.575
iteration: 1978
150068.803 82.786
iteration: 1979
150153.309 83.87899999999999
iteration: 1980
150240.71699999998 86.20700000000001
iteration: 1981
150325.805 83.775
iteration: 1982
150409.097 82.91
iteration: 1983
150496.465 85.252
iteration: 1984
150581.653 84.40899999999999
iteration: 1985
150668.189 82.789
iteration: 1986
150751.633 83.08
iteration: 1987
150838.77000000002 85.305
iteration: 1988
150924.788 85.518
iteration: 1989
151009.123 83.397
iteration: 1990
151100.277 83.97699999999999
iteration: 1991
151187.434 86.862
iteration: 1992
151272.46600000001 83.096
iteration: 1993
151357.06 84.194
iteration: 1994
151443.81999999998 86.381
iteration: 1995
151528.094 83.515
iteration: 1996
151612.201 83.47
iteration: 1997
151697.20899999997 84.688
iteration: 1998
151784.112 85.83
iteration: 1999
151867.4 83.05
iteration: 2000
151954.022 85.33
iteration: 2001
152040.357 85.95899999999999
iteration: 2002
152126.011 84.

iteration: 2189
168927.458 90.789
iteration: 2190
169021.275 92.844
iteration: 2191
169112.94 90.66499999999999
iteration: 2192
169205.99300000002 89.936
iteration: 2193
169300.52800000002 92.06
iteration: 2194
169393.641 92.141
iteration: 2195
169493.61099999998 93.579
iteration: 2196
169590.42299999998 94.655
iteration: 2197
169683.996 91.09299999999999
iteration: 2198
169787.15099999998 102.602
iteration: 2199
169891.677 96.303
iteration: 2200
169980.573 88.01100000000001
iteration: 2201
170075.028 94.171
iteration: 2202
170170.219 92.874
iteration: 2203
170263.102 92.605
iteration: 2204
170358.648 94.776
iteration: 2205
170461.517 93.465
iteration: 2206
170567.307 105.147
iteration: 2207
170663.54 95.515
iteration: 2208
170758.062 93.12299999999999
iteration: 2209
170862.879 100.44699999999999
iteration: 2210
170970.84 106.429
iteration: 2211
171068.28100000002 95.813
iteration: 2212
171192.574 98.716
iteration: 2213
171288.429 93.785
iteration: 2214
171390.17799999999 93.851
itera

KeyboardInterrupt: 

In [143]:
# Number of distinct (u0, u1) pairs still present in the index.
len(index)

184027

In [145]:
# Peek at the first 100 (pair, index head) entries; integer symbols are ids of
# merged units recorded in `unigram_dict`.
list(index.items())[:100]

[(('８', '〜'), Bigram(u0=８, u1=〜, freq=1)),
 (('m', 1677), Bigram(u0=m, u1=1677, freq=18)),
 ((144, 'v'), Bigram(u0=144, u1=v, freq=2)),
 (('V', 'i'), Bigram(u0=V, u1=i, freq=1860)),
 (('=', 2100), Bigram(u0==, u1=2100, freq=2)),
 ((164, 1394), Bigram(u0=164, u1=1394, freq=1)),
 ((139, 1344), Bigram(u0=139, u1=1344, freq=5)),
 ((30, 287), Bigram(u0=30, u1=287, freq=2)),
 ((270, 302), Bigram(u0=270, u1=302, freq=34)),
 ((328, 'ç'), Bigram(u0=328, u1=ç, freq=2)),
 (('F', '0'), Bigram(u0=F, u1=0, freq=16)),
 ((661, 2163), Bigram(u0=661, u1=2163, freq=12)),
 (('Y', 224), Bigram(u0=Y, u1=224, freq=22)),
 ((703, 896), Bigram(u0=703, u1=896, freq=1)),
 ((878, 50), Bigram(u0=878, u1=50, freq=2)),
 ((290, 555), Bigram(u0=290, u1=555, freq=7)),
 (('Á', 1193), Bigram(u0=Á, u1=1193, freq=2)),
 ((1590, 39), Bigram(u0=1590, u1=39, freq=3)),
 ((753, 132), Bigram(u0=753, u1=132, freq=1)),
 ((806, 142), Bigram(u0=806, u1=142, freq=3)),
 ((300, '-'), Bigram(u0=300, u1=-, freq=9)),
 ((167, 448), Bigram(u0

In [131]:
# Display the whole index. NOTE(review): this renders every entry, so it is
# only practical when the index is small or empty - otherwise prefer
# len(index) or a slice.
index

{}

In [146]:
# Ids of merged symbols that covered an entire word for at least one word.
terminal_unigrams

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 67,
 68,
 69,
 71,
 72,
 73,
 74,
 75,
 76,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 110,
 111,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 131,
 132,
 133,
 134,
 135,
 136,
 138,
 139,
 140,
 142,
 143,
 144,
 145,
 147,
 148,
 149,
 150,
 151,
 152,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 169,
 170,
 171,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 181,
 182,
 183,
 185,
 187,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 200,
 201,
 202,
 203,
 204,
 205,


In [147]:
# Merged-symbol id -> the string it expands to, in the order the merges were made.
unigram_dict

{0: 'he',
 1: 'in',
 2: 'an',
 3: 'the',
 4: 'er',
 5: 'on',
 6: 're',
 7: 'is',
 8: 'or',
 9: 'al',
 10: 'at',
 11: 'ed',
 12: 'as',
 13: 'ar',
 14: 'en',
 15: 'es',
 16: 'of',
 17: 'it',
 18: 'and',
 19: '||',
 20: 'ic',
 21: 'to',
 22: 'ou',
 23: 'ing',
 24: 'st',
 25: 'le',
 26: 'th',
 27: 'ro',
 28: 'ion',
 29: 'The',
 30: 'am',
 31: 'il',
 32: 'was',
 33: 'ent',
 34: 'om',
 35: 'se',
 36: 'ac',
 37: 'be',
 38: 'ol',
 39: 'ad',
 40: '19',
 41: 'ir',
 42: 'for',
 43: 'ch',
 44: 'el',
 45: "''",
 46: 'ig',
 47: 'ay',
 48: 'all',
 49: 'ter',
 50: 'iv',
 51: 'ur',
 52: 'id',
 53: 'us',
 54: 'ow',
 55: 'im',
 56: '20',
 57: 'are',
 58: 'un',
 59: 'ul',
 60: 'ag',
 61: 'ot',
 62: 'ce',
 63: 'op',
 64: 'ver',
 65: 'mo',
 66: 'ation',
 67: 'pe',
 68: 'tr',
 69: 'de',
 70: 'oun',
 71: 'pl',
 72: 'by',
 73: 'It',
 74: 'ith',
 75: 'te',
 76: '||0',
 77: 'ers',
 78: 'his',
 79: 'ts',
 80: 'av',
 81: 'wh',
 82: 'ly',
 83: 'rom',
 84: 'In',
 85: 'that',
 86: 'with',
 87: 'ap',
 88: 'um',
 89: '