In [2]:
import unicodedata
from typing import Callable, Generator, Any
import json
from unidecode import unidecode
import csv
from collections import Counter

In [3]:
def memoize(f: Callable) -> Callable:
    """
    Cache the results of a function that takes ONLY a single argument.

    The returned callable is the bound ``__getitem__`` of a private ``dict``
    subclass: a repeated argument is answered by a plain dictionary lookup,
    while a new argument falls through to ``__missing__``, which calls ``f``
    once and stores what it returned.

    :param f: function called as ``f(key)``; ``key`` is used as a dict key
    :return: callable accepting the same single positional argument as ``f``

    src: http://code.activestate.com/recipes/578231-probably-the-fastest-memoization-decorator-in-the-/
    """

    class _ResultCache(dict):
        def __missing__(self, argument):
            # first time this argument is seen: compute, remember, hand back
            result = f(argument)
            self[argument] = result
            return result

    cache = _ResultCache()
    return cache.__getitem__

@memoize
def is_word_char(char: str) -> bool:
    """
    Tell whether ``char`` may be part of a word.

    The decision is made from the character's Unicode general category:
    letters and combining marks count as word characters. Number categories
    ('Nd', 'Nl', 'No') and the private use category ('Co') do not.

    :param char: a single character
    :return: True if the character's category is a letter or mark category
    """
    letter_categories = ('Lu', 'Ll', 'Lt', 'Lm', 'Lo')
    mark_categories = ('Mn', 'Mc', 'Me')  # diacritics, etc
    category = unicodedata.category(char)
    return category in letter_categories or category in mark_categories

def words(text: str) -> Generator[str, Any, None]:
    """
    Lazily split ``text`` into maximal runs of word characters.

    A run is a stretch of consecutive characters for which ``is_word_char``
    is true; every other character acts as a separator and is dropped.
    Words are yielded as they appear in ``text``, with their original casing.

    :param text: the string to tokenise
    :return: generator of the words found in ``text``
    """
    run_start = None  # index where the current word began, None while between words
    for position, char in enumerate(text):
        if is_word_char(char):
            # open a new word unless one is already in progress
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # a separator closes the word in progress
            yield text[run_start:position]
            run_start = None

    # the text may end in the middle of a word
    if run_start is not None:
        yield text[run_start:]

In [9]:
# Stream the 1-gram CSV and keep only the rows whose first column is 'MALAY'.
# Later cells read row[1] as the text to tokenise and row[2] as an integer count.
rows = []
with open('MASTER.LINGUISTIC.1GRAM.big.csv', 'rt', encoding='utf8', newline='') as f:
    for line_number, row in enumerate(csv.reader(f), start=1):
        # progress marker every million rows
        if line_number % 1000000 == 0:
            print(line_number)
        # csv.reader yields [] for a blank line; `row and` skips it
        # instead of letting row[0] raise IndexError
        if row and row[0] == 'MALAY':
            rows.append(row)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
35000000
36000000
37000000
38000000
39000000
40000000
41000000
42000000
43000000
44000000
45000000
46000000
47000000
48000000
49000000
50000000
51000000
52000000
53000000
54000000
55000000
56000000
57000000
58000000
59000000
60000000
61000000
62000000
63000000
64000000
65000000
66000000
67000000
68000000
69000000
70000000
71000000
72000000
73000000
74000000
75000000
76000000
77000000
78000000
79000000
80000000
81000000
82000000
83000000
84000000
85000000
86000000
87000000
88000000
89000000
90000000
91000000
92000000
93000000
94000000
95000000
96000000
97000000
98000000
99000000
100000000
101000000
102000000
103000000
104000000
105000000
106000000
107000000
108000000
109000000
110000000
11100000

In [10]:
# Total weight of every case-folded word over the selected rows:
# row[1] is tokenised with words(), row[2] is the count added for each word found.
word_counts = Counter()
for row in rows:
    weight = int(row[2])  # parsed once per row rather than once per word
    for word in words(row[1]):
        word_counts[word.casefold()] += weight

In [12]:
word_counts.most_common(1000)

[('rm', 1874016),
 ('to', 1575740),
 ('in', 1485760),
 ('for', 1364739),
 ('di', 1178209),
 ('the', 1115668),
 ('of', 1088133),
 ('s', 1050269),
 ('sqm', 1022913),
 ('sqft', 1022892),
 ('properties', 976728),
 ('malaysia', 919802),
 ('more', 693704),
 ('yang', 659794),
 ('ini', 635270),
 ('video', 631991),
 ('news', 563769),
 ('dengan', 508396),
 ('a', 498495),
 ('new', 477752),
 ('dan', 462913),
 ('untuk', 458248),
 ('on', 454915),
 ('ke', 432370),
 ('berita', 427828),
 ('and', 425206),
 ('no', 422817),
 ('tak', 414164),
 ('rent', 408566),
 ('dalam', 389925),
 ('najib', 385717),
 ('anak', 362569),
 ('tidak', 354724),
 ('is', 352756),
 ('sale', 341295),
 ('mahathir', 339233),
 ('dunia', 326064),
 ('lifestyle', 324680),
 ('umno', 316934),
 ('with', 300981),
 ('tahun', 298081),
 ('sukan', 292616),
 ('negara', 291113),
 ('pas', 277138),
 ('jadi', 275003),
 ('tv', 273781),
 ('buat', 268337),
 ('nasional', 261815),
 ('saya', 261464),
 ('rtb', 261208),
 ('as', 256333),
 ('m', 253828),
 ('foo

In [13]:
# Stream the 2-gram CSV and keep only the rows whose first column is 'MALAY'.
# This rebinds `rows`; later cells read row[1] as the text and row[2] as an integer count.
rows = []
with open('MASTER.LINGUISTIC.2GRAM.big.csv', 'rt', encoding='utf8', newline='') as f:
    for line_number, row in enumerate(csv.reader(f), start=1):
        # progress marker every million rows
        if line_number % 1000000 == 0:
            print(line_number)
        # csv.reader yields [] for a blank line; `row and` skips it
        # instead of letting row[0] raise IndexError
        if row and row[0] == 'MALAY':
            rows.append(row)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
35000000
36000000
37000000
38000000
39000000
40000000
41000000
42000000
43000000
44000000
45000000
46000000
47000000
48000000
49000000
50000000
51000000
52000000
53000000
54000000
55000000
56000000
57000000
58000000
59000000
60000000
61000000
62000000
63000000
64000000
65000000
66000000
67000000
68000000
69000000
70000000
71000000
72000000
73000000
74000000
75000000
76000000
77000000
78000000
79000000
80000000
81000000
82000000
83000000
84000000
85000000
86000000
87000000
88000000
89000000
90000000
91000000
92000000
93000000
94000000
95000000
96000000
97000000
98000000
99000000
100000000
101000000
102000000
103000000
104000000
105000000
106000000
107000000
108000000
109000000
110000000
11100000

In [17]:
def twogram(words):
    """
    Yield every pair of adjacent items of ``words`` as one 'left right' string.

    :param words: iterable of words, consumed once from start to end
    :return: generator of space-joined neighbouring pairs; nothing is
             yielded when fewer than two items are supplied
    """
    previous = None
    for current in words:
        if previous is None:
            # nothing to pair with yet, just remember this item
            previous = current
            continue
        yield f'{previous} {current}'
        previous = current

In [18]:
# Total weight of every case-folded pair of adjacent words over the selected rows:
# row[1] is tokenised with words() and paired with twogram(), row[2] is the count added per pair.
word_2_grams = Counter()
for row in rows:
    weight = int(row[2])  # parsed once per row rather than once per pair
    for pair in twogram(words(row[1])):
        word_2_grams[pair.casefold()] += weight

In [19]:
len(word_2_grams)

1075490

In [20]:
word_2_grams.most_common(100)

[('properties for', 498190),
 ('for rent', 340869),
 ('for sale', 329056),
 ('read more', 223679),
 ('sale in', 197081),
 ('rent in', 196651),
 ('new properties', 183540),
 ('yang lalu', 183199),
 ('mark leo', 176144),
 ('properties in', 157588),
 ('jam yang', 148698),
 ('ministry of', 147831),
 ('mt webmaster', 143138),
 ('bootstrap slideshow', 116312),
 ('to be', 106372),
 ('no max', 104912),
 ('no min', 104912),
 ('raja petra', 97350),
 ('kuala lumpur', 96221),
 ('privacy policy', 96108),
 ('petra kamarudin', 93679),
 ('sea games', 91063),
 ('guan eng', 87677),
 ('dr m', 87426),
 ('of the', 83337),
 ('negeri sembilan', 80239),
 ('s pura', 76009),
 ('news list', 75102),
 ('rtb station', 74262),
 ('gaya hidup', 73822),
 ('nigel dusanjh', 71391),
 ('about us', 70444),
 ('tahun harakah', 67300),
 ('meninggal dunia', 65657),
 ('hubungi kami', 64722),
 ('gowri krishnan', 64402),
 ('rd minion', 63949),
 ('log in', 63407),
 ('contact us', 62569),
 ('di sini', 62506),
 ('azwa rahman', 61564)

In [222]:
[(w,c) for w, c in word_2_grams.most_common() if w.startswith('air ')]

[('air mata', 8673),
 ('air sejuk', 3835),
 ('air asia', 3768),
 ('air liur', 2447),
 ('air di', 2268),
 ('air terjun', 1600),
 ('air sukarkan', 1560),
 ('air selangor', 1551),
 ('air bawah', 1421),
 ('air dalam', 1368),
 ('air tangan', 1354),
 ('air kencing', 1316),
 ('air panas', 1182),
 ('air bersih', 1166),
 ('air hostess', 1060),
 ('air luah', 1040),
 ('air dengan', 828),
 ('air vent', 776),
 ('air by', 754),
 ('air again', 733),
 ('air jt', 727),
 ('air yang', 725),
 ('air terhempas', 704),
 ('air on', 703),
 ('air ketum', 702),
 ('air percuma', 678),
 ('air paling', 665),
 ('air bunga', 607),
 ('air zam', 580),
 ('air to', 520),
 ('air minuman', 509),
 ('air sungai', 458),
 ('air quality', 455),
 ('air milo', 446),
 ('air pollution', 437),
 ('air about', 417),
 ('air kecil', 412),
 ('air dan', 404),
 ('air travel', 382),
 ('air teh', 379),
 ('air keruh', 379),
 ('air menurut', 374),
 ('air passengers', 353),
 ('air putih', 351),
 ('air pulih', 348),
 ('air kosong', 336),
 ('air 

In [24]:
# Rebuild single-word totals from the 2-gram table: the count of each
# 2-gram key is added to every word that key splits into.
# The loop variable must NOT be called `words`: that name is the tokenizer
# function of this notebook, and rebinding it to a string here makes every
# later words(...) call fail.
word_1_grams = Counter()
for bigram, count in word_2_grams.most_common():
    for word in bigram.split():
        word_1_grams[word] += count

In [25]:
len(word_1_grams)

77707

In [26]:
word_1_grams.most_common(100)

[('to', 3020198),
 ('in', 2712982),
 ('for', 2559722),
 ('di', 2276706),
 ('of', 2074228),
 ('the', 1843135),
 ('s', 1300218),
 ('yang', 1273597),
 ('properties', 1215215),
 ('malaysia', 1026458),
 ('dengan', 970989),
 ('untuk', 888952),
 ('dan', 886084),
 ('on', 860463),
 ('a', 827675),
 ('and', 802888),
 ('ini', 783483),
 ('ke', 757577),
 ('dalam', 738041),
 ('tak', 706016),
 ('new', 658240),
 ('rm', 658126),
 ('tidak', 643949),
 ('is', 627542),
 ('more', 606008),
 ('rent', 593839),
 ('with', 552149),
 ('sale', 545045),
 ('jadi', 512983),
 ('anak', 509116),
 ('buat', 498053),
 ('at', 464527),
 ('no', 444388),
 ('be', 431315),
 ('akan', 430866),
 ('najib', 423704),
 ('as', 412970),
 ('mahathir', 398924),
 ('pada', 390717),
 ('umno', 390672),
 ('m', 379212),
 ('boleh', 365123),
 ('not', 345206),
 ('saya', 345137),
 ('hari', 343108),
 ('perlu', 342857),
 ('pas', 332684),
 ('you', 332485),
 ('tahun', 332213),
 ('nak', 331671),
 ('dr', 326718),
 ('dari', 326464),
 ('anda', 325273),
 ('kep

In [27]:
word_counts.most_common(100)

[('rm', 1874016),
 ('to', 1575740),
 ('in', 1485760),
 ('for', 1364739),
 ('di', 1178209),
 ('the', 1115668),
 ('of', 1088133),
 ('s', 1050269),
 ('sqm', 1022913),
 ('sqft', 1022892),
 ('properties', 976728),
 ('malaysia', 919802),
 ('more', 693704),
 ('yang', 659794),
 ('ini', 635270),
 ('video', 631991),
 ('news', 563769),
 ('dengan', 508396),
 ('a', 498495),
 ('new', 477752),
 ('dan', 462913),
 ('untuk', 458248),
 ('on', 454915),
 ('ke', 432370),
 ('berita', 427828),
 ('and', 425206),
 ('no', 422817),
 ('tak', 414164),
 ('rent', 408566),
 ('dalam', 389925),
 ('najib', 385717),
 ('anak', 362569),
 ('tidak', 354724),
 ('is', 352756),
 ('sale', 341295),
 ('mahathir', 339233),
 ('dunia', 326064),
 ('lifestyle', 324680),
 ('umno', 316934),
 ('with', 300981),
 ('tahun', 298081),
 ('sukan', 292616),
 ('negara', 291113),
 ('pas', 277138),
 ('jadi', 275003),
 ('tv', 273781),
 ('buat', 268337),
 ('nasional', 261815),
 ('saya', 261464),
 ('rtb', 261208),
 ('as', 256333),
 ('m', 253828),
 ('foo

In [217]:
# how many words of the 1-gram counts never occur inside any 2-gram
sum(1 for word in word_counts if word not in word_1_grams)

18958

In [220]:
# Most frequent words of the 1-gram counts that never occur inside any 2-gram,
# restricted to words that are in ms_words or are not in eng_words_large.
# NOTE: ms_words and eng_words_large are built by cells that appear further down.
[
    (word, count)
    for word, count in word_counts.most_common()
    if word not in word_1_grams
    and (word in ms_words or word not in eng_words_large)
][:100]

[('sqm', 1022913),
 ('sqft', 1022892),
 ('jpg', 151044),
 ('seagames', 35652),
 ('ekoniaga', 26220),
 ('eksentrik', 25636),
 ('entertainmet', 19713),
 ('fourthofficial', 17828),
 ('australiaopen', 17826),
 ('suganya', 17794),
 ('mhtv', 17749),
 ('myresipi', 16567),
 ('suudu', 16528),
 ('ebiz', 15616),
 ('newsbriefs', 15502),
 ('rojakdaily', 13137),
 ('arenakereta', 13077),
 ('amazingnara', 13077),
 ('mybooks', 12878),
 ('webtv', 12517),
 ('ipa', 12387),
 ('advancedsearch', 12386),
 ('sgbagus', 12354),
 ('aizam', 11727),
 ('hiadiputra', 11178),
 ('qibod', 9658),
 ('gedjet', 9589),
 ('shafika', 9448),
 ('enewsletter', 9144),
 ('natonline', 9041),
 ('worldsbk', 9002),
 ('onerror', 8521),
 ('kathirasen', 8519),
 ('mhcover', 8464),
 ('iklaneka', 8276),
 ('mhbreakfirst', 8009),
 ('klasified', 6843),
 ('rapsodi', 5976),
 ('sukansinar', 5874),
 ('gpsbestari', 5485),
 ('kampusuols', 5483),
 ('twitterdev', 5471),
 ('hijabster', 4427),
 ('terimakasihcikgu', 3724),
 ('hmm', 3532),
 ('movc', 3492),

In [33]:
[word for word, _ in word_1_grams.most_common() if word not in word_counts]

['energia']

In [30]:
sum(word_counts.values())

168836600

In [29]:
word_counts['advancedsearch'] / sum(word_counts.values())

7.336087080644837e-05

In [31]:
word_1_grams['energia']

1

In [180]:
def similarity(n1, n2):
    """
    Ratio of the smaller of two counts to the larger one.

    Intended for non-negative counts, where the result lies in [0, 1]:
    1.0 means the counts are equal, 0.0 means exactly one of them is zero.

    :param n1: first count
    :param n2: second count
    :return: ``min(n1, n2) / max(n1, n2)``, or 1.0 when both are equal
    """
    # Equal counts are maximally similar; answering here also keeps
    # similarity(0, 0) from dividing by zero.
    if n1 == n2:
        return 1.0
    return min(n1, n2) / max(n1, n2)

In [182]:
# every word known to either counter
all_ms_words = set(word_counts) | set(word_1_grams)

In [200]:
sum(word_counts.values()) / 1000 / 1000

168.8366

In [199]:
sum(word_1_grams.values()) / 1000 / 1000

219.384828

In [204]:
len(all_ms_words)

96665

In [213]:
# Score how closely the two frequency estimates of each word agree.
# A word is kept when it is not in the large English list, is frequent enough
# in at least one counter, and its score is strictly between 0 and 1
# (0 means the word is missing from one counter, 1 means both counts are equal).
# NOTE(review): the thresholds 160 / 210 are presumably about one occurrence per
# million tokens of each counter's total - confirm.
word_sim = Counter()
for word in all_ms_words:
    if word in eng_words_large:
        continue
    direct_count = word_counts[word]
    rebuilt_count = word_1_grams[word]
    if not (direct_count > 160 or rebuilt_count > 210):
        continue
    # computed once per word instead of once per condition
    score = similarity(direct_count, rebuilt_count)
    if 0 < score < 1:
        word_sim[word] = score
len(word_sim)

15198

In [215]:
word_sim.most_common()[4000::-1]

[('lada', 0.7623947614593077),
 ('ustazah', 0.7624286878565607),
 ('menunggang', 0.7624336069531628),
 ('ucapkan', 0.7625368731563422),
 ('pekerjaan', 0.7625428168446504),
 ('futsal', 0.7626774847870182),
 ('dahulu', 0.762719428741446),
 ('semangat', 0.7627469646107854),
 ('muslimah', 0.7627705627705628),
 ('chatime', 0.7627952755905512),
 ('manjakan', 0.7628032345013477),
 ('juruwang', 0.7628571428571429),
 ('renjer', 0.7630057803468208),
 ('berseorangan', 0.7630522088353414),
 ('fitri', 0.7631086142322098),
 ('pencegahan', 0.7631233595800525),
 ('mendominasi', 0.7631578947368421),
 ('menular', 0.7632311977715878),
 ('ditunda', 0.7632612966601179),
 ('comfortdelgro', 0.7633136094674556),
 ('shafee', 0.7633644169513434),
 ('propertyguru', 0.7634050081654872),
 ('perlawanan', 0.7634458672875436),
 ('mendebarkan', 0.7634803921568627),
 ('mydin', 0.7635933806146572),
 ('luqman', 0.7637260950030845),
 ('ketokohan', 0.7637614678899083),
 ('demokrat', 0.7637795275590551),
 ('وانيتا', 0.76377

In [195]:
print(word_counts['vjc'])
print(word_1_grams['vjc'])

30
60


In [4]:
# case-folded vocabulary read from the Malay word list file
with open('words_ms.txt', encoding='utf8') as f:
    ms_words = {token.casefold() for token in set(words(f.read()))}

In [8]:
# with open('words_ms.txt', 'wt', encoding='utf8') as f:
#     for word in sorted(ms_words):
#         f.write(word + '\n')

In [5]:
from bhanot_dictionary import definitions

In [6]:
# extend the vocabulary with the case-folded entries of `definitions`
ms_words.update(entry.casefold() for entry in definitions)

In [7]:
len(ms_words)

49651

In [164]:
# case-folded vocabulary read from the English word list file
with open('words_en.txt', encoding='utf8') as f:
    eng_words = {token.casefold() for token in set(words(f.read()))}

In [167]:
len(eng_words)

41120

In [168]:
# Large English vocabulary merged from three word list files.
# Each word is stored twice: case-folded as written, and case-folded after
# unidecode() has transliterated it.
eng_words_large = set()
for wordlist_path in ('american-english-insane.big.txt',
                      'british-english-insane.big.txt',
                      'canadian-english-insane.big.txt'):
    with open(wordlist_path, 'rt', encoding='utf8') as f:
        for word in words(f.read()):
            eng_words_large.add(word.casefold())
            eng_words_large.add(unidecode(word).casefold())

In [169]:
len(eng_words_large)

497577

In [170]:
# Small English vocabulary; each word is stored case-folded both as written
# and after unidecode() has transliterated it.
eng_words_small = set()
with open('english1k.txt', 'rt', encoding='utf8') as f:
    for token in words(f.read()):
        eng_words_small.update((token.casefold(), unidecode(token).casefold()))

In [171]:
len(eng_words_small)

1017

In [62]:
len(word_1_grams)

77707

In [172]:
# number of rebuilt 1-gram words longer than one character that are in ms_words or not in eng_words
sum(1 for word in word_1_grams
    if (word in ms_words or word not in eng_words) and len(word) > 1)

57434

In [173]:
len(word_counts)

96664

In [174]:
# number of counted words longer than one character that are in ms_words or not in eng_words
sum(1 for word in word_counts
    if (word in ms_words or word not in eng_words) and len(word) > 1)

76032

In [177]:
# of the 100 most frequent counted words, those longer than one character
# that are in ms_words or not in eng_words
[
    (word, count)
    for word, count in word_counts.most_common(100)
    if (word in ms_words or word not in eng_words) and len(word) > 1
]

[('rm', 1874016),
 ('di', 1178209),
 ('sqm', 1022913),
 ('sqft', 1022892),
 ('malaysia', 919802),
 ('yang', 659794),
 ('ini', 635270),
 ('dengan', 508396),
 ('dan', 462913),
 ('untuk', 458248),
 ('ke', 432370),
 ('berita', 427828),
 ('tak', 414164),
 ('dalam', 389925),
 ('najib', 385717),
 ('anak', 362569),
 ('tidak', 354724),
 ('mahathir', 339233),
 ('dunia', 326064),
 ('umno', 316934),
 ('tahun', 298081),
 ('sukan', 292616),
 ('negara', 291113),
 ('jadi', 275003),
 ('buat', 268337),
 ('nasional', 261815),
 ('saya', 261464),
 ('rtb', 261208),
 ('dr', 242053),
 ('terkini', 239025),
 ('hari', 232002),
 ('akan', 225498),
 ('johor', 215602),
 ('pada', 213014),
 ('anwar', 211300),
 ('lagi', 209326),
 ('lalu', 206732),
 ('kl', 202656),
 ('kami', 200022),
 ('jam', 198049),
 ('lebih', 196917),
 ('boleh', 196523),
 ('isu', 196461),
 ('utama', 195099),
 ('nak', 191906),
 ('orang', 191204),
 ('korang', 186482),
 ('raja', 184775),
 ('anda', 183868),
 ('pru', 182380),
 ('hiburan', 182349),
 ('ada'

In [178]:
# of the 100 most frequent rebuilt 1-gram words, those longer than one character
# that are in ms_words or not in eng_words
[
    (word, count)
    for word, count in word_1_grams.most_common(100)
    if (word in ms_words or word not in eng_words) and len(word) > 1
]

[('di', 2276706),
 ('yang', 1273597),
 ('malaysia', 1026458),
 ('dengan', 970989),
 ('untuk', 888952),
 ('dan', 886084),
 ('ini', 783483),
 ('ke', 757577),
 ('dalam', 738041),
 ('tak', 706016),
 ('rm', 658126),
 ('tidak', 643949),
 ('jadi', 512983),
 ('anak', 509116),
 ('buat', 498053),
 ('akan', 430866),
 ('najib', 423704),
 ('mahathir', 398924),
 ('pada', 390717),
 ('umno', 390672),
 ('boleh', 365123),
 ('saya', 345137),
 ('hari', 343108),
 ('perlu', 342857),
 ('tahun', 332213),
 ('nak', 331671),
 ('dr', 326718),
 ('dari', 326464),
 ('anda', 325273),
 ('kepada', 317330),
 ('korang', 315428),
 ('lebih', 309124),
 ('ada', 304873),
 ('dunia', 301223),
 ('negara', 299454),
 ('orang', 298827),
 ('wanita', 267419),
 ('raya', 264462),
 ('rtb', 260748),
 ('lagi', 256625),
 ('bakal', 255744),
 ('lelaki', 253324),
 ('kini', 247105),
 ('bukan', 244913),
 ('anwar', 243647),
 ('petra', 237672),
 ('mahu', 231719),
 ('rumah', 231077),
 ('kongsi', 226476),
 ('johor', 222524),
 ('dua', 222317),
 ('is