In [1]:
import os
import re
import sys
import json
import asyncio
from collections import Counter

import numpy as np
from tqdm import tqdm

sys.path.append('../')
sys.path.append('../utils')

from utils.edit_distance import generate_phonetic_edits, load_words, generate_1_edit_tokens, generate_2_edit_tokens, tokenize_word, edit_distance
from utils.file_utils import get_files_recursively
from utils.preprocess import get_suffixes, process_word_suffix, split_sentences, remove_punctuation, clean_word

In [2]:
ONLINEKHABAR_PATH = '/home/bibek/projects/scrapeet/_scraped/onlinekhabar'
CORPUS_PATH = '/home/bibek/projects/scrapeet/_scraped'
VOCAB = set(load_words())
SUFFIXES = get_suffixes()

In [3]:
def get_phonetic_match(word):   
    phonetics = [x for x in generate_phonetic_edits(word)[:2] if x in VOCAB]
    #phonetics = [word]
    if len(phonetics) == 1:
        # print(phonetics[0], 'present after phonetic edits', word)
        return phonetics[0]
    return word

    
def get_correct_word(word):
    #if word in pre_existing:
    #    return pre_existing[word]
    
    if word in VOCAB:
        #pre_existing[word] = word
        return word
    '''
    phonetic_match = get_phonetic_match(word)
    if phonetic_match:
        pre_existing[word] = phonetic_match
        return phonetic_match
    '''
    
    word_suffix = process_word_suffix(SUFFIXES, word)
    if not word_suffix:
        #pre_existing[word] = word
        return word
    if word_suffix[0] in VOCAB:
        # pre_existing[word] = word
        return word
    phonetic_match = get_phonetic_match(word_suffix[0])
    return phonetic_match + ''.join(word_suffix[1:])
        
    
def get_trigrams(sent):
    l = len(sent)
    trigrams = []
    for x in range(0, l - 3 + 1, 3):
        trigrams.append(tuple(sent[x:x+3]))
    return trigrams


def get_bigrams(sent):
    l = len(sent)
    bigrams = []
    for x in range(1, l-2 + 1, 2):
        bigrams.append(tuple(sent[x:x+2]))
    return bigrams

In [4]:
from multiprocessing import Pool, Manager

OK_FILES = get_files_recursively(CORPUS_PATH)
files_count = len(OK_FILES)
WORD_IDS = {}

def add_word(word):
    global WORD_IDS
    if word in WORD_IDS:
        return
    WORD_IDS[word] = len(WORD_IDS)


all_sentences = []
NEW_VOCAB = Counter()
TRIGRAMS = Counter()
BIGRAMS = Counter()

CHUNK_SIZE = 12

async def read_file(fname):
    with open(fname) as f:
        f.readline()  # the link
        return remove_punctuation(f.read())
    
START = '<s>'
END = '<e>'
add_word(START)
add_word(END)


def process_file(content):
    pre_existing_cache = {}
    sentences = split_sentences(content)
    if sentences[-1].startswith('प्रकाशित'):
        sentences = sentences[:-1]
    for sentence in sentences:
        words = [clean_word(x) for x in sentence.split() if x]
        # [add_word(x) for x in words]
        words = [get_correct_word(clean_word(x)) for x in sentence.split() if x]
        gram_words = ['<s>', '<s>', *words, '<e>']

        return words, get_bigrams(gram_words), get_trigrams(gram_words)
        
for i in tqdm(range(0, files_count, CHUNK_SIZE)):
    processed_contents = await asyncio.gather(*[read_file(OK_FILES[i+x]) for x in range(CHUNK_SIZE) if i+x < files_count])
    with Pool(16) as p:
        res = map(process_file, processed_contents)
        # p.join()
        for w, bi, tri in res:
            NEW_VOCAB.update(w)
            BIGRAMS.update(bi)
            TRIGRAMS.update(tri)

100%|██████████| 19561/19561 [37:06<00:00,  8.79it/s]


In [5]:
print(len(NEW_VOCAB))

print('WRIGING UNI_COUNTS')
with open('VOCAB_FREQS.json', 'w', encoding='utf-8') as f:
    json.dump(dict(NEW_VOCAB), f, indent=4, ensure_ascii=False)
print("written uni_counts")

print("Writing bigrams")
with open('BIGRAMS.txt', 'w', encoding='utf-8') as f:
    for k, v in BIGRAMS.items():
        f.write(' '.join(map(str, k)) + " " + str(v))
        f.write('\n')
print("Written bigrams")

print("Writing trigrams")
with open('TRIGRAMS.txt', 'w', encoding='utf-8') as f:
    for k, v in TRIGRAMS.items():
        f.write(' '.join(map(str, k)) + " " + str(v))
        f.write('\n')
print("Written trigrams")

172912
WRIGING UNI_COUNTS
written uni_counts
Writing bigrams
Written bigrams
Writing trigrams
Written trigrams


In [4]:
WORD_IDS = json.load(open('word_ids.json'))

In [6]:
# Correct the vocab
original_vocab = set(WORD_IDS.keys())
reverse_word_ids = {v: k for k, v in WORD_IDS.items()}
fixed_count = 0
not_in_original_vocab = 0
fixed = []
for i, each in enumerate(original_vocab):
    if i % 2000 == 0:
        print(i)
    corrected = get_correct_word(each)
    if corrected != each:
        fixed.append((each, corrected))
        fixed_count += 1
print('FIXED', fixed_count, 'OUT OF ', len(original_vocab))
print(fixed)

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000
160000
162000
164000
166000
168000
170000
172000
174000
FIXED 3169 OUT OF  175610
[('जन्जीरको', 'जन्जिरको'), ('पूल', 'पुल'), ('कफिको', 'कफीको'), ('एकनाशको', 'एकनासको'), ('भाईरस', 'भाइरस'), ('फ्लाइटसँगै', 'फ्लाइटसँग'), ('पेटिकामा', 'पेटीकामा'), ('गच्छदारसँगै', 'गच्छदारसँग'), ('कांग्रेसले', 'काँग्रेसले'), ('ईच्छापत्र', 'इच्छापत्र'), ('बोलि', 'बोली'), ('दरवार', 'दरबार'), ('वी', 'बि'), ('ललाइफकाई', 'ललाइफकाइ'), ('हारजीतका', 'हारजितका'), ('गुरूङकी', 'गुरुङकी'), ('परेसँगै', 'परेसँग'), ('फूपु', 'फुपू

In [7]:
w = 'त्यहि'
print(generate_phonetic_edits(w))
from edit_distance import edit_distance

print(edit_distance(tokenize_word('त्यहि'), tokenize_word('त्यही')))
print(edit_distance(tokenize_word('त्यहि'), tokenize_word('त्यै')))

{'त्यहि', 'त्येहि', 'त्येही', 'त्एही', 'त्यही', 'त्एहि'}
3
4
