In [1]:
import os
import re
import sys
import json
import asyncio
from collections import Counter

import numpy as np
from tqdm import tqdm

sys.path.append('../')
sys.path.append('../utils')

from utils.edit_distance import generate_phonetic_edits, load_words, generate_1_edit_tokens, generate_2_edit_tokens, tokenize_word, edit_distance
from utils.file_utils import get_files_recursively
from utils.preprocess import get_suffixes, process_word_suffix, split_sentences, remove_punctuation, clean_word

In [2]:
# Root directory of the scraped corpus. The default is the path this notebook was
# originally run against; set the SCRAPED_CORPUS_PATH environment variable to
# point the notebook at a corpus stored elsewhere without editing this cell.
CORPUS_PATH = os.environ.get('SCRAPED_CORPUS_PATH', '/home/bibek/projects/scrapeet/_scraped')
# The onlinekhabar sub-directory of the corpus root.
ONLINEKHABAR_PATH = os.path.join(CORPUS_PATH, 'onlinekhabar')
# Known-good words, held in a set for fast membership tests.
VOCAB = set(load_words())
# Suffix data handed to process_word_suffix() when splitting a word.
SUFFIXES = get_suffixes()

In [3]:
def get_phonetic_match(word):
    """Map `word` onto a vocabulary word via its phonetic variants.

    Returns `word` unchanged when it is already in VOCAB. Otherwise the
    variants produced by generate_phonetic_edits() are filtered down to those
    present in VOCAB; if exactly one survives it is returned. With zero or
    several surviving candidates the choice is ambiguous, so `word` is
    returned as-is.
    """
    if word in VOCAB:
        return word

    candidates = [variant for variant in generate_phonetic_edits(word) if variant in VOCAB]
    # Only an unambiguous (single) in-vocabulary variant is accepted.
    return candidates[0] if len(candidates) == 1 else word

    
def get_correct_word(word, pre_existing = {}):
    """Return a vocabulary-normalised spelling of `word`.

    Steps, in order:
      1. A word already in VOCAB is returned untouched.
      2. The word is split with process_word_suffix(SUFFIXES, word). If the
         split is empty, or its first element (the part before the suffixes)
         is itself in VOCAB, the word is returned untouched.
      3. Otherwise the first element is passed through get_phonetic_match()
         and the remaining elements are re-attached to the result.

    `pre_existing` is accepted for use as a memoisation dict but is currently
    neither read nor written by this function.
    """
    if word in VOCAB:
        return word

    parts = process_word_suffix(SUFFIXES, word)
    if not parts or parts[0] in VOCAB:
        return word

    matched_stem = get_phonetic_match(parts[0])
    if not matched_stem:
        return word

    # Glue the original suffix pieces back onto the (possibly corrected) stem.
    return matched_stem + ''.join(parts[1:])

def get_trigrams(sent):
    """Return every consecutive 3-token window of `sent` as a list of tuples.

    The window slides one token at a time, so a sequence of n tokens yields
    n - 2 overlapping trigrams (none when n < 3). Stepping by the window size
    instead would only emit disjoint triples and drop most of the trigrams an
    n-gram frequency table needs.

    Args:
        sent: sequence of tokens (any padding markers already included).

    Returns:
        List of 3-tuples in order of appearance.
    """
    return [tuple(sent[i:i + 3]) for i in range(len(sent) - 2)]


def get_bigrams(sent):
    """Return the consecutive 2-token windows of `sent`, starting at index 1.

    The window slides one token at a time so that overlapping bigrams are all
    emitted; stepping by the window size would only produce disjoint pairs.
    Iteration starts at index 1 rather than 0: the first token is skipped, so
    a sequence padded with two leading start markers (as needed for trigrams)
    does not contribute a start-marker/start-marker bigram.

    Args:
        sent: sequence of tokens (any padding markers already included).

    Returns:
        List of 2-tuples in order of appearance; empty when `sent` has fewer
        than three tokens.
    """
    return [tuple(sent[i:i + 2]) for i in range(1, len(sent) - 1)]

In [None]:
from multiprocessing import Pool, Manager

# Files to process: everything get_files_recursively() finds under CORPUS_PATH
# (note: the whole corpus root, not ONLINEKHABAR_PATH).
OK_FILES = get_files_recursively(CORPUS_PATH)
files_count = len(OK_FILES)
# NOTE(review): WORD_IDS is not populated anywhere in this cell.
WORD_IDS = {}

# NOTE(review): all_sentences is not populated anywhere in this cell.
all_sentences = []
# Corpus-wide frequency tables, filled by the processing loop at the end of this cell.
NEW_VOCAB = Counter()
TRIGRAMS = Counter()
BIGRAMS = Counter()

# Number of files read and handed to the worker pool per loop iteration.
CHUNK_SIZE = 8

async def read_file(fname):
    """Read one scraped file and return its body with punctuation removed.

    The first line of the file (the source link) is discarded; the rest is
    passed through remove_punctuation(). The file is decoded explicitly as
    UTF-8 so the result does not depend on the platform's default encoding.

    Note: the body contains no `await`, so the read itself is blocking even
    though the function is a coroutine.

    Args:
        fname: path of the file to read.

    Returns:
        Whatever remove_punctuation() returns for the file body.
    """
    with open(fname, encoding='utf-8') as f:
        f.readline()  # the link
        return remove_punctuation(f.read())
    
def process_file(content):
    """Turn one file's text into corrected words plus bigram/trigram lists.

    The text is split into sentences; a final sentence starting with
    'प्रकाशित' is dropped (presumably a publication footer - TODO confirm).
    Every word of every remaining sentence is cleaned with clean_word() and
    normalised with get_correct_word(). Each sentence is padded with two
    '<s>' start markers and one '<e>' end marker before n-grams are taken,
    so n-grams never span a sentence boundary.

    Results are accumulated over ALL sentences and returned once at the end;
    returning from inside the loop would silently discard everything after
    the first sentence.

    Args:
        content: punctuation-stripped text of one file.

    Returns:
        Tuple (words, bigrams, trigrams) of lists covering the whole file.
        All three are empty when the file yields no sentences.
    """
    pre_existing_cache = {}
    sentences = split_sentences(content)
    # Guard the [-1] lookup: an empty file produces no sentences at all.
    if sentences and sentences[-1].startswith('प्रकाशित'):
        sentences = sentences[:-1]

    file_words = []
    file_bigrams = []
    file_trigrams = []
    for sentence in sentences:
        words = [get_correct_word(clean_word(x), pre_existing_cache) for x in sentence.split() if x]
        # Two start markers so the first real word still gets a full trigram context.
        gram_words = ['<s>', '<s>', *words, '<e>']
        file_words.extend(words)
        file_bigrams.extend(get_bigrams(gram_words))
        file_trigrams.extend(get_trigrams(gram_words))

    return file_words, file_bigrams, file_trigrams

for i in tqdm(range(0, files_count, CHUNK_SIZE)):
    processed_contents = await asyncio.gather(*[read_file(OK_FILES[i+x]) for x in range(CHUNK_SIZE) if i+x < files_count])
    with Pool(8) as p:
        res = p.map(process_file, processed_contents)
        # p.join()
        for w, bi, tri in res:
            NEW_VOCAB.update(w)
            BIGRAMS.update(bi)
            TRIGRAMS.update(tri)

 99%|█████████▉| 29154/29341 [37:20<00:18, 10.03it/s]    

In [None]:
# Persist the vocabulary frequencies as pretty-printed JSON (non-ASCII kept readable).
print(len(NEW_VOCAB))
with open('VOCAB_FREQS.json', 'w', encoding='utf-8') as vocab_out:
    json.dump(dict(NEW_VOCAB), vocab_out, indent=4, ensure_ascii=False)
print("written new vocab")

# One bigram per line: the two tokens followed by the count, space separated.
print("Writing bigrams")
with open('BIGRAMS.txt', 'w', encoding='utf-8') as bigram_out:
    bigram_out.writelines(
        ' '.join(gram) + " " + str(count) + '\n'
        for gram, count in BIGRAMS.items()
    )
print("Written bigrams")

# One trigram per line: the three tokens followed by the count, space separated.
print("Writing trigrams")
with open('TRIGRAMS.txt', 'w', encoding='utf-8') as trigram_out:
    trigram_out.writelines(
        ' '.join(gram) + " " + str(count) + '\n'
        for gram, count in TRIGRAMS.items()
    )
print("Written trigrams")

In [10]:
# NOTE(review): FIXED_NEW_VOCAB is not defined in any cell visible here; it has to
# exist in the kernel before this cell runs, otherwise this raises NameError.
print(len(FIXED_NEW_VOCAB))
# Write one vocabulary entry per line. The encoding is set explicitly to UTF-8,
# matching the other output files of this notebook, so the Devanagari text does
# not depend on the platform's default encoding.
with open("VOCAB.txt", 'w', encoding='utf-8') as f:
    f.write('\n'.join(FIXED_NEW_VOCAB))

39511


In [7]:
from edit_distance import edit_distance

# Scratch check: show the phonetic variants of one word, then its token-level
# edit distance to two other spellings.
w = 'त्यहि'
print(generate_phonetic_edits(w))

for other_spelling in ('त्यही', 'त्यै'):
    print(edit_distance(tokenize_word(w), tokenize_word(other_spelling)))

{'त्यहि', 'त्येहि', 'त्येही', 'त्एही', 'त्यही', 'त्एहि'}
3
4
