In [1]:
import os,nltk,re,codecs,string,pickle
nltk.download('punkt')
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
from itertools import groupby

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bluthund\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [2]:
# Key map used by codegen() and textclean(). Each entry is
# [Tamil characters, key]: codegen() replaces every character of the first
# string with the single key character in the second, and textclean() keeps
# only the characters listed here (plus 'P' and the space).
# NOTE(review): the vowel strings contain a literal space, and both functions
# iterate it like any other character. It is presumably just a visual
# separator between dependent vowel signs and independent vowels - confirm
# before customizing.
charmap = [# vowel grouping; customize as required (the author considers this split the most optimal), 9 keys
           ['ா அஆ','a'],
           ['ி ீ இஈ','i'],
           ['ு ூ உஊ','u'],
           ['ெ ே எஏ','e'],
           ['ை ஐ','y'],
           ['ொ ோ ஒஓ','o'],
           ['ௌ ஔ','w'],
           ['ஃ','k'],
           ['்','x'],
           # consonant grouping; other groupings may be worth experimenting with, 10 keys
           ['கங','0'],
           ['சஞ','1'],
           ['டண','2'],
           ['தந','3'],
           ['பம','4'],
           ['யர','5'],
           ['லவழ','6'],
           ['ளறன','7'],
           ['ஜஶஷ','8'],
           ['ஸஹ','9']]

# Tamil numeral characters: textclean() drops words that contain any of them
rmvstr = '௦௧௨௩௪௫௬௭௮௯௰' # to remove all words containing Tamil numerals

def codegen(str):
    """Return the key code of the last whitespace-separated word in ``str``.

    Every character listed in ``charmap`` is replaced by its key (vowels and
    signs become letters, consonants become digits).  A digit that is not
    followed by a letter then receives the missing 'a' vowel, so the result
    alternates digit/letter.

    Parameters
    ----------
    str : text whose last word is encoded (for a word pair only the second
        word contributes to the code).

    Returns
    -------
    The code string, or '' when ``str`` is empty or whitespace-only.
    """
    words = str.split()
    if not words:
        # Nothing to encode; previously this raised IndexError.
        return ''
    code = words[-1]

    for entry in charmap:
        key = entry[1]
        for ch in entry[0]:
            # Plain replacement: the characters are literals, not regex patterns.
            code = code.replace(ch, key)

    # Introduce the missing 'a' vowel between two digits.  The lookahead does
    # not consume the second digit, so runs of three or more digits are fully
    # separated in a single pass (a consuming pattern skips every other pair).
    code = re.sub(r'([0-9])(?=[0-9])', r'\g<1>a', code)
    # ...and after a trailing digit.
    code = re.sub(r'(\d\s*$)', r'\g<1>a', code)
    return code

def codegroup(file_name,file_temp):
    """Rewrite ``file_name`` grouped by code, using ``file_temp`` as scratch.

    ``file_name`` is expected to hold lines of the form ``<code> <word> <word>``.
    The lines are sorted and written to ``file_temp``; ``file_name`` is then
    rewritten so that each code appears once as a ``>code`` header line,
    followed by the word pairs of that code with their word order reversed.
    Lines are terminated with CRLF.

    Groups whose code does not match ``[a-z]?([0-9][a-z])+`` are dropped, as
    are lines that carry a code but no words.

    Parameters
    ----------
    file_name : path of the file to regroup (read, then overwritten).
    file_temp : path of the scratch file; it is left holding the sorted lines.
    """
    # Pass 1: sort the lines into the scratch file.  Every handle is closed
    # (and therefore flushed) before the scratch file is opened for reading.
    with codecs.open(file_name, "r", "utf-8") as source:
        sorted_lines = sorted(source.readlines())
    with codecs.open(file_temp, "w+", "utf-8") as scratch:
        scratch.writelines(sorted_lines)

    valid_code = re.compile(r"^[a-z]?([0-9][a-z])+$")

    # Pass 2: group consecutive lines by their leading code.
    with codecs.open(file_temp, "r", "utf-8") as fin, \
            codecs.open(file_name, "w", "utf-8") as out:
        records = (line.split(None, 1) for line in fin if line.strip())
        for code, group in groupby(records, key=lambda rec: rec[0]):
            if not valid_code.search(code):
                # aberrant code: skip the whole group
                continue
            pairs = [rec[1] for rec in group if len(rec) > 1]
            if not pairs:
                continue
            out.write('>' + code + '\r\n')
            for pair in pairs:
                # reverse bigram words for the subsequent grouping operation
                words = pair.split()
                words.reverse()
                out.write(" ".join(words) + '\r\n')
    
def ltf(file_name,file_temp,list):
    """Write a list of counted bigrams to ``file_name`` and group it by code.

    Each item is converted with ``str()`` and stripped of parentheses, commas,
    quotes and ASCII digits, which turns ``(('w1', 'w2'), count)`` into
    ``"w1 w2"``.  Pairs ending in the punctuation marker ``P`` are skipped.
    Every remaining pair is written as ``<code> <pair>`` (CRLF terminated),
    where the code comes from ``codegen``.  Finally ``codegroup`` regroups the
    file in place.

    Parameters
    ----------
    file_name : output path (overwritten).
    file_temp : scratch path handed to ``codegroup``.
    list : iterable of items, normally ``((word1, word2), count)`` tuples.
    """
    with codecs.open(file_name, "w", 'utf-8') as out:
        for item in list:
            # removing unwanted characters
            item_clean = re.sub(r"[(),'0-9]*", "", str(item))
            # removing excess P<space> strings
            item_clean = re.sub(r"(P\s)+", "P ", item_clean)
            # removing more unwanted Ps (a P glued to the following character)
            item_clean = re.sub(r"P([^\s\r])", r"\1", item_clean)
            # removing the trailing space
            item_clean = re.sub(r"\s$", "", item_clean)
            if not item_clean.strip():
                # Nothing left after cleaning: there is no word to encode.
                continue
            if item_clean[-1] != 'P':
                code = codegen(item_clean)
                out.write(code + ' ' + item_clean + '\r\n')
    codegroup(file_name, file_temp)
        
def ftl(file_name):
    """Read a UTF-8 text file and return its lines as a list.

    The line terminator (CRLF or a bare LF) is removed from each line; all
    other content is returned unchanged.

    Parameters
    ----------
    file_name : path of the file to read.

    Returns
    -------
    list of str, one entry per line, in file order.
    """
    results = []
    # The context manager closes the file even if reading or decoding fails.
    with codecs.open(file_name, 'r', 'utf-8') as f:
        for line in f:
            if line.endswith('\r\n'):
                line = line[:-2]
            elif line.endswith('\n'):
                line = line[:-1]
            results.append(line)
    return results

def textclean(file_name,file_temp):
    """Clean the raw text in ``file_name`` and write the result to ``file_temp``.

    For every input line:
      * runs of punctuation (! . ? ( ) ' / " - : [ ] |) become the marker ``P``;
      * commas are deleted;
      * words containing a character from ``rmvstr`` (Tamil numerals) are dropped;
      * every character that is neither ``P`` nor listed in ``charmap`` is deleted;
      * consecutive ``P`` markers collapse into one.
    The surviving words are written on one line, each followed by a single
    space, and the line is terminated with CRLF.

    Parameters
    ----------
    file_name : path of the UTF-8 source text.
    file_temp : path of the cleaned output (overwritten).
    """
    # Loop-invariant lookups, built once instead of once per line.
    # NOTE(review): a literal 'P' in the source text survives the filter and is
    # indistinguishable from the punctuation marker (unchanged behaviour).
    keep = set('P')
    for entry in charmap:
        keep.update(entry[0])
    numerals = set(rmvstr)
    punctuation = re.compile(r"[!.?()'/\"\-:\[\]|]+")
    marker_run = re.compile(r"(P )+")

    with codecs.open(file_name, "r", "utf-8") as fin, \
            codecs.open(file_temp, "w", "utf-8") as fout:
        for line in fin:
            # punctuation conversion
            line = punctuation.sub(" P ", line)
            line = line.replace(",", "")

            words = []
            # Splitting on any whitespace first keeps tabs and similar
            # characters as word separators instead of deleting them (which
            # fused neighbouring words).
            for word in line.split():
                # removing words with Tamil numerals, wherever they sit in the line
                if numerals.intersection(word):
                    continue
                # keeping only the Tamil character set from charmap (and P)
                word = ''.join(ch for ch in word if ch in keep)
                if word:
                    words.append(word)

            cleaned = ''.join(word + ' ' for word in words)
            # Whitespace is already normalized here, so every run of markers
            # collapses, including ones that were separated by several spaces.
            cleaned = marker_run.sub("P ", cleaned)
            fout.write(cleaned + '\r\n')

In [47]:
fn_src = "sample"
fn_bgm = "bgm"
fn_tmp = "temp" # stores cleaned source text; ltf passes it on to codegroup as its scratch file

textclean(fn_src,fn_tmp)

# Read the cleaned text and close the handle right away: fn_tmp is overwritten
# further down by ltf -> codegroup, so it must not stay open here.
with codecs.open(fn_tmp,"r","utf-8") as rwtxt:
    rwtxt_r = rwtxt.read()

# fn_bgm is created and written by ltf itself; no handle is opened for it here
# (the former unused "w+" handle was never closed).

token = word_tokenize(rwtxt_r)

#Create your bigrams
bigrm = ngrams(token,2)
bigrm_count = Counter(bigrm)
bigrm_list = list(bigrm_count.items())

ltf(fn_bgm,fn_tmp,bigrm_list)