# Russian Tokenization Experiments - PROGRESS

In [7]:
import os, sys
# Locate the project root: the prefix of the current working directory that
# ends with the 'pygents' folder name.
cwd = os.getcwd()
# str.find() returns -1 when the marker is absent, which would silently produce
# a meaningless 6-character path prefix; fail loudly instead.
if 'pygents' not in cwd:
    raise RuntimeError("This notebook must be run from inside the 'pygents' project tree, current directory is: " + cwd)
project_path = cwd[:cwd.find('pygents') + len('pygents')]
# Make the project root importable and switch the working directory to it.
if project_path not in sys.path:
    sys.path.append(project_path)
os.chdir(project_path)

from os import listdir
from os.path import isfile, join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re

# Force re-import: drop any cached pygents submodules from sys.modules so that
# the imports below load them afresh (a missing entry is simply ignored).
sys.modules.pop('pygents.util', None)
sys.modules.pop('pygents.text', None)
sys.modules.pop('pygents.plot', None)
sys.modules.pop('pygents.token', None)

from pygents.util import * 
from pygents.text import * 
from pygents.plot import * 
from pygents.token import * 

# URL of the English lexicon text file in the aigents-java GitHub repository
lex_eng = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_english.txt"


In [49]:
from pygents.text import tokenize_with_sorted_lexicon

class LexiconTokenizer(Tokenizer):
    """Tokenizer that segments text against a lexicon kept longest-word-first.

    The lexicon is either copied from the `lexicon` iterable or, when that is
    None, read from `url` via `url_lines` (only the first field of each line,
    delimited by tab, space, comma, semicolon or line break, is kept).
    `cased` is stored and forwarded unchanged to `tokenize_with_sorted_lexicon`.
    """

    def __init__(self, name=None, lexicon=None, cased=False, url=None, debug=False):
        Tokenizer.__init__(self, debug=debug)
        self.name = name
        if lexicon is None:
            # no in-memory lexicon given: load from url, first field of every line
            self.alex = [re.split('\t| |,|;|\n|\r', row)[0] for row in url_lines(url)]
            # TODO load from file
        else:
            self.alex = list(lexicon)  # private copy, the caller's list is never sorted in place
        self.compile()
        self.cased = cased

    def compile(self):
        """Precompile: order the lexicon in place by word length, longest first."""
        self.alex.sort(key=len, reverse=True)

    def tokenize(self, text):
        """Return the tokens produced by `tokenize_with_sorted_lexicon` for `text`."""
        return tokenize_with_sorted_lexicon(self.alex, text, cased=self.cased)

# All-lowercase input: lexicon words are cut out, the gaps ('a', '.') become tokens of their own
assert str(
    LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("tunaisafish.catisamammal")
) == "['tuna', 'is', 'a', 'fish', '.', 'cat', 'is', 'a', 'mammal']"
# Capitalized input with cased=False: 'Cat' is not recognized, so it stays glued to the preceding '.'
assert str(
    LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("Tunaisafish.Catisamammal")
) == "['Tuna', 'is', 'a', 'fish', '.Cat', 'is', 'a', 'mammal']"
# Capitalized input with cased=True: 'Cat' is recognized and the original casing is kept in the tokens
assert str(
    LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal'],cased=True).tokenize("Tunaisafish.Catisamammal")
) == "['Tuna', 'is', 'a', 'fish', '.', 'Cat', 'is', 'a', 'mammal']"


In [50]:
def prefixed_match_from_list(lst,text):
    """Return the first entry of `lst` that `text` starts with, or None if there is none.

    Entries are tried in list order, so the caller decides the priority
    (e.g. longest-first) by how it orders `lst`.
    """
    return next((candidate for candidate in lst if text.startswith(candidate)), None)

def prefixed_match(prefixed_dict,text):
    """Return the first word that `text` starts with, looked up by its first character.

    `prefixed_dict` maps a single character to a list of candidate words
    beginning with that character; the candidates are tried in list order by
    `prefixed_match_from_list`.

    Returns None when `text` is empty, when its first character has no entry
    in `prefixed_dict`, or when none of the candidates is a prefix of `text`.
    """
    if not text:
        # nothing to match; text[0] would raise IndexError on an empty string
        return None
    letter = text[0]
    if letter not in prefixed_dict:
        return None
    return prefixed_match_from_list(prefixed_dict[letter],text)

def _lower_preserving_length(text):
    """Lowercase `text` while guaranteeing the result has the same length as `text`."""
    lowered = text.lower()
    if len(lowered) == len(text):
        return lowered
    # Some characters (e.g. U+0130) lowercase to more than one code point, which
    # would shift every following offset; keep such characters unchanged.
    pieces = []
    for ch in text:
        low = ch.lower()
        pieces.append(low if len(low) == 1 else ch)
    return ''.join(pieces)

def tokenize_with_prexied_sorted_lexicon(prefixed_dict,text,cased=False):
    """Split `text` into lexicon words and the unmatched stretches between them.

    prefixed_dict -- maps a first character to the list of lexicon words that
        start with it; words are tried in list order, so lists ordered
        longest-first give greedy longest-match behaviour.
    text -- the string to tokenize.
    cased -- when true, matching is done against a lowercased copy of `text`;
        the returned tokens are always sliced from the original `text`.

    Scans left to right. At every position the first matching word becomes a
    token; characters that start no lexicon word are accumulated and emitted
    as one token just before the next match (or at the end of the text).
    Returns the list of tokens, empty for empty input.
    """
    original = text
    if cased: #if need to spend time on lowercasing non-lowercased text
        # offsets found in the lowercased copy are used to slice `original`,
        # so both strings must have identical length
        text = _lower_preserving_length(text)
    tokens = []
    start = 0 # beginning of the pending unmatched stretch
    cur = 0   # current scan position
    length = len(text)
    while cur < length:
        matched = None
        for word in prefixed_dict.get(text[cur], ()):
            # startswith(word, cur) matches in place, without copying the rest
            # of the text at every position; empty entries are skipped because
            # they would never advance the scan
            if word and text.startswith(word, cur):
                matched = word
                break
        if matched is None:
            cur += 1
            continue
        if start < cur:
            tokens.append(original[start:cur]) # flush the unmatched stretch first
        end = cur + len(matched)
        tokens.append(original[cur:end])
        cur = end
        start = end
    if start < cur:
        tokens.append(original[start:cur]) # trailing unmatched stretch
    return tokens

class LexiconIndexedTokenizer(LexiconTokenizer):
    """LexiconTokenizer variant that indexes the lexicon by the first character of each word.

    `compile` builds `self.dict`, mapping a character to the list of distinct
    lexicon words starting with it, longest first; `tokenize` passes that index
    to `tokenize_with_prexied_sorted_lexicon`.
    """

    def __init__(self, name=None, lexicon=None, cased=False, debug=False, url=None):
        LexiconTokenizer.__init__(self, name=name, lexicon=lexicon, cased=cased, url=url, debug=debug)

    def compile(self):
        """Sort the lexicon longest-first and rebuild the first-character index."""
        self.alex.sort(key=len, reverse=True) #precompile
        buckets = {}
        for entry in self.alex:
            if not entry:
                continue # empty entries have no first character to index by
            buckets.setdefault(entry[0], set()).add(entry) # a set drops duplicate words
        # freeze every bucket into a list ordered by word length, longest first
        self.dict = {
            initial: sorted(words, key=len, reverse=True)
            for initial, words in buckets.items()
        }

    def tokenize(self, text):
        """Return the tokens of `text` found with the first-character index."""
        return tokenize_with_prexied_sorted_lexicon(self.dict, text, cased=self.cased)
        
# The indexed tokenizer must give the same results as LexiconTokenizer on the same inputs.
# All-lowercase input
assert str(
    LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("tunaisafish.catisamammal")
) == "['tuna', 'is', 'a', 'fish', '.', 'cat', 'is', 'a', 'mammal']"
# Capitalized input with cased=False: 'Cat' is not recognized and stays glued to '.'
assert str(
    LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("Tunaisafish.Catisamammal")
) == "['Tuna', 'is', 'a', 'fish', '.Cat', 'is', 'a', 'mammal']"
# Capitalized input with cased=True: 'Cat' is recognized, original casing is kept
assert str(
    LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal'],cased=True).tokenize("Tunaisafish.Catisamammal")
) == "['Tuna', 'is', 'a', 'fish', '.', 'Cat', 'is', 'a', 'mammal']"


In [51]:
# Build an indexed tokenizer from the English lexicon; the constructor reads the word list from the URL
lt = LexiconIndexedTokenizer(url = lex_eng)

In [52]:
# Show the index bucket of words starting with 'i' (compile() orders each bucket longest-first)
lt.dict['i']

['immunoelectrophoresis',
 'institutionalization',
 'institutionalisation',
 'internationalization',
 'immunohistochemistry',
 'internationalisation',
 'interdenominational',
 'intellectualization',
 'immunocytochemistry',
 'incomprehensibility',
 'interdisciplinarity',
 'immunoprecipitation',
 'interorganizational',
 'immunohistochemical',
 'immunocytochemical',
 'institutionalizing',
 'incommensurability',
 'interchangeability',
 'immunofluorescence',
 'interrelationships',
 'intercommunication',
 'interconnectedness',
 'industrialisation',
 'institutionalised',
 'intraperitoneally',
 'individualization',
 'incompatibilities',
 'interrelationship',
 'interdependencies',
 'intergovernmental',
 'interdisciplinary',
 'immunosuppression',
 'internationalized',
 'inappropriateness',
 'immunosuppressive',
 'immunocompromised',
 'interprofessional',
 'intercorrelations',
 'institutionalists',
 'industrialization',
 'institutionalized',
 'indistinguishable',
 'indestructibility',
 'individua

In [53]:
# Quick check of the full-lexicon tokenizer on a short unsegmented string
lt.tokenize("tunaisa")

['tuna', 'isa']

In [54]:
# Same input as in the toy-lexicon asserts above, now tokenized against the full English lexicon
lt.tokenize("tunaisafish.catisamammal")


['tuna', 'isa', 'fish', '.', 'cati', 'sama', 'mma', 'l']