# Russian Tokenization Experiments - PROGRESS

In [112]:
import os, sys
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path) 

from os import listdir
from os.path import isfile, join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
import math

#force reimport
if 'pygents.util' in sys.modules:
    del sys.modules['pygents.util']
if 'pygents.text' in sys.modules:
    del sys.modules['pygents.text']
if 'pygents.plot' in sys.modules:
    del sys.modules['pygents.plot']
if 'pygents.token' in sys.modules:
    del sys.modules['pygents.token']

from pygents.util import * 
from pygents.text import * 
from pygents.plot import * 
from pygents.token import * 

lex_en = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_english.txt"
lex_ru = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_russian.txt"


In [113]:
from pygents.text import tokenize_with_sorted_lexicon

class LexiconTokenizer(Tokenizer):

    def __init__(self, name=None, lexicon=None, cased=False, url=None, debug=False):
        Tokenizer.__init__(self,debug=debug)
        self.name = name
        if not lexicon is None: 
            self.alex = list(lexicon) #copy
        else:
            lex_lines = url_lines(url)
            self.alex = [re.split('\t| |,|;|\n|\r',line)[0] for line in lex_lines] #load from url
            # TODO load from file
        self.compile()
        self.cased = cased

    def compile(self):
        self.alex.sort(key=len,reverse=True) #precompile

    def tokenize(self,text):
        return tokenize_with_sorted_lexicon(self.alex,text,cased=self.cased)

assert str(LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("tunaisafish.catisamammal"))=="['tuna', 'is', 'a', 'fish', '.', 'cat', 'is', 'a', 'mammal']"    
assert str(LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.Cat', 'is', 'a', 'mammal']"
assert str(LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal'],cased=True).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.', 'Cat', 'is', 'a', 'mammal']"


In [115]:
def prefixed_match_from_list(lst,text):
    for item in lst:
        if text.startswith(item[0]):
            return item
    return None

def prefixed_match(prefixed_dict,text):
    letter = text[0]
    if not letter in prefixed_dict:
        return None
    return prefixed_match_from_list(prefixed_dict[letter],text)

def tokenize_with_prexied_sorted_lexicon(prefixed_dict,text,cased=False):
    original = text
    if cased: #if need to spend time on lowercasing non-lowercased text
        text = text.lower()
    tokens = []
    start = 0
    cur = 0
    length = len(text)
    sum_weight = 0
    while cur < length:
        subtext = text[cur:]
        word_weight = prefixed_match(prefixed_dict,subtext)
        #print(al)
        if not word_weight is None:
            word_len = len(word_weight[0])
            if start < cur:
                tokens.append(original[start:cur])
            tokens.append(original[cur:cur+word_len])
            sum_weight += word_weight[1]
            cur += word_len
            start = cur
        else:
            cur += 1
            #print('yo')
    if start < cur:
        tokens.append(original[start:cur])
        #print(original[start:cur])
    return tokens, sum_weight

def tabbed_line2tuple(line):
    lst = re.split('\t| |,|;|\n|\r',line)
    if len(lst) > 1:
        return (lst[0],float(lst[1]))
    else:
        return (lst[0],1.0)

class LexiconIndexedTokenizer(Tokenizer):

    def __init__(self, name=None, lexicon=None, cased=False, debug=False, url=None, sortmode=0):
        Tokenizer.__init__(self,debug=debug)
        self.name = name
        if not lexicon is None: 
            self.alex = [(word,1.0) for word in lexicon] #copy
        else:
            lex_lines = url_lines(url)
            self.alex = [tabbed_line2tuple(line) for line in lex_lines] #load from url
            # TODO load from file
        self.sortmode = sortmode
        self.compile()
        self.cased = cased

    def compile(self):
        self.dict = {}
        self.fulldict = dict(self.alex) # for debugging only!?
        for entry in self.alex:
            word = entry[0]
            if len(word) > 0:
                letter = word[0]
                if not letter in self.dict:
                    self.dict[letter] = set()
                self.dict[letter].add(entry)
        #print(self.dict['f'])
        for key in self.dict:
            lst = list(self.dict[key])
            if self.sortmode == 0:
                lst.sort(key=lambda s: len(s[0]), reverse=True)
            elif self.sortmode == 1:
                lst.sort(key=lambda s: s[1], reverse=True)
            else:
                lst.sort(key=lambda s: math.log10(s[1])*len(s[0]), reverse=True)
            self.dict[key] = lst
        #print(self.dict['f'])

    def tokenize(self,text):
        tokens, weight = tokenize_with_prexied_sorted_lexicon(self.dict,text,cased=self.cased)
        return tokens

    def tokenize_weight(self,text):
        return tokenize_with_prexied_sorted_lexicon(self.dict,text,cased=self.cased)

assert str(LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("tunaisafish.catisamammal"))=="['tuna', 'is', 'a', 'fish', '.', 'cat', 'is', 'a', 'mammal']"    
assert str(LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.Cat', 'is', 'a', 'mammal']"
assert str(LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal'],cased=True).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.', 'Cat', 'is', 'a', 'mammal']"


## English

In [116]:
lt0 = LexiconIndexedTokenizer(url = lex_en, sortmode=0)
print(lt0.tokenize_weight("tunaisafish.catisamammal"))


(['tuna', 'isa', 'fish', '.', 'cati', 'sama', 'mma', 'l'], 182006.0)


In [117]:
lt1 = LexiconIndexedTokenizer(url = lex_en, sortmode=1)
print(lt1.tokenize_weight("tunaisafish.catisamammal"))


(['t', 'un', 'a', 'is', 'a', 'f', 'is', 'h', '.', 'c', 'a', 't', 'is', 'a', 'm', 'a', 'm', 'm', 'a', 'l'], 118294882.0)


In [118]:
lt2 = LexiconIndexedTokenizer(url = lex_en, sortmode=2)
print(lt2.tokenize("tunaisafish.catisamammal"))


['tuna', 'is', 'af', 'is', 'h', '.', 'cat', 'is', 'am', 'am', 'mal']


## Russian

In [119]:
lt0 = LexiconIndexedTokenizer(url = lex_ru, sortmode=0)
print(lt0.tokenize_weight("расцветалияблониигруши,поплылитуманынадрекой"))


(['расцвета', 'ли', 'яблони', 'игру', 'ши,', 'поплыли', 'туманы', 'над', 'рекой'], 350286.0)


In [120]:
lt1 = LexiconIndexedTokenizer(url = lex_ru, sortmode=1)
print(lt0.tokenize_weight("расцветалияблониигруши,поплылитуманынадрекой"))

(['расцвета', 'ли', 'яблони', 'игру', 'ши,', 'поплыли', 'туманы', 'над', 'рекой'], 350286.0)


In [121]:
lt2 = LexiconIndexedTokenizer(url = lex_ru, sortmode=2)
print(lt0.tokenize_weight("расцветалияблониигруши,поплылитуманынадрекой"))

(['расцвета', 'ли', 'яблони', 'игру', 'ши,', 'поплыли', 'туманы', 'над', 'рекой'], 350286.0)


In [124]:
lt2.dict['л']

[('любопытством', 4120.0),
 ('литературного', 1886.0),
 ('лаборатории', 4977.0),
 ('литературных', 2377.0),
 ('литературной', 2291.0),
 ('литературы', 9495.0),
 ('ликвидировать', 1078.0),
 ('литературе', 7932.0),
 ('литературный', 1629.0),
 ('литературные', 1573.0),
 ('любознательность', 249.0),
 ('лекарственных', 869.0),
 ('любознательности', 230.0),
 ('любопытство', 2439.0),
 ('легкомысленно', 660.0),
 ('любопытства', 2110.0),
 ('литература', 4196.0),
 ('лесохозяйственных', 134.0),
 ('литературным', 1001.0),
 ('лесопользования', 237.0),
 ('литературном', 886.0),
 ('легкомысленный', 330.0),
 ('литературное', 812.0),
 ('литературу', 3075.0),
 ('литературой', 1467.0),
 ('литературная', 785.0),
 ('литературными', 451.0),
 ('ленинградского', 281.0),
 ('лабораторию', 1289.0),
 ('лихорадочно', 1265.0),
 ('литераторов', 1261.0),
 ('литературную', 694.0),
 ('ликвидации', 2503.0),
 ('лейтенант', 5911.0),
 ('легкомысленного', 181.0),
 ('лицензирования', 260.0),
 ('лабораториях', 646.0),
 ('логи

In [None]:
#TODO compute score dividing by number of tokens?
#TODO compute score with account to number of letters in token AND / OR log of frequency?


