# Russian Tokenization Experiments - PROGRESS

In [1]:
import os, sys
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path) 

from os import listdir
from os.path import isfile, join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re
import math

#force reimport
if 'pygents.util' in sys.modules:
    del sys.modules['pygents.util']
if 'pygents.text' in sys.modules:
    del sys.modules['pygents.text']
if 'pygents.plot' in sys.modules:
    del sys.modules['pygents.plot']
if 'pygents.token' in sys.modules:
    del sys.modules['pygents.token']

from pygents.util import * 
from pygents.text import * 
from pygents.plot import * 
from pygents.token import * 

lex_en = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_english.txt"
lex_ru = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_russian.txt"


In [2]:
from pygents.text import tokenize_with_sorted_lexicon

class LexiconTokenizer(Tokenizer):

    def __init__(self, name=None, lexicon=None, cased=False, url=None, debug=False):
        Tokenizer.__init__(self,debug=debug)
        self.name = name
        if not lexicon is None: 
            self.alex = list(lexicon) #copy
        else:
            lex_lines = url_lines(url)
            self.alex = [re.split('\t| |,|;|\n|\r',line)[0] for line in lex_lines] #load from url
            # TODO load from file
        self.compile()
        self.cased = cased

    def compile(self):
        self.alex.sort(key=len,reverse=True) #precompile

    def tokenize(self,text):
        return tokenize_with_sorted_lexicon(self.alex,text,cased=self.cased)

assert str(LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("tunaisafish.catisamammal"))=="['tuna', 'is', 'a', 'fish', '.', 'cat', 'is', 'a', 'mammal']"    
assert str(LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.Cat', 'is', 'a', 'mammal']"
assert str(LexiconTokenizer(lexicon=['tuna','is','fish','cat','mammal'],cased=True).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.', 'Cat', 'is', 'a', 'mammal']"


In [3]:
def prefixed_match_from_list(lst,text):
    for item in lst:
        if text.startswith(item[0]):
            return item
    return None

def prefixed_match(prefixed_dict,text):
    letter = text[0]
    if not letter in prefixed_dict:
        return None
    return prefixed_match_from_list(prefixed_dict[letter],text)

def tokenize_with_prexied_sorted_lexicon(prefixed_dict,text,cased=False):
    original = text
    if cased: #if need to spend time on lowercasing non-lowercased text
        text = text.lower()
    tokens = []
    start = 0
    cur = 0
    length = len(text)
    sum_weight = 0
    while cur < length:
        subtext = text[cur:]
        word_weight = prefixed_match(prefixed_dict,subtext)
        #print(al)
        if not word_weight is None:
            word_len = len(word_weight[0])
            if start < cur:
                tokens.append(original[start:cur])
            tokens.append(original[cur:cur+word_len])
            sum_weight += word_weight[1]
            cur += word_len
            start = cur
        else:
            cur += 1
            #print('yo')
    if start < cur:
        tokens.append(original[start:cur])
        #print(original[start:cur])
    return tokens, sum_weight

def tabbed_line2tuple(line,log=True):
    lst = re.split('\t| |,|;|\n|\r',line)
    if len(lst) > 1:
        return (lst[0],float(lst[1]) if not log else math.log10(1+float(lst[1])))
    else:
        return (lst[0],1.0)

def weightedlist2dict(lst,lower=False): # (key,weight) -> sum weigts by keys, keys may be lowercased
    dic = {}
    for item in lst:
        dictcount(dic,item[0].lower() if lower else item[0],item[1])
    return dic

class LexiconIndexedTokenizer(Tokenizer):

    def __init__(self, name=None, lexicon=None, cased=False, debug=False, url=None, sortmode=0):
        Tokenizer.__init__(self,debug=debug)
        self.name = name
        if not lexicon is None: 
            self.freqlist = [(word,1.0) for word in lexicon] #copy
        else:
            lex_lines = url_lines(url)
            self.freqlist = [tabbed_line2tuple(line) for line in lex_lines] #load from url
            # TODO load from file
        self.sortmode = sortmode
        self.compile()
        self.cased = cased

    def compile(self):
        self.dict = {}
        self.fulldict = weightedlist2dict(self.freqlist,lower=True) # save for debugging only!?
        for key in self.fulldict:
            value = self.fulldict[key]
            if len(key) > 0:
                letter = key[0]
                if not letter in self.dict:
                    self.dict[letter] = set()
                self.dict[letter].add((key,value))
        #print(self.dict['f'])
        for key in self.dict:
            lst = list(self.dict[key])
            if self.sortmode == 0:
                lst.sort(key=lambda s: len(s[0]), reverse=True)
            elif self.sortmode == 1:
                lst.sort(key=lambda s: s[1], reverse=True)
            else:
                lst.sort(key=lambda s: math.log10(s[1])*len(s[0]), reverse=True)
            self.dict[key] = lst
        #print(self.dict['f'])

    def tokenize(self,text):
        tokens, weight = tokenize_with_prexied_sorted_lexicon(self.dict,text,cased=self.cased)
        return tokens

    def tokenize_weight(self,text):
        tokens, weight = tokenize_with_prexied_sorted_lexicon(self.dict,text,cased=self.cased)
        length = len(tokens)
        return tokens, 0 if length == 0 else weight / length 

assert str(LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("tunaisafish.catisamammal"))=="['tuna', 'is', 'a', 'fish', '.', 'cat', 'is', 'a', 'mammal']"    
assert str(LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal']).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.Cat', 'is', 'a', 'mammal']"
assert str(LexiconIndexedTokenizer(lexicon=['tuna','is','fish','cat','mammal'],cased=True).tokenize("Tunaisafish.Catisamammal"))=="['Tuna', 'is', 'a', 'fish', '.', 'Cat', 'is', 'a', 'mammal']"


## English

In [4]:
text = "tunaisafish.catisamammal"
expected = ['tuna', 'is', 'a', 'fish', '.', 'cat', 'ia', 'a', 'mammal']

In [5]:
lt0 = LexiconIndexedTokenizer(url = lex_en, sortmode=0)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['tuna', 'isa', 'fish', '.', 'cati', 'sama', 'mma', 'l']
2.9807663087309058 0.35294117647058826


In [6]:
lt1 = LexiconIndexedTokenizer(url = lex_en, sortmode=1)
tokens, weight = lt1.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['t', 'un', 'a', 'is', 'a', 'f', 'is', 'h', '.', 'c', 'a', 't', 'is', 'a', 'm', 'a', 'm', 'm', 'a', 'l']
5.729628877142061 0.2758620689655173


In [7]:
lt2 = LexiconIndexedTokenizer(url = lex_en, sortmode=2)
tokens, weight = lt2.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['tuna', 'is', 'af', 'is', 'h', '.', 'cat', 'is', 'ama', 'mm', 'al']
4.544507159677375 0.39999999999999997


In [8]:
text = "Tuna is a fish. Cat is a mammal"
expected = tokenize_split_with_delimiters_and_quotes(text)
print(expected)

['Tuna', ' ', 'is', ' ', 'a', ' ', 'fish', '.', ' ', 'Cat', ' ', 'is', ' ', 'a', ' ', 'mammal']


In [9]:
lt0 = LexiconIndexedTokenizer(url = lex_en, sortmode=0, cased = True)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(expected)
print(weight,calc_f1(expected,tokens))

['Tuna', ' ', 'is', ' ', 'a', ' ', 'fish', '. ', 'Cat', ' ', 'is', ' ', 'a', ' ', 'mammal']
['Tuna', ' ', 'is', ' ', 'a', ' ', 'fish', '.', ' ', 'Cat', ' ', 'is', ' ', 'a', ' ', 'mammal']
2.923531848929005 0.9032258064516129


## Russian

In [10]:
text = "расцветалияблониигруши,поплылитуманынадрекой"
expected = ['расцветали', 'яблони', 'игруши', ',', 'поплыли', 'туманы', 'над', 'рекой']

In [11]:
lt0 = LexiconIndexedTokenizer(url = lex_ru, sortmode=0)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['расцвета', 'лия', 'бл', 'они', 'игру', 'ши', ',', 'поплыли', 'туманы', 'над', 'рекой']
4.034137991553761 0.5263157894736842


In [12]:
lt1 = LexiconIndexedTokenizer(url = lex_ru, sortmode=1)
tokens, weight = lt1.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['ра', 'с', 'цвет', 'али', 'яблони', 'игр', 'уши', ',', 'по', 'плыли', 'ту', 'ма', 'ны', 'на', 'др', 'е', 'ко', 'й']
5.526441728891956 0.15384615384615383


In [13]:
lt2 = LexiconIndexedTokenizer(url = lex_ru, sortmode=2)
tokens, weight = lt2.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['расцвета', 'ли', 'яблони', 'игру', 'ши', ',', 'поплыли', 'туман', 'ы', 'над', 'рекой']
4.027652477481268 0.5263157894736842


In [14]:
lt2.fulldict['туманы']

2.734799829588847

In [15]:
lt2.fulldict['авва']

5.053785038134658

In [16]:
text = "Расцветали яблони и груши, поплыли туманы над рекой"
expected = tokenize_split_with_delimiters_and_quotes(text)
print(expected)

['Расцветали', ' ', 'яблони', ' ', 'и', ' ', 'груши', ',', ' ', 'поплыли', ' ', 'туманы', ' ', 'над', ' ', 'рекой']


In [17]:
lt0 = LexiconIndexedTokenizer(url = lex_ru, sortmode=0, cased = True)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(expected)
print(weight,calc_f1(expected,tokens))

['Расцвета', 'ли', ' ', 'яблони', ' и ', 'груши', ', ', 'поплыли', ' ', 'туманы', ' ', 'над', ' ', 'рекой']
['Расцветали', ' ', 'яблони', ' ', 'и', ' ', 'груши', ',', ' ', 'поплыли', ' ', 'туманы', ' ', 'над', ' ', 'рекой']
2.667408660328346 0.6666666666666666


In [18]:
#TODO compute score with account to number of letters in token AND / OR log of frequency?
#TODO build alternative graphs and score them!?


