# Russian Tokenization Experiments - PROGRESS

In [1]:
import os, sys
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path) 

from os import listdir
from os.path import isfile, join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

#force reimport
if 'pygents.util' in sys.modules:
    del sys.modules['pygents.util']
if 'pygents.text' in sys.modules:
    del sys.modules['pygents.text']
if 'pygents.plot' in sys.modules:
    del sys.modules['pygents.plot']
if 'pygents.token' in sys.modules:
    del sys.modules['pygents.token']

from pygents.util import * 
from pygents.text import * 
from pygents.plot import * 
from pygents.token import * 

lex_en = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_english.txt"
lex_ru = "https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_russian.txt"


## English

In [2]:
text = "tunaisafish.catisamammal"
expected = ['tuna', 'is', 'a', 'fish', '.', 'cat', 'ia', 'a', 'mammal']

In [3]:
lt0 = LexiconIndexedTokenizer(url = lex_en, sortmode=0)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['tuna', 'isa', 'fish', '.', 'cati', 'sama', 'mma', 'l']
2.9807663087309058 0.35294117647058826


In [4]:
lt1 = LexiconIndexedTokenizer(url = lex_en, sortmode=1)
tokens, weight = lt1.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['t', 'un', 'a', 'is', 'a', 'f', 'is', 'h', '.', 'c', 'a', 't', 'is', 'a', 'm', 'a', 'm', 'm', 'a', 'l']
5.729628877142061 0.2758620689655173


In [5]:
lt2 = LexiconIndexedTokenizer(url = lex_en, sortmode=2)
tokens, weight = lt2.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['tuna', 'is', 'af', 'is', 'h', '.', 'cat', 'is', 'ama', 'mm', 'al']
4.544507159677375 0.39999999999999997


In [6]:
text = "Tuna is a fish. Cat is a mammal"
expected = tokenize_split_with_delimiters_and_quotes(text)
print(expected)

['Tuna', ' ', 'is', ' ', 'a', ' ', 'fish', '.', ' ', 'Cat', ' ', 'is', ' ', 'a', ' ', 'mammal']


In [7]:
lt0 = LexiconIndexedTokenizer(url = lex_en, sortmode=0, cased = True)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(expected)
print(weight,calc_f1(expected,tokens))

['Tuna', ' ', 'is', ' ', 'a', ' ', 'fish', '. ', 'Cat', ' ', 'is', ' ', 'a', ' ', 'mammal']
['Tuna', ' ', 'is', ' ', 'a', ' ', 'fish', '.', ' ', 'Cat', ' ', 'is', ' ', 'a', ' ', 'mammal']
2.923531848929005 0.9032258064516129


## Russian

In [8]:
text = "расцветалияблониигруши,поплылитуманынадрекой"
expected = ['расцветали', 'яблони', 'игруши', ',', 'поплыли', 'туманы', 'над', 'рекой']

In [9]:
lt0 = LexiconIndexedTokenizer(url = lex_ru, sortmode=0)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['расцвета', 'лия', 'бл', 'они', 'игру', 'ши', ',', 'поплыли', 'туманы', 'над', 'рекой']
4.034137991553761 0.5263157894736842


In [10]:
lt1 = LexiconIndexedTokenizer(url = lex_ru, sortmode=1)
tokens, weight = lt1.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['ра', 'с', 'цвет', 'али', 'яблони', 'игр', 'уши', ',', 'по', 'плыли', 'ту', 'ма', 'ны', 'на', 'др', 'е', 'ко', 'й']
5.526441728891956 0.15384615384615383


In [11]:
lt2 = LexiconIndexedTokenizer(url = lex_ru, sortmode=2)
tokens, weight = lt2.tokenize_weight(text)
print(tokens)
print(weight,calc_f1(expected,tokens))


['расцвета', 'ли', 'яблони', 'игру', 'ши', ',', 'поплыли', 'туман', 'ы', 'над', 'рекой']
4.027652477481268 0.5263157894736842


In [12]:
lt2.fulldict['туманы']

2.734799829588847

In [13]:
lt2.fulldict['авва']

5.053785038134658

In [14]:
text = "Расцветали яблони и груши, поплыли туманы над рекой"
expected = tokenize_split_with_delimiters_and_quotes(text)
print(expected)

['Расцветали', ' ', 'яблони', ' ', 'и', ' ', 'груши', ',', ' ', 'поплыли', ' ', 'туманы', ' ', 'над', ' ', 'рекой']


In [15]:
lt0 = LexiconIndexedTokenizer(url = lex_ru, sortmode=0, cased = True)
tokens, weight = lt0.tokenize_weight(text)
print(tokens)
print(expected)
print(weight,calc_f1(expected,tokens))

['Расцвета', 'ли', ' ', 'яблони', ' и ', 'груши', ', ', 'поплыли', ' ', 'туманы', ' ', 'над', ' ', 'рекой']
['Расцветали', ' ', 'яблони', ' ', 'и', ' ', 'груши', ',', ' ', 'поплыли', ' ', 'туманы', ' ', 'над', ' ', 'рекой']
2.667408660328346 0.6666666666666666


In [16]:
print(lt0.fulldict['расцвета'],'расцвета' in lt0.fulldict,'расцветали' in lt0.fulldict)


2.850033257689769 True False


## Chinese

In [17]:
# see https://github.com/yishn/chinese-tokenizer
# This tokenizer uses a simple greedy algorithm: It always looks for the longest word in the CC-CEDICT dictionary that matches the input, one at a time.
import jieba


In [18]:
path = '../../nlp/corpora/Chinese/'
cld_df = pd.read_csv(os.path.join(path,'lexicon/chineselexicaldatabase2.1.txt'))
wordlist = list(cld_df['Word'])
print(len(wordlist))
print(wordlist[:5])

  exec(code_obj, self.user_global_ns, self.user_ns)


48644
['中东', '马队', '门徒', '申讨', '曲']


In [19]:
zhlt0 = LexiconIndexedTokenizer(lexicon = wordlist)

In [20]:
zhlt0.tokenize('中东马队门徒申讨曲')

['中东', '马队', '门徒', '申讨', '曲']

In [21]:
#'Dogs, cats, mice and pigs are all mammals'
expected = [r[0] for r in jieba.tokenize('狗、猫、老鼠和猪都是哺乳动物')]
tokens = zhlt0.tokenize('狗、猫、老鼠和猪都是哺乳动物')
print(expected)
print(tokens)
print(calc_f1(expected,tokens))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/lb/1m7gbdp17h578qq48pbbtxf40000gn/T/jieba.cache
Loading model cost 0.945 seconds.
Prefix dict has been built successfully.


['狗', '、', '猫', '、', '老鼠', '和', '猪', '都', '是', '哺乳动物']
['狗', '、', '猫', '、', '老鼠', '和', '猪', '都', '是', '哺乳', '动物']
0.8571428571428572


In [22]:
#'Цвели яблони и груши, над рекой плыл туман.'
expected = [r[0] for r in jieba.tokenize('苹果树和梨树开花，雾气飘过河面。')]
tokens = zhlt0.tokenize('苹果树和梨树开花，雾气飘过河面。')
print(expected)
print(tokens)
print(calc_f1(expected,tokens))

['苹果树', '和', '梨树', '开花', '，', '雾气', '飘过', '河面', '。']
['苹果树', '和', '梨树', '开花', '，', '雾气', '飘', '过', '河面', '。']
0.8421052631578948


In [23]:
#TODO compute score with account to number of letters in token AND / OR log of frequency?
#TODO build alternative graphs and score them (by SOME scoring function)!?

