In [1]:
# Simple character-level RNN for POS tagging of Russian words

# Data source:
# http://opencorpora.org/files/export/dict/dict.opcorpora.txt.zip

In [1]:
import numpy as np 
import pandas as pd 
# import keras

In [2]:
# Parse the dictionary export into two parallel lists:
#   words[i] - the first tab-separated field of a line (the word form)
#   pos[i]   - the text before the first comma of the second field, stripped
# Lines without a tab are skipped, as are entries whose word form contains
# an apostrophe or a digit.
# NOTE(review): only the comma is used as a separator for the tag; if the
# second field can hold space-separated parts before its first comma, they
# stay inside the tag - TODO confirm against the file format.

# Built once instead of once per line: characters that disqualify a word form.
FORBIDDEN_CHARS = set("'’0123456789")

words, pos = [], []
with open('./dict.opcorpora.txt', encoding='utf-8') as f:
    # Iterating the file object yields one line at a time until EOF.
    for line in f:
        fields = line.split('\t')
        if len(fields) > 1 and FORBIDDEN_CHARS.isdisjoint(fields[0]):
            words.append(fields[0])
            pos.append(fields[1].split(',')[0].strip())

In [3]:
# Flatten every word into one long list of its characters, then show the
# distinct characters that occur.
chars = []
for word in words:
    chars.extend(word)
np.unique(chars)

array(['-', 'Ё', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К',
       'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч',
       'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я'], dtype='<U1')

In [4]:
# Encode tags as integers.
# pos2int: each distinct tag -> its index in np.unique's sorted output.
pos2int = {tag: idx for idx, tag in enumerate(np.unique(pos))}

# Integer label for every entry of `pos`, in the same order.
pos_int = [pos2int[tag] for tag in pos]

# Reverse lookup: integer label -> tag.
int2pos = dict(zip(pos2int.values(), pos2int.keys()))

In [5]:
from collections import Counter

# For every distinct word form, count how often each integer label occurs.
words_x_pos = {}
for word, label in zip(words, pos_int):
    label_counts = words_x_pos.get(word)
    if label_counts is None:
        # First time this word is seen: start an empty tally for it.
        label_counts = words_x_pos[word] = Counter()
    label_counts[label] += 1

In [6]:
# One class per distinct integer label; one sample per distinct word form.
n_classes = len(np.unique(pos_int))
n_samples = len(words_x_pos)

In [7]:
# Build the target matrix (soft multi-label encoding).
# Row r belongs to the r-th key of words_x_pos; column j holds the share of
# that word's occurrences carrying label j. A word with a single label gets
# 1.0 in that column, i.e. a plain one-hot row.
y = np.zeros((n_samples, n_classes))

for row, label_counts in enumerate(words_x_pos.values()):
    total = sum(label_counts.values())
    for label, count in label_counts.items():
        y[row, label] = count / total
    # Progress marker every 500000 rows.
    if row % 500000 == 0:
        print(row)


0
500000
1000000
1500000
2000000
2500000
3000000


In [8]:
# Encode every distinct word as a sequence of integer character ids.

# Character -> id. Ids start at 1 so that 0 stays free for padding.
char2int = {ch: idx for idx, ch in enumerate(np.unique(chars), start=1)}

# Id -> character (reverse lookup).
int2char = dict(zip(char2int.values(), char2int.keys()))

# One id sequence per key of words_x_pos, in the dict's iteration order.
words_int = [[char2int[ch] for ch in word] for word in words_x_pos]

# Length of the longest encoded word (0 when there are no words).
max_len = max(map(len, words_int), default=0)

max_len, len(words_int), char2int

(37,
 3039624,
 {'-': 1,
  'Ё': 2,
  'А': 3,
  'Б': 4,
  'В': 5,
  'Г': 6,
  'Д': 7,
  'Е': 8,
  'Ж': 9,
  'З': 10,
  'И': 11,
  'Й': 12,
  'К': 13,
  'Л': 14,
  'М': 15,
  'Н': 16,
  'О': 17,
  'П': 18,
  'Р': 19,
  'С': 20,
  'Т': 21,
  'У': 22,
  'Ф': 23,
  'Х': 24,
  'Ц': 25,
  'Ч': 26,
  'Ш': 27,
  'Щ': 28,
  'Ъ': 29,
  'Ы': 30,
  'Ь': 31,
  'Э': 32,
  'Ю': 33,
  'Я': 34})

In [9]:
# Number of character ids: every mapped character plus the extra id 0.
n_chars = 1 + len(char2int)
n_chars, max_len

(35, 37)

In [14]:
import pickle 
# Persist the encoded words and the target matrix with pickle.
with open('X.pkl', 'wb') as x_file:
    pickle.dump(words_int, x_file)
with open('y.pkl', 'wb') as y_file:
    pickle.dump(y, y_file)

In [16]:
# Persist the two lookup tables (character -> id, label id -> tag) with pickle.
with open('char2int.pkl', 'wb') as char_map_file:
    pickle.dump(char2int, char_map_file)
with open('int2pos.pkl', 'wb') as label_map_file:
    pickle.dump(int2pos, label_map_file)

In [19]:
# Tally how many times each integer label occurs in pos_int.
Counter(pos_int)

Counter({0: 252803,
         1: 67326,
         2: 235682,
         3: 67336,
         4: 168345,
         5: 202025,
         6: 168340,
         7: 31313,
         8: 9176,
         9: 9187,
         10: 10270,
         11: 9185,
         12: 4066,
         13: 2,
         14: 1,
         15: 4,
         16: 1,
         17: 38515,
         18: 14278,
         19: 5,
         20: 7142,
         21: 185,
         22: 3,
         23: 4,
         24: 2,
         25: 70943,
         26: 32374,
         27: 301,
         28: 1,
         29: 4,
         30: 1415797,
         31: 243,
         32: 2,
         33: 2,
         34: 16,
         35: 13,
         36: 70,
         37: 44,
         38: 48,
         39: 9,
         40: 53,
         41: 8,
         42: 51,
         43: 11,
         44: 50,
         45: 9,
         46: 8,
         47: 53,
         48: 131,
         49: 1,
         50: 1,
         51: 3,
         52: 1,
         53: 80,
         54: 126,
         55: 1,
         56: 13