In [1]:
# Notebook-wide constants.
n_classes = 60    # size of the sigmoid output layer (columns of y)
n_chars = 35      # number of one-hot categories per character position
max_len = 37      # length every input sequence is padded to
n_val = 100000    # number of rows held out for validation

In [2]:
import numpy as np 

import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers import Dropout, Input, Dense, CuDNNLSTM, Bidirectional, concatenate, Reshape
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import ModelCheckpoint

import gc
import pickle 
import os 

In [3]:
# Load the pickled inputs (X) and targets (y) from the working directory.
with open('X.pkl', 'rb') as f:
    X = pickle.load(f)

with open('y.pkl', 'rb') as f:
    y = pickle.load(f)

# There must be exactly one target row per input sample.
assert y.shape[0] == len(X)
y.shape

(3039624, 60)

In [4]:
# --- padding: bring every sequence to length max_len ---
print("padding..")
X = pad_sequences(X, maxlen=max_len)

# --- shuffle: one seeded permutation, applied to X and y alike ---
print("shuffling..")
np.random.seed(100)
idx = np.arange(X.shape[0])
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# --- split: the last n_val shuffled rows form the validation set ---
print("splitting..")
train_rows = slice(None, -n_val)
valid_rows = slice(-n_val, None)
X_train, y_train = X[train_rows], y[train_rows]
X_valid, y_valid = X[valid_rows], y[valid_rows]

# Only the validation inputs are one-hot encoded here; the training inputs
# stay as integer ids at this point.
X_valid = to_categorical(X_valid, num_classes=n_chars)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

padding..
shuffling..
splitting..


((2939624, 37), (100000, 37, 35), (2939624, 60), (100000, 60))

In [5]:
# input -> LSTM layer -> Dropout -> FC layer /sigmoid activation/ (multilabel)

def simple_lstm_model():
    """Build and compile a single-layer LSTM multilabel classifier.

    Architecture: one-hot input of shape (max_len, n_chars) -> CuDNNLSTM(256)
    -> Dropout(0.2) -> Dense(n_classes) with sigmoid activation.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam with
        lr=1e-3, accuracy metric).
    """
    inputs = Input(shape=(max_len, n_chars))

    hidden = CuDNNLSTM(256)(inputs)
    hidden = Dropout(0.2)(hidden)

    # one independent sigmoid per label
    outputs = Dense(n_classes, activation="sigmoid")(hidden)

    net = Model(inputs, outputs)
    net.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=1e-3),
        metrics=['accuracy'],
    )
    return net

def bidir_lstm_model():
    """Build and compile a bidirectional-LSTM multilabel classifier.

    Architecture: one-hot input of shape (max_len, n_chars) ->
    Bidirectional(CuDNNLSTM(128)) -> Dropout(0.2) -> Dense(n_classes) with
    sigmoid activation.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam with
        lr=1e-3, accuracy metric).
    """
    inputs = Input(shape=(max_len, n_chars))

    hidden = Bidirectional(CuDNNLSTM(128))(inputs)
    hidden = Dropout(0.2)(hidden)

    # one independent sigmoid per label
    outputs = Dense(n_classes, activation="sigmoid")(hidden)

    net = Model(inputs, outputs)
    net.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=1e-3),
        metrics=['accuracy'],
    )
    return net

def bidir_lstm_conv_model():
    """Build and compile a BiLSTM + Conv1D + global-pooling multilabel classifier.

    Architecture: one-hot input of shape (max_len, n_chars) ->
    Bidirectional(CuDNNLSTM(128, return_sequences=True)) -> Dropout(0.2) ->
    two parallel Conv1D(64) branches (kernel sizes 2 and 3), concatenated ->
    Dropout(0.2) -> global average and global max pooling, concatenated ->
    Dropout(0.2) -> Dense(n_classes) with sigmoid activation.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam with
        lr=1e-3, accuracy metric).
    """
    inputs = Input(shape=(max_len, n_chars))

    # Recurrent encoder; return_sequences=True keeps the per-timestep outputs
    # that the convolutions below operate on.
    seq = Bidirectional(CuDNNLSTM(128, return_sequences=True))(inputs)
    seq = Dropout(0.2)(seq)

    # Parallel convolutions over the same sequence, built in the order
    # kernel size 2, then kernel size 3.
    conv_branches = [
        Conv1D(64, kernel_size=width, padding="same", kernel_initializer="glorot_uniform")(seq)
        for width in (2, 3)
    ]
    conv_out = concatenate(conv_branches)
    conv_out = Dropout(0.2)(conv_out)

    # Collapse the time axis with average and max pooling, then join both.
    pooled_avg = GlobalAveragePooling1D()(conv_out)
    pooled_max = GlobalMaxPooling1D()(conv_out)
    pooled = concatenate([pooled_avg, pooled_max])
    pooled = Dropout(0.2)(pooled)

    # one independent sigmoid per label
    outputs = Dense(n_classes, activation="sigmoid")(pooled)

    net = Model(inputs, outputs)
    net.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=1e-3),
        metrics=['accuracy'],
    )
    return net


In [6]:
def batch_generator(X, y, batch_size=100000):
    """Yield one-hot encoded chunks of ``X`` together with the matching rows of ``y``.

    Only one chunk of at most ``batch_size`` rows is one-hot encoded at a time,
    so the fully encoded training set never has to exist in memory at once.

    Args:
        X: array-like with at least two dimensions, indexed as ``X[rows, :]``;
            its values are passed to ``to_categorical`` with
            ``num_classes=n_chars`` (presumably integer character ids).
        y: array-like of targets with the same number of rows as ``X``.
        batch_size: maximum number of rows per yielded chunk.

    Yields:
        ``(X_chunk, y_chunk, end)`` where ``X_chunk`` is the one-hot encoding
        of rows ``start:start+batch_size`` of ``X``, ``y_chunk`` holds the same
        rows of ``y``, and ``end`` is ``start + batch_size``. ``end`` is NOT
        clamped to the number of rows, so for the last chunk it may exceed it.

    Raises:
        ValueError: if ``X`` and ``y`` have a different number of rows. Because
            this is a generator, the error surfaces on the first iteration.
    """
    # np.asarray (unlike np.array) does not duplicate inputs that already are
    # ndarrays, which matters for multi-million-row training sets.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    # Use the converted array for the row count so plain lists work as well.
    n_rows = X_arr.shape[0]
    if y_arr.shape[0] != n_rows:
        raise ValueError(
            'X and y must have the same number of rows, got {} and {}'.format(
                n_rows, y_arr.shape[0]))

    for start in range(0, n_rows, batch_size):
        end = start + batch_size
        yield to_categorical(X_arr[start:end, :], num_classes=n_chars), y_arr[start:end], end

In [7]:
# Drop the names of the unsplit arrays, then run a garbage-collection pass.
del X
del y
gc.collect()

237

In [26]:
# Directory that receives the model checkpoint files.
MODEL_DIR = './model/'
# exist_ok=True replaces the check-then-create pair: it is safe to re-run and
# cannot fail if the directory appears between the check and the mkdir.
os.makedirs(MODEL_DIR, exist_ok=True)

In [96]:
# Training configuration.
batch_size = 128       # mini-batch size handed to model.fit
epochs = 1             # number of passes over X_train
chunk_size = 300000    # rows that batch_generator one-hot encodes at a time

# Pick one architecture (the alternatives are kept for quick switching).
# model = bidir_lstm_conv_model()
# model = simple_lstm_model()
model = bidir_lstm_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

for epoch in range(epochs):
    print('*'*10, 'EPOCH {}/{}'.format(epoch+1, epochs), '*'*10)

    # The training inputs are one-hot encoded chunk by chunk; each chunk is
    # fitted for one Keras epoch and validated against the full validation set.
    for X_batch, y_batch, i in batch_generator(X_train, y_train, chunk_size):
        # i is the unclamped end offset of the chunk, hence the min() for display.
        print('*'*5, 'BATCH {}/{}'.format(min(i, X_train.shape[0]),
                                          X_train.shape[0]))

        # One checkpoint file per chunk. The file name uses the unclamped
        # offset i, so the last file of an epoch is named after the rounded-up
        # row count (e.g. "batch3000k").
        checkpoint = ModelCheckpoint(os.path.join(MODEL_DIR,
                                                  "model-epoch{:d}-batch{:d}k.h5".format(epoch+1, i//1000)))

        model.fit(X_batch, y_batch, batch_size=batch_size, epochs=1,
                  validation_data=(X_valid, y_valid),
                  verbose=1, callbacks=[checkpoint]
                 )


In [91]:
# Lookup tables used by predict(): character -> integer id, and output
# index -> label.
with open('char2int.pkl', 'rb') as f:
    char2int = pickle.load(f)

with open('int2pos.pkl', 'rb') as f:
    int2pos = pickle.load(f)

# Restore the checkpoint that is used for the predictions below.
checkpoint_path = os.path.join(MODEL_DIR, 'model-epoch1-batch3000k.h5')
model = load_model(checkpoint_path)

def predict(word):
    """Print the three highest-scoring labels of the loaded model for ``word``.

    The word is upper-cased, mapped to integer ids through ``char2int``,
    padded to ``max_len`` with ``pad_sequences`` and one-hot encoded before it
    is passed to ``model.predict``.

    Args:
        word: the string to classify; every upper-cased character must be a
            key of ``char2int``.

    Returns:
        None. The word, the three labels (via ``int2pos``) with their scores
        formatted to two decimals, and a blank line are printed.
    """
    # Encode the word the same way the training data is fed to the model.
    char_ids = [char2int[ch] for ch in word.upper()]
    padded = pad_sequences([char_ids], maxlen=max_len)
    encoded = to_categorical(padded, num_classes=n_chars)

    # A single-sample batch goes in, so take the first (only) row of scores.
    scores = model.predict(encoded)[0]

    # Indices ordered from highest to lowest score; keep the first three.
    descending = np.argsort(scores)[::-1]
    top3 = [(int2pos[k], scores[k]) for k in descending[:3]]

    print('word:', word)
    for label, score in top3:
        print('{}: {:.2f}'.format(label, score))
    print()

In [97]:
# Spot-check the loaded model on a handful of Russian words.
sample_words = [
    'небо', 'стол', 'мебель', 'добро', 'слава', 'комсомол',
    'дятел', 'число', 'олово', 'серебро', 'делать', 'думал', 'открывал',
    'мышь', 'слышь', 'крыша', 'мастер', 'крипта', 'валюта', 'солнце', 'тролль',
]
for w in sample_words:
    predict(w)

In [None]:
### ideas: 
# validate only on short words
# validate only on most frequent words
# train only on short words
# cut down the list of POS tags by merging them into a few `main groups`