In [127]:
import os
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, LSTM, SimpleRNN
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tqdm import tqdm

In [2]:
from scipy.stats import zscore

In [3]:
def one_hot_encoding_symbol_without_masking(x, dictionary):
    vector = [0]*len(dictionary)
    vector[dictionary.index(x)] = 1
    vector = zscore(vector)
    return vector

In [4]:
def get_X_Y(words_pad, spells_pad, graphemes_dict, phonemes_dict):
    words_one_hot = []
    for word in tqdm(words_pad):
        word_one_hot = []
        for grapheme in word:
            word_one_hot.append(one_hot_encoding_symbol_without_masking(grapheme, graphemes_dict))
        words_one_hot.append(word_one_hot)
    
    spells_one_hot = []
    for spell in tqdm(spells_pad):
        spell_one_hot = []
        for phoneme in spell:
            spell_one_hot.append(one_hot_encoding_symbol_without_masking(phoneme, phonemes_dict))
        spells_one_hot.append(spell_one_hot)
    return words_one_hot, spells_one_hot

In [5]:
with open('data/ru.txt', 'r') as f:
    lines = f.readlines()
    lines = [line.strip() for line in lines if not '(' in line]
    lines = lines[9:]

words_pad = []
words_len = []
spells_pad = []
spells_len = []
for line in tqdm(lines):
    chunks = line.split()
    graphs = list(chunks[0])
    phones = chunks[1:]
    if len(graphs) > 8 or len(phones) > 8:
        continue
    words_pad.append(graphs)
    spells_pad.append(phones)
    spells_len.append(len(phones))
    
# words_pad = pad_sequences(words_pad, value='#', maxlen=8, dtype=object)
# spells_pad = pad_sequences(spells_pad, value='#', maxlen=8, dtype=object)

with open('data/phonemes.txt', 'r') as f:
    lines_1 = f.readlines()
    phonemes_dict = [line.strip() for line in lines_1]

with open('data/graphemes.txt', 'r') as f:
    lines_2 = f.readlines()
    graphemes_dict = [line.strip() for line in lines_2]
    
    
X, Y = get_X_Y(words_pad, spells_pad, graphemes_dict, phonemes_dict)
# X = [np.array(matrix) for matrix in X]
# X = [np.expand_dims(matrix, axis=0) for matrix in X]
# X = np.row_stack((X))
# Y = [np.array(matrix) for matrix in Y]
# Y = [np.expand_dims(matrix, axis=0) for matrix in Y]
# Y = np.row_stack((Y))
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=27)
# train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))

100%|██████████| 533911/533911 [00:01<00:00, 335860.03it/s]
100%|██████████| 253403/253403 [01:53<00:00, 2224.37it/s]
100%|██████████| 253403/253403 [01:57<00:00, 2158.66it/s]


In [6]:
X = [np.array(matrix) for matrix in X]
X = [matrix.transpose().dot(matrix) for matrix in X]
X = [np.expand_dims(matrix, axis=0) for matrix in X]
X = np.row_stack((X))

In [7]:
X = pad_sequences(X, maxlen=49)

In [8]:
Y = [np.array(matrix) for matrix in Y]
Y = [matrix.transpose().dot(matrix) for matrix in Y]
Y = [np.expand_dims(matrix, axis=0) for matrix in Y]
Y = np.row_stack((Y))

In [75]:
Y.shape

(253403, 49, 49)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=27)

In [25]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))

In [11]:
early_stopping=EarlyStopping(monitor='loss', patience=3)

In [12]:
batch_size = 128
train_dataset = train_dataset.batch(batch_size)

In [13]:
train_dataset

<BatchDataset shapes: ((None, 49, 35), (None, 49, 49)), types: (tf.int32, tf.float64)>

In [14]:
inputs = Input((49, 35))
rnn = tf.keras.layers.SimpleRNN(128, activation='tanh', return_sequences=True, input_shape=(8, 35))
x = rnn(inputs)
dense = Dense(49, activation='softmax',
             kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01, seed=42),
             bias_initializer=tf.keras.initializers.Zeros())
outputs = dense(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')

In [16]:
histoty = model.fit(train_dataset, epochs=10, verbose=1, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
ds = tf.data.Dataset([[1, 2], [1, 2, 3], [1, 2, 6, 7]])

TypeError: Can't instantiate abstract class DatasetV2 with abstract methods _inputs, element_spec

In [92]:
dataset = tf.data.Dataset.from_tensors(([1, 2, 3], [2, 3]))

In [93]:
for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>)


In [94]:
dataset = dataset.padded_batch(3)

In [96]:
for i in dataset:
    print(i)

(<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 2, 3]], dtype=int32)>, <tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[2, 3]], dtype=int32)>)


In [None]:
def gen():
    ragged_tensor = tf.ragged.constant([[1, 2], [3]])
    yield 42, ragged_tensor

In [73]:
dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))

TypeError: in user code:


    TypeError: <lambda>() takes 1 positional argument but 2 were given


In [None]:
dataset = dataset.shuffle(100)

In [74]:
for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>)


In [75]:
for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>)


In [76]:
for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>)


In [77]:
dataset = dataset.padded_batch(4, padded_shapes=(None,))

ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2.

In [None]:
dataset = dataset.shuffle(100)

In [78]:
for i in dataset:
    print(i)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3], dtype=int32)>, <tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>)


In [79]:
a =  dataset.take(4)

In [69]:
for i in a:
    print(i)

tf.Tensor(
[[ 8  8  8  8  8  8  8  8  0  0  0]
 [ 9  9  9  9  9  9  9  9  9  0  0]
 [10 10 10 10 10 10 10 10 10 10  0]
 [11 11 11 11 11 11 11 11 11 11 11]], shape=(4, 11), dtype=int64)
tf.Tensor(
[[20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20  0  0  0]
 [21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21  0  0]
 [22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22  0]
 [23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23]], shape=(4, 23), dtype=int64)
tf.Tensor(
[[60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60
  60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60
  60 60 60 60 60 60 60 60 60 60 60 60  0  0  0]
 [61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61
  61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61 61
  61 61 61 61 61 61 61 61 61 61 61 61 61  0  0]
 [62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62 62
  62 62 6

In [124]:
def gen():
    ragged_tensor = tf.ragged.constant([[1, 2], [3]])
    yield 42, ragged_tensor

dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.RaggedTensor), (tf.TensorShape([2]), (tf.TensorShape([1]))) )

TypeError: Cannot convert value <class 'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor'> to a TensorFlow DType.

In [114]:
for i in dataset.take(2):
    print(i)

InvalidArgumentError: TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was <tf.RaggedTensor [[1, 2], [3]]>.
TypeError: int() argument must be a string, a bytes-like object or a number, not 'RaggedTensor'


The above exception was the direct cause of the following exception:


Traceback (most recent call last):

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 801, in generator_py_func
    ret, dtype=dtype.as_numpy_dtype))

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/script_ops.py", line 203, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/usr/local/lib/python3.6/dist-packages/numpy/core/_asarray.py", line 85, in asarray
    return array(a, dtype, copy=False, order=order)

ValueError: setting an array element with a sequence.


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/script_ops.py", line 243, in __call__
    ret = func(*args)

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 309, in wrapper
    return func(*args, **kwargs)

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 806, in generator_py_func
    "element was %s." % (dtype.name, ret)), sys.exc_info()[2])

  File "/usr/local/lib/python3.6/dist-packages/six.py", line 702, in reraise
    raise value.with_traceback(tb)

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 801, in generator_py_func
    ret, dtype=dtype.as_numpy_dtype))

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/script_ops.py", line 203, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/usr/local/lib/python3.6/dist-packages/numpy/core/_asarray.py", line 85, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int32, but the yielded element was <tf.RaggedTensor [[1, 2], [3]]>.


	 [[{{node PyFunc}}]]

In [117]:
!pip3 list

Package                Version
---------------------- ---------------
absl-py                0.9.0
asn1crypto             0.24.0
astunparse             1.6.3
attrs                  19.3.0
backcall               0.1.0
bleach                 3.1.5
cachetools             4.1.0
certifi                2020.4.5.1
chardet                3.0.4
cryptography           2.1.4
cycler                 0.10.0
decorator              4.4.2
defusedxml             0.6.0
editdistance           0.5.3
entrypoints            0.3
gast                   0.3.3
google-auth            1.14.2
google-auth-oauthlib   0.4.1
google-pasta           0.2.0
grpcio                 1.28.1
h5py                   2.10.0
idna                   2.6
importlib-metadata     1.6.0
ipykernel              5.1.1
ipython                7.14.0
ipython-genutils       0.2.0
ipywidgets             7.5.1
jedi                   0.17.0
Jinja2                 2.11.2
joblib                 1.0.1
jsonschema             3.2.0
jupyter              

In [122]:
!pip3 uninstall tensorflow-gpu

Found existing installation: tensorflow-gpu 2.2.0
Uninstalling tensorflow-gpu-2.2.0:
  Would remove:
    /usr/local/bin/estimator_ckpt_converter
    /usr/local/bin/saved_model_cli
    /usr/local/bin/tensorboard
    /usr/local/bin/tf_upgrade_v2
    /usr/local/bin/tflite_convert
    /usr/local/bin/toco
    /usr/local/bin/toco_from_protos
    /usr/local/lib/python3.6/dist-packages/tensorflow/*
    /usr/local/lib/python3.6/dist-packages/tensorflow_gpu-2.2.0.dist-info/*
Proceed (y/n)? ^C
[31mERROR: Operation cancelled by user[0m


In [123]:
!y

UnboundLocalError: local variable 'child' referenced before assignment

In [128]:
!pip3 list

Package                 Version
----------------------- -------------------
absl-py                 0.13.0
asn1crypto              0.24.0
astunparse              1.6.3
attrs                   19.3.0
backcall                0.1.0
bleach                  3.1.5
cached-property         1.5.2
cachetools              4.1.0
certifi                 2020.4.5.1
chardet                 3.0.4
cryptography            2.1.4
cycler                  0.10.0
decorator               4.4.2
defusedxml              0.6.0
editdistance            0.5.3
entrypoints             0.3
flatbuffers             1.12
gast                    0.4.0
google-auth             1.14.2
google-auth-oauthlib    0.4.1
google-pasta            0.2.0
grpcio                  1.34.1
h5py                    3.1.0
idna                    2.6
importlib-metadata      1.6.0
ipykernel               5.1.1
ipython                 7.14.0
ipython-genutils        0.2.0
ipywidgets              7.5.1
jedi                    0.17.0
Jinja2          

In [None]:
t