In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [4]:
text = open('khayyam.txt', 'rb').read().decode(encoding='utf-8')

In [5]:
text[:10]

'|برخیز بتا'

In [6]:
vocabolaries = sorted(set(text))

In [7]:
vocabolaries

['\n',
 ' ',
 '!',
 '|',
 '؟',
 'آ',
 'ئ',
 'ا',
 'ب',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'ٌ',
 'َ',
 'ّ',
 'ٔ',
 'پ',
 'چ',
 'ژ',
 'ک',
 'گ',
 'ی']

In [8]:
len(vocabolaries)

43

In [9]:
char2index = {u:i for i, u in enumerate(vocabolaries)}
index2char = np.array(vocabolaries)

In [10]:
char2index

{'\n': 0,
 ' ': 1,
 '!': 2,
 '|': 3,
 '؟': 4,
 'آ': 5,
 'ئ': 6,
 'ا': 7,
 'ب': 8,
 'ت': 9,
 'ث': 10,
 'ج': 11,
 'ح': 12,
 'خ': 13,
 'د': 14,
 'ذ': 15,
 'ر': 16,
 'ز': 17,
 'س': 18,
 'ش': 19,
 'ص': 20,
 'ض': 21,
 'ط': 22,
 'ظ': 23,
 'ع': 24,
 'غ': 25,
 'ف': 26,
 'ق': 27,
 'ل': 28,
 'م': 29,
 'ن': 30,
 'ه': 31,
 'و': 32,
 'ٌ': 33,
 'َ': 34,
 'ّ': 35,
 'ٔ': 36,
 'پ': 37,
 'چ': 38,
 'ژ': 39,
 'ک': 40,
 'گ': 41,
 'ی': 42}

In [13]:
index2char[1]

' '

In [14]:
text_as_integer = np.array([char2index[c] for c in text])

In [15]:
text_as_integer

array([ 3,  8, 16, ..., 14, 42,  0])

In [16]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_integer)

In [17]:
char_dataset

<TensorSliceDataset shapes: (), types: tf.int64>

In [19]:
for i in char_dataset.take(10):
    print(index2char[i.numpy()])

|
ب
ر
خ
ی
ز
 
ب
ت
ا


In [20]:
sequences = char_dataset.batch(30, drop_remainder=True)
sequences

<BatchDataset shapes: (30,), types: tf.int64>

In [26]:
for i in sequences.take(3):
    print('--->', ''.join(index2char[i.numpy()]))

---> |برخیز بتا بیا ز بهر دل ما
|حل
--->  کن به جمال خویشتن مشکل ما
|یک
--->  کوزه شراب تا به هم نوش کنیم
|


In [27]:
def sit(batch):
    input_text = batch[:-1]
    target_text = batch[1:]
    return input_text, target_text
dataset = sequences.map(sit)

In [28]:
dataset

<MapDataset shapes: ((29,), (29,)), types: (tf.int64, tf.int64)>

In [33]:
for i in dataset.take(1):
    print(''.join(index2char[i[0].numpy()]))
    print(''.join(index2char[i[1].numpy()]))

|برخیز بتا بیا ز بهر دل ما
|ح
برخیز بتا بیا ز بهر دل ما
|حل


In [34]:
dataset = dataset.batch(64, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 29), (64, 29)), types: (tf.int64, tf.int64)>

In [36]:
len(vocabolaries)

43

In [35]:
vocabolary_size = len(vocabolaries)
embedding_dim = 25
rnn_units = 1024

In [56]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabolary_size, 25),
    tf.keras.layers.GRU(1024, return_sequences=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [60]:
for input_text, target_text in dataset.take(1):
    output = model.predict(input_text)
    print(output[0])

[[ 0.00324541  0.00037203  0.00050542 ... -0.00116422 -0.00274155
  -0.00118266]
 [ 0.00057866  0.00245944 -0.00120837 ... -0.0041001   0.00017226
   0.0002535 ]
 [ 0.00357819  0.00165197 -0.00034095 ... -0.00157473  0.00075142
   0.00054015]
 ...
 [ 0.00076378  0.00396095 -0.00125311 ... -0.00216107 -0.00683515
   0.00787864]
 [ 0.00426875  0.00190863 -0.00049967 ... -0.00298239 -0.00644784
   0.00304987]
 [ 0.00447485  0.00141166 -0.00310612 ... -0.00215425 -0.00849032
   0.00453205]]


In [68]:
si = tf.random.categorical(output[0], num_samples=1)
si

<tf.Tensor: shape=(29, 1), dtype=int64, numpy=
array([[35],
       [13],
       [25],
       [17],
       [33],
       [19],
       [31],
       [ 9],
       [ 8],
       [38],
       [11],
       [30],
       [ 0],
       [ 5],
       [28],
       [25],
       [21],
       [32],
       [40],
       [40],
       [13],
       [41],
       [42],
       [ 2],
       [27],
       [18],
       [33],
       [40],
       [ 9]])>

In [69]:
tf.squeeze(si, axis=-1).numpy()

array([35, 13, 25, 17, 33, 19, 31,  9,  8, 38, 11, 30,  0,  5, 28, 25, 21,
       32, 40, 40, 13, 41, 42,  2, 27, 18, 33, 40,  9])

In [71]:
''.join(index2char[tf.squeeze(si, axis=-1).numpy()])

'ّخغزٌشهتبچجن\nآلغضوککخگی!قسٌکت'

In [67]:
output[0][0]

array([ 0.00324541,  0.00037203,  0.00050542,  0.00300855, -0.00345808,
       -0.00229521, -0.00302488,  0.00191844,  0.00100038, -0.00284826,
        0.00181447,  0.00453737, -0.00284061, -0.00724095,  0.00240566,
        0.0015421 , -0.00342499, -0.00117551, -0.00217069, -0.00056876,
        0.00070123, -0.00347687,  0.00488681,  0.0011458 ,  0.00454585,
       -0.0037517 , -0.00046795, -0.00093804,  0.00025666, -0.00147039,
       -0.00200066, -0.00393228,  0.00171278, -0.00298175,  0.00078911,
        0.00062172,  0.00075701,  0.00108924,  0.00141302, -0.00103579,
       -0.00116422, -0.00274155, -0.00118266], dtype=float32)

In [61]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 25)          1075      
_________________________________________________________________
gru_9 (GRU)                  (None, None, 1024)        3228672   
_________________________________________________________________
dense_9 (Dense)              (None, None, 43)          44075     
Total params: 3,273,822
Trainable params: 3,273,822
Non-trainable params: 0
_________________________________________________________________


In [75]:
def loss_f(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
model.compile(optimizer='adam', loss=loss_f)

In [76]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='khayyam/checkpoints', save_weights_only=True)

In [77]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [80]:
tf.train.latest_checkpoint('')

'checkpoints'

In [85]:
model_2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabolary_size, 25),
    tf.keras.layers.GRU(1024, return_sequences=True),
    tf.keras.layers.Dense(vocabolary_size)
])

In [87]:
model_2.load_weights(tf.train.latest_checkpoint(''))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f27e3335c90>

In [89]:
model_2.build(tf.TensorShape([1, None]))

In [90]:
model_2.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 25)          1075      
_________________________________________________________________
gru_12 (GRU)                 (None, None, 1024)        3228672   
_________________________________________________________________
dense_12 (Dense)             (None, None, 43)          44075     
Total params: 3,273,822
Trainable params: 3,273,822
Non-trainable params: 0
_________________________________________________________________


In [165]:
num_generate = 1000
first_string = 'به نام خداوند جان و خرد'
input_eval = [char2index[s] for s in first_string]
input_eval = tf.expand_dims(input_eval, 0)
input_eval

<tf.Tensor: shape=(1, 23), dtype=int32, numpy=
array([[ 8, 31,  1, 30,  7, 29,  1, 13, 14,  7, 32, 30, 14,  1, 11,  7,
        30,  1, 32,  1, 13, 16, 14]], dtype=int32)>

In [166]:
model_2.reset_states()

In [172]:
text_generated = []
for i in range(10):
    predictions = model_2.predict(input_eval)
    predictions = tf.squeeze(predictions, 0)
    predicted_ids = tf.random.categorical(predictions, num_samples=1).numpy()
    input_eval = tf.expand_dims(tf.squeeze(predicted_ids, axis=-1).numpy(), 0).numpy()
    text_generated.append(index2char[tf.squeeze(predicted_ids, axis=-1).numpy()])


In [182]:
for i in text_generated:
    print(''.join(i))

عکنتخبک ر| دهتنن  رن نن
لشن هگره گ
مٔممباشد ت||
جدی
 سا مااش
    ببننبف
ف
 امراخین
 وبکلدوه
دان
سمچ  یی|توو
دخ وا ویزم 
 ید!حتز|سرازسالاخرن| زغ
آ
 ت   |تنی اا نسگمختهم
بواوگرنو ر|اهمحر  نرا |
شد ن س
دبووب   ش افنچود
ظآشلع
  و ه ت|خبدف  ش| 
