<a href="https://colab.research.google.com/github/aithaprasad/Bantu_Language_Modeling/blob/main/Bantu_Language_Anything_Goes_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow import keras
from keras import layers
import numpy as np
import random
import io


In [2]:
# Load the training corpus, lower-cased, with newlines flattened to spaces
# so that printed samples stay on a single line.
path = "cwe-train.txt"
with io.open(path, encoding="utf-8") as f:
    text = f.read().lower().replace("\n", " ")
print("Corpus length:", len(text))

# Character vocabulary (sorted for a stable ordering) and the lookup tables
# that map each character to its index and back.
chars = sorted(set(text))
print("Total chars:", len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Slice the corpus into overlapping windows of `maxlen` characters, taken
# every `step` characters; each window is paired with the single character
# that immediately follows it (the prediction target).
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start : start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print("Number of sequences:", len(sentences))

# One-hot encode the training data:
#   x[i, t, k] is True when character t of window i is chars[k];
#   y[i, k]    is True when the character following window i is chars[k].
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24, where it raises
# AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True



Corpus length: 603432
Total chars: 31
Number of sequences: 201131


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [3]:
# Character-level model: a one-hot window of `maxlen` characters goes through
# a 128-unit LSTM, then a softmax layer with one output per vocabulary entry.
vocab_size = len(chars)
model_layers = [
    keras.Input(shape=(maxlen, vocab_size)),
    layers.LSTM(128),
    layers.Dense(vocab_size, activation="softmax"),
]
model = keras.Sequential(model_layers)

# Targets are one-hot rows, hence categorical cross-entropy.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")


In [4]:

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn with
    ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output row).
        temperature: Scaling factor. Values below 1 sharpen the distribution,
            values above 1 flatten it. ``0`` selects the most probable index
            deterministically instead of dividing by zero.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype("float64")

    # temperature == 0 is the greedy limit; dividing by it would produce NaNs.
    if temperature == 0:
        return np.argmax(preds)

    # Clip before taking the log so exact zeros do not trigger a
    # divide-by-zero RuntimeWarning / -inf.
    floor = np.finfo("float64").tiny
    logits = np.log(np.clip(preds, floor, None)) / temperature

    # Shift so the largest logit is 0: the softmax is unchanged, but exp()
    # can no longer underflow every entry to 0 (which gave 0/0 = NaN for
    # very small temperatures).
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)



In [8]:
epochs = 40
batch_size = 128


def _one_hot_window(window):
    """Encode a string window as a (1, maxlen, len(chars)) one-hot float array."""
    encoded = np.zeros((1, maxlen, len(chars)))
    for pos, ch in enumerate(window):
        encoded[0, pos, char_indices[ch]] = 1.0
    return encoded


# Train one epoch at a time; after each epoch print 400 generated characters
# per diversity value, all starting from the same randomly chosen seed window.
for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index : start_index + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)
        print('...Generating with seed: "' + seed_text + '"')

        # Sliding window: drop the oldest character, append the sampled one.
        window = seed_text
        pieces = []
        for _ in range(400):
            probs = model.predict(_one_hot_window(window), verbose=0)[0]
            picked = indices_char[sample(probs, diversity)]
            window = window[1:] + picked
            pieces.append(picked)
        generated = "".join(pieces)

        print("...Generated: ", generated)
        print()



Generating text after epoch: 0
...Diversity: 0.2
...Generating with seed: "wanduso, wezakuwa wa kuuhelelo, na wengi"
...Generated:   wakala wamulongela, "mweye mulumuliza yamulemile kuwa mulungu kwa mulungu kwa ichimu cha mulungu kwa udaho wa mulungu na kuwalongela, "muna yamaandiko yelile ya mulungu yoyose yoyatangigwe na mulungu, kamulongela, "muna ingalawa ya mulungu yoyamwing'hile musa kamulongela, "niye nimwenga wana wanhu wa mulungu kwa kuwalongela, "mweye muna mulungu kamulongela, "niye nimwenga wanhu wa mulungu na wanh

...Diversity: 0.5
...Generating with seed: "wanduso, wezakuwa wa kuuhelelo, na wengi"
...Generated:  la wa kulonga na kuwa mulungu mwiimago yoyose muna yamaandiko yelile. maabaho yesu kamulongela, "mwenevale wamwenga kwizakwina muna zikaye zake ya zuwa na malagilizo ya mulungu yoyose muna yamaandiko yelile yolonga, "muna hamwe na wanhu wamwenga wamwenga wawo wenyewo, iviya kakala kawalongela, "nizakwina muna imizoyo yetu yoyamwing'hileni munhu iyo, na mulungu k

  after removing the cwd from sys.path.


...Generated:   vingi ng'hani ichitada cha munhu vinyamkela chayahende. viwahulike ivo chinhu icho chinyandiza bule, yudeke kugola kasigola umanyi, namwenga ungilwe wawamulongela, "muna dipulihi kwa kulonda kudumba, walangulizi wangu chinhu mwenyewo ina niza wenyewo, kwaviya habule sita, na ndagilizi yake, paulo viyeligwe mweye wasamha ndaliya ino." sekemwize yamo kolongela muna kupeta bilauba, mbali chila munhu

...Diversity: 1.2
...Generating with seed: "wanduso, wezakuwa wa kuuhelelo, na wengi"
...Generated:   mwiilola yoyamuwono wailema kuwa kwa kwiitanya hebu zangu, na ndugu yangu mwenyewo yana na imwe kaching'hani kwaviya na kugatiigwa. mbali icho yoike, wahulika dizi haizumwe liyahe, milagilizo chigoda ubazi. yesu kasoligwa kwa ija haja kumbeudigwa. woese, mbali chonisele munhu yoyanahisa vili wolagabu uhasanyeni fana m'hendigwe yano ya lukuli. viwakogelele wwoni, kamvunha wali na kuhila ulondo wa 


Generating text after epoch: 1
...Diversity: 0.2
...Generating with seed: "hano

In [10]:
# Write the model to "cwe_lstm_model.h5"; the path is relative, so the file
# lands in the kernel's current working directory.
model.save("cwe_lstm_model.h5")