Code adapted from the Keras example [Character-level text generation with LSTM](https://keras.io/examples/generative/lstm_character_level_text_generation/).
  
Data: the [Astronomer's Telegram dataset](https://www.kaggle.com/namanj27/astronomers-telegram-dataset?select=Processed_Atels.csv) on Kaggle (file `Processed_Atels.csv`).

# Setup

In [2]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io
import pandas as pd

# Prepare the Data
- upload `Processed_Atels.csv` to runtime 

In [4]:
# Load the telegram CSV; the relative path means the file must already be in the runtime's working directory.
data = pd.read_csv("Processed_Atels.csv")

In [13]:
# Build one training corpus by concatenating every processed telegram.
text_processed = data["Text processed"]
# Join once instead of repeated `+=`; skip missing rows so a NaN cell cannot
# raise TypeError during concatenation.
text = "".join(text_processed.dropna().astype(str))

print("Corpus length:", len(text))
# Vocabulary: every distinct character in the corpus, in sorted order so the
# char <-> index mappings are reproducible between runs.
chars = sorted(set(text))
print("Total chars:", len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Cut the corpus into overlapping windows of `maxlen` characters, taking a new
# window every `step` characters; the character right after each window is the
# prediction target.
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])  # input window of maxlen chars
    next_chars.append(text[i + maxlen])  # the char that follows the window
print("Number of sequences:", len(sentences))

# One-hot encode inputs (sequence, position, char) and targets (sequence, char).
# Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

Corpus length: 1940271
Total chars: 108
Number of sequences: 646744


* `text` is a single string holding all the characters from the data. `text[0]` = `W`, `text[:5]` = `We re`
* `sentences` is a list of sentences of length `maxlen` from data `text`, incremented by `step` 
    * `sentences[0]` : We report spectroscopic observations of | `next_chars[0]` : A
    * `sentences[1]` : report spectroscopic observations of AT2 | `next_chars[1]` : 0
    * `sentences[2]` : ort spectroscopic observations of AT2018 | `next_chars[2]` : 1
* `x.shape` (646744, 40, 108)
* `y.shape` (646744, 108)

# Build the model: a single LSTM layer

In [26]:
# Character-level model: a one-hot window of `maxlen` characters goes through a
# single LSTM layer, then a softmax over the vocabulary predicts the next char.
model = keras.Sequential()
model.add(keras.Input(shape=(maxlen, len(chars))))
model.add(layers.LSTM(128))
model.add(layers.Dense(len(chars), activation="softmax"))

# Targets are one-hot vectors, hence categorical cross-entropy.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")

In [27]:
# Echo the model object; this shows only its repr, not the layer structure.
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f860ae575c0>

# Prepare the text sampling function

In [28]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    re-normalised, then a single index is drawn from the result. Temperatures
    below 1 sharpen the distribution, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor. ``0`` selects the most probable index
            deterministically instead of dividing by zero.

    Returns:
        The sampled index (NumPy integer), usable as a key into the
        index -> character mapping.
    """
    preds = np.asarray(preds).astype("float64")
    if temperature == 0:
        # Limit of temperature -> 0: all mass collapses onto the arg-max.
        return np.argmax(preds)

    # log(0) = -inf is intended here (those entries keep probability 0), so
    # silence the divide-by-zero warning rather than letting it flood output.
    with np.errstate(divide="ignore"):
        scaled = np.log(preds) / temperature
    # Shift so the largest term is exp(0) = 1. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # becomes 0/0 = NaN, which np.random.multinomial rejects.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Train the model

In [29]:
epochs = 1
batch_size = 128

for epoch in range(epochs):
    # Fit one epoch at a time so sample text can be generated between epochs.
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    # Pick one random seed window and reuse it for every diversity setting.
    start_index = random.randint(0, len(text) - maxlen - 1)
    print(f"text len {len(text)} start_index {start_index}, maxlen {maxlen}")

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print("...Diversity:", diversity)

        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        pieces = []
        for _ in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, len(chars)))
            positions = np.arange(len(sentence))
            columns = [char_indices[ch] for ch in sentence]
            x_pred[0, positions, columns] = 1.0

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest char, append the new one.
            sentence = sentence[1:] + next_char
            pieces.append(next_char)
        generated = "".join(pieces)

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
text len 1940271 start_index 282552, maxlen 40
...Diversity: 0.2
...Generating with seed: " obtained assuming a host galaxy distanc"
...Generated:  e of the previous of the source with the source and the source with the source and the probable the source and the source with a strong the source and in the source and in the source and the spectra and the source and the source and the source and the spectrum of the source and in the source and the source and the source and a strument with the source and the source with the source and in the sour

...Diversity: 0.5
...Generating with seed: " obtained assuming a host galaxy distanc"
...Generated:  e transitud on 2018 MJD 58224 UT with a state of other and band in the new source (radio energy state and and suggest the spectrum (ATel #11180) show the flux density of the respectively with a chand degree law to present of the strongly as a source and the searching was not are and confirm supernovae data band. SN 