taken from [link](https://keras.io/examples/generative/lstm_character_level_text_generation/)
  
data from [link](https://www.kaggle.com/namanj27/astronomers-telegram-dataset?select=Processed_Atels.csv)

# Setup

In [3]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io
import pandas as pd

# Prepare the Data

In [6]:
path_to_file = keras.utils.get_file(
    "Processed_Atels", 
    "https://raw.githubusercontent.com/bellaroseee/447-Group-Project/checkpoint-2/src/Processed_Atels.csv")
data = pd.read_csv(path_to_file)

In [20]:
text_processed = data["Text processed"]
text = ""
for row in text_processed:
    text += row

print("Corpus length:", len(text))
chars = sorted(list(set(text)))
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# char_indices maps character to index (index is decided here)
# indices_char maps index to character (this is the opposite of char_indices)

maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen]) # add 40 chars from i to sentences
    next_chars.append(text[i + maxlen]) # add the next char to next_chars
print("Number of sequences:", len(sentences))

# test = 1
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    # if (test == 1) :
      # print(i, sentence)
      for t, char in enumerate(sentence):
          # print(t, char)
          x[i, t, char_indices[char]] = 1
      y[i, char_indices[next_chars[i]]] = 1
      # print(char_indices['A'])
      # print(x[0][0])
      # print(char_indices['W'])
      # print(y[0])
      # test = 2

Corpus length: 1940271
Total chars: 108
Number of sequences: 646744


* `text` is a list of characters from data. `text[0]` = `w`, `text[:5]` = `We re`
* `sentences` is a list of sentences of length `maxlen` from data `text`, incremented by `step` 
    * `sentences[0]` : We report spectroscopic observations of | `next_char[0]` : A
    * `sentences[1]` : report spectroscopic observations of AT2 | `next_char[1]` : 0
    * `sentences[2]` : ort spectroscopic observations of AT2018 | `next_char[2]` : 1
* `x.shape` (646744, 40, 108) -> (num of sequences, length of sequence, number of characters) 
* `y.shape` (646744, 108) -> (num of sequences, number of characters)


Full Explanation of For Loop
  
i, sentence: `0 We report spectroscopic observations of`

t, char: `0 W`

`char_indices['W']` = 56

`x[0][0]` : 

```
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False]
```

next_char is 'A', `char_indices['A']` = 34.

`y[0]` :

```
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False]
```


  


# Build the model: a single LSTM layer

In [27]:
model = keras.Sequential( # stack layers into tf.keras.Model.
    [ # this is the first layer
     
        keras.Input(shape=(maxlen, len(chars))), # instatntiate Keras tensor of shape (40, 180)
        layers.LSTM(128), # 128 is the dimensionality of output space
        layers.Dense(len(chars), activation="softmax"), # densely connected NN layer with output of dimension 40 & softmax activation function.
    ], 
    [ # this is the first layer
     
        keras.Input(shape=(maxlen, len(chars))), # instatntiate Keras tensor of shape (40, 180)
        layers.LSTM(128), # 128 is the dimensionality of output space
        layers.Dense(len(chars), activation="softmax"), # densely connected NN layer with output of dimension 40 & softmax activation function.
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer) # configure the losses and optimizer 

In [29]:
model.summary()

Model: "[<KerasTensor: shape=(None, 40, 108) dtype=float32 (created by layer 'input_4')>, <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x7f9a3e202278>, <tensorflow.python.keras.layers.core.Dense object at 0x7f9a367ae780>]"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 128)               121344    
_________________________________________________________________
dense_2 (Dense)              (None, 108)               13932     
Total params: 135,276
Trainable params: 135,276
Non-trainable params: 0
_________________________________________________________________


# Prepare the text sampling function

In [30]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Train the model

In [31]:
epochs = 1
batch_size = 128

for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    print(f"text len {len(text)} start_index {start_index}, maxlen {maxlen}")
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = text[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars))) # this is 1 row of same dimesion with x
            for t, char in enumerate(sentence): 
                x_pred[0, t, char_indices[char]] = 1.0 # map True value on x_pred based on 'sentence'
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity) # calls the sample(preds, temperature) fn above
            next_char = indices_char[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()

ValueError: ignored