## 1. Import Library 

In [42]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from sklearn.metrics  import accuracy_score,f1_score,precision_score,recall_score
from keras.utils import to_categorical
import numpy as np 
import regex as re 
np.set_printoptions(threshold=np.inf)

## 2. Data Proprocesing

#### 2.1. Read the file and divide it into sentences

In [43]:
def file_to_sentence_list(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    sentences = [sentence.strip() for sentence in re.split(r'(?<=[.!?])\s+', text) if sentence.strip()]
    return sentences 

#### 2.2. Create a dictionary for each word in the text file

In [44]:
file_path = 'train.txt'
text_data = file_to_sentence_list(file_path)
print(text_data)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data) 
total_words = len(tokenizer.word_index) + 1
print(total_words)


['Ho Chi Minh City School of Technical Education, with 53 years of construction and development, has trained more than 47,500 engineers and vocational teachers for the country; 1,300 masters; Training and fostering high quality human resources for the vocational education system as well as providing a team of engineers for the whole country.', 'Over the past few decades, the school has emerged as a reliable address for learners and graduates with outstanding training quality, with the highest employment rate of graduates in Vietnam thanks to its brand name.', 'The school is favored by domestic and foreign businesses.', 'To have a famous brand today, the German working style has penetrated deeply into the minds and subconscious of dozens of generations of teachers and students, originating from the event exactly 50 years ago when the Government of the Federal Republic of Germany sponsored the project and laid the first stone to build a school called Viet Duc Technical TH, located on the

#### 2.3. Create Input 

In [45]:
def CreateInput(text_data):
    input_sequences = []
    for line in text_data:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences
input_sequences=CreateInput(text_data)
print(input_sequences)

[[32, 33], [32, 33, 34], [32, 33, 34, 35], [32, 33, 34, 35, 5], [32, 33, 34, 35, 5, 2], [32, 33, 34, 35, 5, 2, 9], [32, 33, 34, 35, 5, 2, 9, 12], [32, 33, 34, 35, 5, 2, 9, 12, 8], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37, 74], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37, 74, 75], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37, 74, 75, 112], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37, 74, 75, 112, 196], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37, 74, 75, 112, 196, 197], [32, 33, 34, 35, 5, 2, 9, 12, 8, 195, 36, 2, 73, 3, 54, 37, 74, 75, 112, 196, 197, 76], [32, 33, 34,

#### 2.4.Convert to full vectors

In [46]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

#### 2.5 Divide the data set into training set and testing set

In [47]:
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y =to_categorical(y, num_classes=total_words)

### 3 RNN Model

#### 3.1 Configuration RNN

In [48]:
model = Sequential()
model.add(Embedding(total_words, 10, name='embedding_layer')) 
model.add(SimpleRNN(32, return_sequences=True, activation='relu', name='rnn_layer_1'))
model.add(SimpleRNN(32, return_sequences=True, activation='relu', name='rnn_layer_2'))
model.add(SimpleRNN(32, return_sequences=True, activation='relu', name='rnn_layer_3'))
model.add(SimpleRNN(32, activation='relu', name='rnn_layer_4'))
model.add(Dense(total_words, activation='softmax', name='output_layer'))


#### 3.2. Trainning RNN

In [49]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=110, verbose=1)

Epoch 1/110


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.0694 - loss: 6.1498
Epoch 2/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.0933 - loss: 5.4128
Epoch 3/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.0847 - loss: 5.2971
Epoch 4/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0893 - loss: 5.2956
Epoch 5/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0861 - loss: 5.2296
Epoch 6/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0823 - loss: 5.2609
Epoch 7/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0838 - loss: 5.2308
Epoch 8/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0802 - loss: 5.1755
Epoch 9/110
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1dc63190fd0>

#### 3.3 Model Summary

In [50]:
model.summary()

#### 3.4 Predicting 

In [51]:
next_words = 10
seed_text="In the early 90s, a delegation of experts from the Federal Republic of Germany"
y_predict=[]
def Recommend(seed_text):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list)
        predicted_word = tokenizer.index_word[np.argmax(predicted_probs)]
        seed_text += " " + predicted_word
        y_predict.append(predicted_word)
    return seed_text
y_result=Recommend(seed_text)
print(y_result)
print(y_predict)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 729ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
In the early 90s, a delegation of experts from the Federal Republic of Germany came to vietnam to find a reliable partner to receive
['came', 'to', 'vietnam', 'to', 'find', 'a', 'reliable', 'partner', 'to', 'receive']


#### 3.5 Evaluating 

In [52]:
y_true = ["came","to","Vietnam","to","find","a","reliable","partner", "to","receive"]
accuracy = accuracy_score(y_true, y_predict)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_true, y_predict, average='weighted')
print(f"Precision: {precision:.4f}")

recall = recall_score(y_true, y_predict, average='weighted')
print(f"Recall: {recall:.4f}")

f1score = f1_score(y_true, y_predict, average='weighted')
print(f"F1-score: {f1score}")

Accuracy: 0.9000
Precision: 0.9000
Recall: 0.9000
F1-score: 0.9


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
