## 1. Import the necessary libraries

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics  import accuracy_score,f1_score,precision_score,recall_score
from keras.activations import relu
from keras.utils import to_categorical
import numpy as np
import pandas as pd

## 2 Data preprocessing

>Read CSV

In [2]:
def ReadData(path):
    df=pd.read_csv(path,encoding='utf-8')
    return df['comment']

>Execute Function

In [3]:
df_train=ReadData("../DataPhone/trainprocessed.csv")
df_test= ReadData("../DataPhone/testprocesssed.csv")

> First line

In [4]:
print(df_train[0])
print(df_test[0])

pin kém còn miễn chê mua 832019 tình_trạng pin còn 88 ai giống tôi
điện thoải ổn facelock cực nhanh vân tay ôk màn_hình lớn pin trâu liên_quân zalo youtube một ngày mất khoảng 45 tuy chip 439 mượt đa_nhiệm khá ổn


>Size df_train ,df_test

In [5]:
print(df_train.shape)
print(df_test.shape)

(7786,)
(2224,)


> word separation

In [6]:
def wordseparation(commnents):
    return [comment.split() for comment in commnents]

> Execute Function

In [7]:
df_train=wordseparation(df_train)
df_test=wordseparation(df_test)

> First 2 lines

In [8]:
print(df_train[0:2])
print(df_test[0:2])

[['pin', 'kém', 'còn', 'miễn', 'chê', 'mua', '832019', 'tình_trạng', 'pin', 'còn', '88', 'ai', 'giống', 'tôi'], ['sao', 'gọi', 'điện_thoại', 'màn_hình', 'chấm', 'nhỏ', 'nháy', 'gần', 'camera', 'vậylúc']]
[['điện', 'thoải', 'ổn', 'facelock', 'cực', 'nhanh', 'vân', 'tay', 'ôk', 'màn_hình', 'lớn', 'pin', 'trâu', 'liên_quân', 'zalo', 'youtube', 'một', 'ngày', 'mất', 'khoảng', '45', 'tuy', 'chip', '439', 'mượt', 'đa_nhiệm', 'khá', 'ổn'], ['mình', 'mới', 'mua', 'vivo91c', 'tải', 'ứng_dụng', 'games', 'nhanh', 'hài_lòng', 'cài', 'hình', 'nền', 'khóa', 'màn_hình', 'hay', 'mình', 'biết', 'hết', 'chức_năng', 'nó', 'tư_vấn', 'viên', 'nhiệt_tình']]


>Create  Corpus

In [9]:
def CreateCorpus(Vi):
    tokenizer=Tokenizer(oov_token='<oov>')
    tokenizer.fit_on_texts(Vi)
    return tokenizer

> Execute Funtion

In [10]:
df_train_corpus=CreateCorpus(df_train)
df_test_corpus=CreateCorpus(df_test)
print(df_train_corpus.word_index)
print(df_test_corpus.word_index)

{'<oov>': 1, 'máy': 2, 'mua': 3, 'pin': 4, 'mình': 5, 'game': 6, 'dùng': 7, 'mới': 8, 'tốt': 9, 'chơi': 10, 'ko': 11, 'quá': 12, 'ok': 13, '1': 14, 'giá': 15, 'đẹp': 16, 'mượt': 17, 'nhanh': 18, 'sạc': 19, 'ngày': 20, 'ổn': 21, 'camera': 22, 'còn': 23, 'chụp': 24, 'thấy': 25, 'tầm': 26, 'xài': 27, 'hơn': 28, 'k': 29, 'về': 30, 'màn_hình': 31, 'trâu': 32, 'trong': 33, 'hơi': 34, 'đc': 35, 'khá': 36, 'sản_phẩm': 37, '2': 38, 'lỗi': 39, 'tay': 40, 'tháng': 41, 'sao': 42, 'hay': 43, 'nhân_viên': 44, 'mọi': 45, 'sử_dụng': 46, 'con': 47, 'hình': 48, 'sài': 49, 'hết': 50, 'vân': 51, 'nói_chung': 52, 'nhiệt_tình': 53, 'ngon': 54, 'thứ': 55, 'nóng': 56, 'lắm': 57, '3': 58, 'nghe': 59, 'nó': 60, 'loa': 61, 'luôn': 62, 'điện_thoại': 63, 'khác': 64, 'tệ': 65, 'lag': 66, 'ảnh': 67, 'người': 68, 'wifi': 69, 'xem': 70, 'mấy': 71, 'ở': 72, 'nào': 73, 'biết': 74, 'bạn': 75, 'lần': 76, 'cấu_hình': 77, 'dc': 78, 'đổi': 79, 'nói': 80, 'hàng': 81, 'sáng': 82, 'tôi': 83, 'đt': 84, 'cả': 85, 'đơ': 86, 'tuần'

In [11]:
print(f"Total Word in Corpus of df_train: ({len(df_train_corpus.word_index)})")
print(f"Total Word in Corpus of df_test: ({len(df_test_corpus.word_index)})")

Total Word in Corpus of df_train: (11617)
Total Word in Corpus of df_test: (5352)


> longest sentence in the training set, longest sentence in the test set

In [12]:
max_sentence_Xtrain= max([len(i) for i in  (df_train)])
print(f"max_sentence_df_train: {max_sentence_Xtrain} word")
max_sentence_Xtest=max([len(i) for i in (df_test)])
print(f"max_sentence_df_test: {max_sentence_Xtest} word")

max_sentence_df_train: 129 word
max_sentence_df_test: 131 word


> Create input training data by converting words to indexes and performing n-grams method of getting input sequences

In [13]:
def CreateInput(text_data):
    input_sequences = []
    for line in text_data:
        token_list = df_train_corpus.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

> Execute Function

In [14]:
input_sequences=CreateInput(df_train)
print(input_sequences[0:3])

[[4, 97], [4, 97, 23], [4, 97, 23, 1337]]


> Padding

In [15]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sentence_Xtrain, padding='pre'))
print(input_sequences[0:3])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    4   97]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0

> Divide the data into x_train set and y_train set

In [16]:
X_train, y_train = input_sequences[:, :-1], input_sequences[:, -1]
y_train =to_categorical(y_train, num_classes=len(df_train_corpus.word_index)+1)

## 3.Built Model LSTM

### 3.1. Design configuration for LSTM network

In [17]:
dropout=0.3
total_word=len(df_train_corpus.word_index)
num_classes=len(df_train_corpus.word_index)+1
embedding_size = 50
num_lstm_units = 128
epochs=12

### 3.2. Model LSTM

In [18]:
model = Sequential()
model.add(Embedding(total_word+1, embedding_size, name='embedding_layer'))
model.add(LSTM(num_lstm_units, return_sequences=True,name='LSTM_layer_1'))
model.add(Dropout(dropout,name='Dropout_layer_1'))
model.add(LSTM(num_lstm_units,name='LSTM_layer_2',return_sequences=True))
model.add(LSTM(num_lstm_units,name='LSTM_layer_3'))
model.add(Dense(num_classes, activation='softmax'))

### 3.3. Hyperparameter

In [19]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


### 3.4. Train

In [20]:
model.fit(X_train, y_train, verbose=1, epochs=epochs, batch_size=32)

MemoryError: Unable to allocate 8.06 GiB for an array with shape (186340, 11618) and data type float32

In [None]:
model.save('../model/model_generater.h5')