## 1. Import the necessary libraries

In [131]:
from gensim.models import Word2Vec,KeyedVectors
import numpy as np
from tqdm import tqdm
import pandas as pd
from keras.utils import to_categorical
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## 2 Data preprocessing

> Read File CSV

In [132]:
def ReadData(path):
    df=pd.read_csv(path,encoding='utf-8')
    return df['comment'], df['label']

> Execute function

In [133]:
X_train,y_train = ReadData("./DataPhone/trainprocessed.csv")
X_test,y_test=ReadData("./DataPhone/testprocesssed.csv")

In [134]:
X_train.shape

(7786,)

> word separation

In [135]:
def wordseparation(comment):
    return [review.split() for review in comment]

In [136]:
X_train=wordseparation(X_train)
X_test=wordseparation(X_test)
X_train[0:2]

[['pin',
  'kém',
  'còn',
  'miễn',
  'chê',
  'mua',
  '832019',
  'tình_trạng',
  'pin',
  'còn',
  '88',
  'ai',
  'giống',
  'tôi'],
 ['sao',
  'gọi',
  'điện_thoại',
  'màn_hình',
  'chấm',
  'nhỏ',
  'nháy',
  'gần',
  'camera',
  'vậylúc']]

> word embedding

In [137]:
model_wordembedding = Word2Vec(sentences=X_train, vector_size=128, window=5, min_count=0, workers=4, sg=1)
model_wordembedding.wv.save("./model/word.model")

> Load Model_Word2Vec

In [138]:
model_embedding = KeyedVectors.load('./model/word.model')
num_words = len(model_embedding.index_to_key)
print("Number of Words:", num_words)
print("Vector:", model_embedding.get_vector('phần_mềm'))

Number of Words: 11616
Vector: [-0.26839077 -0.5694664   0.16898231  0.2709258   0.06194954 -0.21748176
 -0.1919581  -0.21426831 -0.031178    0.15982957  0.344399    0.1363645
 -0.07074342  0.00841982  0.06501573  0.33299792 -0.41292265 -0.04888694
  0.0448266   0.10994948  0.03558733  0.43595466 -0.10224482 -0.578901
  0.01011072  0.34310105 -0.44262537  0.25674602 -0.04221772 -0.17802006
 -0.00439191 -0.03809931  0.0696373   0.17663802 -0.01129966 -0.24278358
  0.5052278  -0.0655674   0.08159841  0.12477466 -0.19583663  0.10469246
  0.10047981 -0.13904439  0.23409437  0.33600146 -0.16030543 -0.2986271
 -0.01674184  0.21609932  0.06065254  0.02250185  0.20835595  0.15685998
 -0.22031693  0.07688367  0.39416292 -0.33874533 -0.0538542   0.24279806
  0.07723865  0.02359876  0.27001718 -0.06651524  0.15797603 -0.11353112
  0.25065482 -0.16833225 -0.18608865 -0.35185295  0.09435567 -0.43964586
 -0.28560993 -0.0925076   0.01268727 -0.23760478 -0.01106189  0.24776131
 -0.4224776  -0.00570319

> Corpus

In [139]:
model_embedding.key_to_index

{'máy': 0,
 'mua': 1,
 'pin': 2,
 'mình': 3,
 'game': 4,
 'dùng': 5,
 'mới': 6,
 'tốt': 7,
 'chơi': 8,
 'ko': 9,
 'quá': 10,
 'ok': 11,
 '1': 12,
 'giá': 13,
 'đẹp': 14,
 'mượt': 15,
 'nhanh': 16,
 'sạc': 17,
 'ngày': 18,
 'ổn': 19,
 'camera': 20,
 'còn': 21,
 'chụp': 22,
 'thấy': 23,
 'tầm': 24,
 'xài': 25,
 'hơn': 26,
 'k': 27,
 'về': 28,
 'màn_hình': 29,
 'trâu': 30,
 'trong': 31,
 'hơi': 32,
 'đc': 33,
 'khá': 34,
 'sản_phẩm': 35,
 '2': 36,
 'lỗi': 37,
 'tay': 38,
 'tháng': 39,
 'sao': 40,
 'hay': 41,
 'nhân_viên': 42,
 'mọi': 43,
 'sử_dụng': 44,
 'con': 45,
 'hình': 46,
 'sài': 47,
 'hết': 48,
 'vân': 49,
 'nói_chung': 50,
 'nhiệt_tình': 51,
 'ngon': 52,
 'thứ': 53,
 'nóng': 54,
 'lắm': 55,
 '3': 56,
 'nghe': 57,
 'nó': 58,
 'loa': 59,
 'luôn': 60,
 'điện_thoại': 61,
 'khác': 62,
 'tệ': 63,
 'lag': 64,
 'ảnh': 65,
 'người': 66,
 'wifi': 67,
 'xem': 68,
 'mấy': 69,
 'ở': 70,
 'nào': 71,
 'biết': 72,
 'bạn': 73,
 'lần': 74,
 'cấu_hình': 75,
 'dc': 76,
 'đổi': 77,
 'nói': 78,
 'hàng'

> Check to see how many words the longest sentence has

In [140]:
max_sequence = max([len(seq) for seq in X_train])
max_sequence_t=max ([len(seq) for seq in X_test ])
print(max_sequence)
print(max_sequence_t)

129
131


> representation for unknown words

In [141]:
embedding_size = 128

> convert words to vector space and padding

In [142]:
def comment_embedding(comment):
    matrix = np.zeros((131, embedding_size))
    for i, word in enumerate(comment):
            if word in model_embedding:
                matrix[i] = model_embedding.get_vector(word)
    return matrix

> Execute Function

In [143]:
train_data = []
test_data =[]
for x in tqdm(X_train):
    train_data.append(comment_embedding(x))
for y in tqdm(X_test): 
    test_data.append(comment_embedding(y))
test_data=np.array(test_data)
train_data=np.array(train_data)

100%|██████████| 7786/7786 [00:00<00:00, 11897.60it/s]
100%|██████████| 2224/2224 [00:00<00:00, 12105.62it/s]


> Convert labels to numbers

In [144]:
label_encoder = LabelEncoder()
# Mã hóa các nhãn văn bản thành các giá trị số
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded=label_encoder.fit_transform(y_test)

> Convert label to one_hot_vector

In [145]:
print(y_train_encoded[0:3])
# Biểu diễn nhãn dưới dạng one-hot vector
y_train_onehot = to_categorical(y_train_encoded)

[1 0 1]


## 3.Built Model CNN

### 3.1. Design configuration for CNN network

In [146]:
num_classes = 3
filter_sizes = 3
num_filters = 150
epochs = 100
learning_rate = 0.0001
dropout_rate = 0.3

> transform input properties

In [147]:
x_train = train_data.reshape(train_data.shape[0], 131, embedding_size, 1).astype('float32')

### 3.2. Model CNN

In [148]:
# Define the model
model = Sequential()
# Add a convolutional layer
model.add(Conv2D(num_filters, (filter_sizes, embedding_size), activation='relu',
                 input_shape=(131, embedding_size, 1),name='convolution_layer'))
model.add(MaxPooling2D(pool_size=(2, 1),strides=(2, 1),name='max_pooling'))
model.add(Dropout(dropout_rate))
model.add(Flatten())
model.add(Dense(9, activation='relu',name='layer_dense_1'))
model.add(Dense(9, activation='relu',name='layer_dense_2'))
model.add(Dense(9, activation='relu',name='layer_dense_3'))
model.add(Dense(9, activation='relu',name='layer_dense_4'))
model.add(Dense(9, activation='relu',name='layer_dense_5'))
model.add(Dense(9, activation='relu',name='layer_dense_6'))
model.add(Dense(9, activation='relu',name='layer_dense_7'))
model.add(Dense(9, activation='relu',name='layer_dense_8'))
model.add(Dense(3, activation='softmax'))

  super().__init__(


### 3.3. Parameter

In [149]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Print the model summary
print(model.summary())

None


### 3.4. Train

In [150]:
model.fit(x_train ,y_train_onehot, verbose=1, epochs=epochs)
model.save('./model/model_sentiment.h5')

Epoch 1/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6174 - loss: 0.8783
Epoch 2/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7807 - loss: 0.6169
Epoch 3/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8080 - loss: 0.5337
Epoch 4/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8160 - loss: 0.5018
Epoch 5/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8136 - loss: 0.4896
Epoch 6/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8279 - loss: 0.4608
Epoch 7/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8326 - loss: 0.4413
Epoch 8/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8275 - loss: 0.4271
Epoch 9/100
[1m244/244[0m



In [151]:
print(model.summary())

None


> Dump file models_sentiment.h5

In [152]:
model.save('./model/model_sentiment.h5')



### 3.5 Evaluation test dataset

> Load file models_sentiment.h5

In [153]:
model_sentiment=load_model("./model/model_sentiment.h5")



> predict  test dataset 

In [154]:
result = model_sentiment.predict(test_data)

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


> First 10 lines of comments

In [155]:
predict_labels = np.argmax(result, axis=1)
print(predict_labels[0:10])

[2 2 2 0 2 0 2 2 2 0]


> Evaluate the model through Accuracy measures,Precision,Recall,F1-score,

In [156]:
accuracy = accuracy_score(y_test_encoded, predict_labels)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test_encoded, predict_labels, average='weighted')
print(f"Precision: {precision:.4f}")

recall = recall_score(y_test_encoded, predict_labels, average='weighted')
print(f"Recall: {recall:.4f}")

f1score = f1_score(y_test_encoded, predict_labels, average='weighted')
print(f"F1-score: {f1score}")

Accuracy: 0.7923
Precision: 0.7881
Recall: 0.7923
F1-score: 0.7899448000899431
