## 1. Import the necessary libraries

In [4]:
from gensim.models import Word2Vec,KeyedVectors
import numpy as np
from tqdm import tqdm
import pandas as pd
from keras.utils import to_categorical
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## 2 Data preprocessing

> Read File CSV

In [5]:
def ReadData(path):
    df=pd.read_csv(path,encoding='utf-8')
    return df['comment'], df['label']

> Execute function

In [6]:
X_train,y_train = ReadData("./DataPhone/trainprocessed.csv")
X_test,y_test=ReadData("./DataPhone/testprocesssed.csv")

In [7]:
X_train.shape

(7786,)

> word separation

In [8]:
def wordseparation(comment):
    return [review.split() for review in comment]

In [9]:
X_train=wordseparation(X_train)
X_test=wordseparation(X_test)
X_train[0:2]

[['pin',
  'kém',
  'còn',
  'miễn',
  'chê',
  'mua',
  '832019',
  'tình_trạng',
  'pin',
  'còn',
  '88',
  'ai',
  'giống',
  'tôi'],
 ['sao',
  'gọi',
  'điện_thoại',
  'màn_hình',
  'chấm',
  'nhỏ',
  'nháy',
  'gần',
  'camera',
  'vậylúc']]

> word embedding

In [10]:
model_wordembedding = Word2Vec(sentences=X_train, vector_size=128, window=5, min_count=0, workers=4, sg=1)
model_wordembedding.wv.save("./model/word.model")

> Load Model_Word2Vec

In [21]:
model_embedding = KeyedVectors.load('./model/word.model')
num_words = len(model_embedding.index_to_key)
print("Number of Words:", num_words)
print("Vector:", model_embedding.get_vector('phần_mềm'))

Number of Words: 11616
Vector: [-0.29676306 -0.5523812   0.29708946  0.14677978  0.15434411 -0.22118884
 -0.22413431 -0.03216672  0.01646779  0.29965252  0.4591725   0.17191347
 -0.12816055 -0.02394285  0.19848335  0.46312287 -0.33102342  0.10726438
  0.05330929  0.20024107  0.02613792  0.37338787 -0.1339589  -0.5895731
  0.10730265  0.3342829  -0.3188318   0.29415742 -0.04572802 -0.35089594
  0.02332449 -0.02299648  0.18665695  0.20008974 -0.10690106 -0.15883224
  0.53895795 -0.13592862  0.07226042 -0.03267276 -0.1830259   0.08819146
  0.01389484 -0.10688017  0.28538564  0.40696213 -0.26715556 -0.2783442
 -0.03739665  0.23914094  0.09691461  0.06437067  0.15280935  0.13934802
 -0.11665509 -0.00979637  0.48695856 -0.18629698 -0.06011186  0.19943099
  0.12377575 -0.0577236   0.20280531 -0.16854912  0.19220933 -0.09229698
  0.15679628 -0.10646278 -0.18201728 -0.30503842  0.08273465 -0.4593613
 -0.30457163 -0.03343036  0.08461061 -0.298229    0.05588067  0.17358802
 -0.47685862 -0.0025682

> Corpus

In [19]:
model_embedding.key_to_index

{'máy': 0,
 'mua': 1,
 'pin': 2,
 'mình': 3,
 'game': 4,
 'dùng': 5,
 'mới': 6,
 'tốt': 7,
 'chơi': 8,
 'ko': 9,
 'quá': 10,
 'ok': 11,
 '1': 12,
 'giá': 13,
 'đẹp': 14,
 'mượt': 15,
 'nhanh': 16,
 'sạc': 17,
 'ngày': 18,
 'ổn': 19,
 'camera': 20,
 'còn': 21,
 'chụp': 22,
 'thấy': 23,
 'tầm': 24,
 'xài': 25,
 'hơn': 26,
 'k': 27,
 'về': 28,
 'màn_hình': 29,
 'trâu': 30,
 'trong': 31,
 'hơi': 32,
 'đc': 33,
 'khá': 34,
 'sản_phẩm': 35,
 '2': 36,
 'lỗi': 37,
 'tay': 38,
 'tháng': 39,
 'sao': 40,
 'hay': 41,
 'nhân_viên': 42,
 'mọi': 43,
 'sử_dụng': 44,
 'con': 45,
 'hình': 46,
 'sài': 47,
 'hết': 48,
 'vân': 49,
 'nói_chung': 50,
 'nhiệt_tình': 51,
 'ngon': 52,
 'thứ': 53,
 'nóng': 54,
 'lắm': 55,
 '3': 56,
 'nghe': 57,
 'nó': 58,
 'loa': 59,
 'luôn': 60,
 'điện_thoại': 61,
 'khác': 62,
 'tệ': 63,
 'lag': 64,
 'ảnh': 65,
 'người': 66,
 'wifi': 67,
 'xem': 68,
 'mấy': 69,
 'ở': 70,
 'nào': 71,
 'biết': 72,
 'bạn': 73,
 'lần': 74,
 'cấu_hình': 75,
 'dc': 76,
 'đổi': 77,
 'nói': 78,
 'hàng'

> Check to see how many words the longest sentence has

In [13]:
max_sequence = max([len(seq) for seq in X_train])
max_sequence_t=max ([len(seq) for seq in X_test ])
print(max_sequence)
print(max_sequence_t)

129
131


> representation for unknown words

In [14]:
embedding_size = 128
# UNK_EMBEDDING = np.random.rand(embedding_size)

> convert words to vector space and padding

In [17]:
def comment_embedding(comment):
    matrix = np.zeros((131, embedding_size))
    for i, word in enumerate(comment):
            if word in model_embedding:
                matrix[i] = model_embedding.get_vector(word)
            # else:
            #     matrix[i] = np.zeros(128)
    return matrix

> Execute Function

In [18]:
train_data = []
test_data =[]
for x in tqdm(X_train):
    train_data.append(comment_embedding(x))
for y in tqdm(X_test): 
    test_data.append(comment_embedding(y))
test_data=np.array(test_data)
train_data=np.array(train_data)

100%|██████████| 7786/7786 [00:00<00:00, 9303.23it/s]
100%|██████████| 2224/2224 [00:00<00:00, 8792.16it/s]


> Convert labels to numbers

In [22]:
label_encoder = LabelEncoder()
# Mã hóa các nhãn văn bản thành các giá trị số
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded=label_encoder.fit_transform(y_test)

> Convert label to one_hot_vector

In [23]:

print(y_train_encoded[0:3])
# Biểu diễn nhãn dưới dạng one-hot vector
y_train_onehot = to_categorical(y_train_encoded)

[1 0 1]


## 3.Built Model CNN

### 3.1. Design configuration for CNN network

In [24]:
num_classes = 3
filter_sizes = 3
num_filters = 150
epochs = 40
learning_rate = 0.001
dropout_rate = 0.3

> transform input properties

In [25]:
x_train = train_data.reshape(train_data.shape[0], 131, embedding_size, 1).astype('float32')

### 3.2. Model CNN

In [26]:
# Define the model
model = Sequential()
# Add a convolutional layer
model.add(Conv2D(num_filters, (filter_sizes, embedding_size), activation='relu',
                 input_shape=(131, embedding_size, 1)))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Dropout(dropout_rate))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))

### 3.3. Parameter

In [27]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Print the model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 129, 1, 150)       57750     
                                                                 
 max_pooling2d (MaxPooling2  (None, 64, 1, 150)        0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 64, 1, 150)        0         
                                                                 
 flatten (Flatten)           (None, 9600)              0         
                                                                 
 dense (Dense)               (None, 128)               1228928   
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                        

### 3.4. Train

In [28]:
model.fit(x_train ,y_train_onehot, verbose=1, epochs=epochs)
model.save('./model/model_sentiment.h5')


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


  saving_api.save_model(


> Dump file models_sentiment.h5

In [29]:
model.save('./model/model_sentiment.h5')

  saving_api.save_model(


### 3.5 Evaluation test dataset

> Load file models_sentiment.h5

In [30]:
model_sentiment=load_model("./model/model_sentiment.h5")

> predict  test dataset 

In [31]:
result = model_sentiment.predict(test_data)



> First 10 lines of comments

In [32]:
predict_labels = np.argmax(result, axis=1)
print(predict_labels[0:10])

[2 2 2 0 2 0 2 2 2 2]


> Evaluate the model through Accuracy measures,Precision,Recall,F1-score,

In [33]:
accuracy = accuracy_score(y_test_encoded, predict_labels)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test_encoded, predict_labels, average='weighted')
print(f"Precision: {precision:.4f}")

recall = recall_score(y_test_encoded, predict_labels, average='weighted')
print(f"Recall: {recall:.4f}")

f1score = f1_score(y_test_encoded, predict_labels, average='weighted')
print(f"F1-score: {f1score}")

Accuracy: 0.8013
Precision: 0.7845
Recall: 0.8013
F1-score: 0.7910524263706001
