## 1. Import the necessary libraries

In [458]:
from gensim.models import Word2Vec,KeyedVectors
import numpy as np
from tqdm import tqdm
import pandas as pd
from keras.utils import to_categorical
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## 2. Data preprocessing

> Read the CSV file

In [459]:
def ReadData(path):
    """Load a UTF-8 encoded CSV file and return its text and label columns.

    Parameters
    ----------
    path : str
        Location of the CSV file. It must contain the columns
        ``comment`` and ``label``.

    Returns
    -------
    tuple
        ``(comments, labels)`` - the ``comment`` column and the ``label``
        column of the loaded frame, in that order.
    """
    frame = pd.read_csv(path, encoding='utf-8')
    comments = frame['comment']
    labels = frame['label']
    return comments, labels

> Execute function

In [460]:
# Load the pre-processed train and test splits as (comments, labels) pairs.
X_train, y_train = ReadData("./DataPhone/trainprocessed.csv")
X_test, y_test = ReadData("./DataPhone/testprocesssed.csv")

In [461]:
# Display the shape of the training comments returned by ReadData.
X_train.shape

(7786,)

> Word separation (split each comment into tokens)

In [462]:
def wordseparation(comment):
    """Split every review into whitespace-delimited tokens.

    Parameters
    ----------
    comment : iterable of str
        The reviews to tokenize.

    Returns
    -------
    list of list of str
        One token list per review, in the input order.
    """
    tokenized = []
    for sentence in comment:
        # str.split() with no argument splits on any run of whitespace.
        tokenized.append(sentence.split())
    return tokenized

In [463]:
# Tokenize both splits; every comment becomes a list of tokens.
X_train = wordseparation(X_train)
X_test = wordseparation(X_test)

# Preview the first two tokenized training comments.
X_train[:2]

[['pin',
  'kém',
  'còn',
  'miễn',
  'chê',
  'mua',
  '832019',
  'tình_trạng',
  'pin',
  'còn',
  '88',
  'ai',
  'giống',
  'tôi'],
 ['sao',
  'gọi',
  'điện_thoại',
  'màn_hình',
  'chấm',
  'nhỏ',
  'nháy',
  'gần',
  'camera',
  'vậylúc']]

> Word embedding (train Word2Vec on the training comments)

In [464]:
# Train a Word2Vec model (sg=1) on the tokenized training comments.
# min_count=0 keeps every token that appears in the training data.
model_wordembedding = Word2Vec(
    sentences=X_train,
    vector_size=128,
    window=5,
    min_count=0,
    workers=4,
    sg=1,
)

# Persist the word vectors (the `.wv` attribute) for later loading.
model_wordembedding.wv.save("./model/word.model")

> Load Model_Word2Vec

In [465]:
# Reload the saved word vectors and report the vocabulary size.
model_embedding = KeyedVectors.load("./model/word.model")
num_words = len(model_embedding.index_to_key)
print("Number of Words:", num_words)

# Show the learned vector of one sample token as a sanity check.
print("Vector:", model_embedding.get_vector("phần_mềm"))

Number of Words: 11616
Vector: [-0.3827453  -0.49992877  0.13456644 -0.00486418  0.15884152 -0.21923126
 -0.2954156  -0.09105124  0.03355692  0.2085277   0.41847968 -0.01629056
 -0.27218434 -0.08023937  0.2243351   0.34928292 -0.28355667  0.04318834
 -0.00836492  0.17223535  0.0027854   0.37866616 -0.25149423 -0.45774424
 -0.11549912  0.16263096 -0.3604427   0.32583836 -0.00733858 -0.19906686
 -0.20423645  0.05181192  0.09869606  0.22822438 -0.0940722  -0.30630314
  0.40915743 -0.04049246  0.08892313  0.11835228 -0.22382745  0.25424024
  0.04407566 -0.15520553  0.3344005   0.27177766 -0.36696735 -0.32712555
 -0.06771728  0.18338199  0.1291518   0.04926406  0.23156923  0.06945327
 -0.11137834  0.04027005  0.3986048  -0.2713666   0.09635579  0.17344695
  0.14819704 -0.01683081  0.4183759  -0.21694987  0.08569378 -0.06270046
  0.2425611  -0.0340323  -0.14494827 -0.33292085 -0.05165795 -0.48042902
 -0.30397213 -0.1941495   0.12841249 -0.26897725  0.10100113  0.28125358
 -0.4655839   0.0876

> Vocabulary of the word-embedding model

In [1]:
# Preview only the first 20 vocabulary entries; displaying the whole
# list would flood the notebook output.
model_embedding.index_to_key[:20]

NameError: name 'model_embedding' is not defined

> Check to see how many words the longest sentence has

In [466]:
# Longest tokenized comment (in tokens) in the train and test splits.
max_sequence = max(map(len, X_train))
max_sequence_t = max(map(len, X_test))
print(max_sequence)
print(max_sequence_t)

129
131


> Vector representation for unknown (out-of-vocabulary) words

In [467]:
# Dimensionality of a word vector; matches vector_size used for Word2Vec.
embedding_size = 128

# Shared vector used for every out-of-vocabulary token.
# It is drawn from a locally seeded generator so that it is identical on
# every kernel run: the vector is not saved anywhere, so an unseeded draw
# would make features differ between training and a later inference run.
UNK_EMBEDDING = np.random.default_rng(42).random(embedding_size)

> Convert words to vectors and zero-pad each comment to a fixed length

In [468]:
def comment_embedding(comment, max_len=131, embeddings=None, unk_vector=None):
    """Turn a tokenized comment into a fixed-size matrix of word vectors.

    Row ``i`` holds the vector of token ``i``. Tokens missing from the
    embedding table get ``unk_vector``. Rows past the end of the comment
    stay zero (padding). Tokens beyond ``max_len`` are dropped, so an
    over-long comment no longer raises ``IndexError``.

    Parameters
    ----------
    comment : iterable of str
        Tokens of one comment.
    max_len : int, default 131
        Number of rows in the returned matrix.
    embeddings : object, optional
        Lookup table supporting ``word in embeddings`` and
        ``embeddings.get_vector(word)``. Defaults to the notebook-level
        ``model_embedding``.
    unk_vector : array-like, optional
        Vector used for unknown tokens; its length sets the matrix width.
        Defaults to the notebook-level ``UNK_EMBEDDING``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, len(unk_vector))``.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # keep working unchanged.
    if embeddings is None:
        embeddings = model_embedding
    if unk_vector is None:
        unk_vector = UNK_EMBEDDING
    unk_vector = np.asarray(unk_vector)

    matrix = np.zeros((max_len, unk_vector.shape[0]))
    for i, word in enumerate(comment):
        if i >= max_len:
            # Truncate instead of writing past the last row.
            break
        if word in embeddings:
            matrix[i] = embeddings.get_vector(word)
        else:
            matrix[i] = unk_vector
    return matrix

> Execute Function

In [469]:
# Embed every tokenized comment and stack the per-comment matrices
# into one array per split (training split first, then test split).
train_data = np.array([comment_embedding(tokens) for tokens in tqdm(X_train)])
test_data = np.array([comment_embedding(tokens) for tokens in tqdm(X_test)])

100%|██████████| 7786/7786 [00:00<00:00, 12484.19it/s]
100%|██████████| 2224/2224 [00:00<00:00, 12784.97it/s]


> Convert labels to numbers

In [470]:
label_encoder = LabelEncoder()
# Encode the text labels as integer class ids.
# Fit on the training labels only, then reuse that mapping for the test
# labels. Re-fitting on the test split could assign different ids to the
# same label if the two splits do not contain the same label set.
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

> Convert label to one_hot_vector

In [471]:

# Peek at the first three encoded training labels.
print(y_train_encoded[:3])

# Represent the labels as one-hot vectors.
y_train_onehot = to_categorical(y_train_encoded)

[1 0 1]


## 3. Build the CNN model

### 3.1. Design configuration for CNN network

In [472]:
# Hyper-parameters of the CNN and its training run.
num_classes = 3        # number of sentiment classes
filter_sizes = 3       # convolution kernel height
num_filters = 150      # number of convolution filters
epochs = 40            # training epochs
learning_rate = 0.001  # learning rate setting
dropout_rate = 0.3     # dropout rate

> Reshape the input to the 4-D layout used by Conv2D (samples, tokens, embedding, channel)

In [473]:
# Append a trailing channel axis of size 1 and cast to float32.
# The axis is added to whatever shape train_data has, instead of
# repeating the hard-coded sequence length.
x_train = train_data[..., np.newaxis].astype('float32')

### 3.2. Model CNN

In [474]:
# Define the model: one convolution over token windows, max pooling,
# dropout, then a dense classifier head.
model = Sequential()
# Each filter spans `filter_sizes` consecutive tokens and the full
# embedding width. The input shape is taken from the prepared training
# tensor so it cannot drift from the data.
model.add(Conv2D(num_filters, (filter_sizes, embedding_size), activation='relu',
                 input_shape=x_train.shape[1:]))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Dropout(dropout_rate))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
# One softmax output per class, driven by the num_classes setting
# rather than a repeated literal.
model.add(Dense(num_classes, activation='softmax'))

  super().__init__(


### 3.3. Compile the model (loss, optimizer, metrics)

In [475]:
# Categorical cross-entropy matches the one-hot encoded training labels.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Show the model summary. summary() prints the table itself and returns
# None, so wrapping it in print() only emitted a stray "None".
model.summary()

None


### 3.4. Train

In [476]:
# Train the network. The fit result is kept in `history` so it can be
# inspected later and is not echoed as the cell output.
# Saving is left to the dedicated "dump" cell that follows, instead of
# writing the same file twice.
history = model.fit(x_train, y_train_onehot, epochs=epochs, verbose=1)


Epoch 1/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.6882 - loss: 0.7313
Epoch 2/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.8107 - loss: 0.5068
Epoch 3/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.8158 - loss: 0.4830
Epoch 4/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.8311 - loss: 0.4434
Epoch 5/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.8309 - loss: 0.4239
Epoch 6/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 0.8455 - loss: 0.3992
Epoch 7/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.8506 - loss: 0.3757
Epoch 8/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.8735 - loss: 0.3318
Epoch 9/40
[1m244/244[0m [32m



> Save the trained model to model_sentiment.h5

In [477]:
# Write the trained model to disk so it can be reloaded for evaluation.
model.save('./model/model_sentiment.h5')



### 3.5 Evaluation test dataset

> Load the saved model from model_sentiment.h5

In [482]:
# Reload the saved model from disk; the evaluation cells use this copy.
model_sentiment=load_model("./model/model_sentiment.h5")



> Predict on the test dataset

In [483]:
# Give the test data the same layout and dtype as the training input
# (trailing channel axis, float32) rather than relying on implicit
# input adjustment at predict time.
x_test = test_data[..., np.newaxis].astype('float32')
result = model_sentiment.predict(x_test)

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


> Predicted labels for the first 10 test comments

In [484]:
# Take the highest-scoring class index for each test comment.
predict_labels = result.argmax(axis=1)
print(predict_labels[:10])

[2 1 2 0 2 0 2 2 2 2]


> Evaluate the model with accuracy, precision, recall and F1-score

In [485]:
# Compare the predicted class ids with the encoded test labels.
# Precision, recall and F1 use average='weighted'.
accuracy = accuracy_score(y_test_encoded, predict_labels)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test_encoded, predict_labels, average='weighted')
print(f"Precision: {precision:.4f}")

recall = recall_score(y_test_encoded, predict_labels, average='weighted')
print(f"Recall: {recall:.4f}")

# Four decimals, consistent with the other three metrics.
f1score = f1_score(y_test_encoded, predict_labels, average='weighted')
print(f"F1-score: {f1score:.4f}")

Accuracy: 0.8017
Precision: 0.7837
Recall: 0.8017
F1-score: 0.7881041427012359
