In [18]:
from sklearn.datasets import load_svmlight_file

# Đọc tệp .svm
X, y = load_svmlight_file('JS_DATASET.svm')
print(f"Shape of feature matrix: {X.shape}")
print(f"Shape of labels: {y.shape}")


Shape of feature matrix: (6725, 77)
Shape of labels: (6725,)


In [2]:
from sklearn.model_selection import train_test_split

# Chia dữ liệu thành tập huấn luyện và tập kiểm thử (test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Làm sạch dữ liệu trước khi train 

In [3]:
from sklearn.preprocessing import StandardScaler
import joblib
scaler = StandardScaler(with_mean=False)  # with_mean=False vì dữ liệu thưa (sparse)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
joblib.dump(scaler, 'scaler.joblib')
print(X_train)
print(X_test)

  (0, 9)	0.0027658121729650037
  (0, 19)	0.001864269701534398
  (0, 20)	0.3548696913675869
  (0, 21)	0.006453785604238408
  (0, 22)	0.33782542862539705
  (0, 23)	9.976684119481888
  (0, 24)	0.025882008227354248
  (0, 26)	1.150708782556366
  (0, 27)	10.585004145989624
  (0, 28)	0.000791716373798783
  (0, 29)	0.013644607375160997
  (0, 30)	0.008101460105482104
  (0, 31)	0.08835624179319403
  (0, 55)	0.009354057623302745
  (0, 56)	0.00675734295065026
  (0, 60)	0.0024887613357411223
  (0, 61)	0.002486489125254527
  (0, 62)	0.007396257604386003
  (0, 65)	0.0031892440665617196
  (0, 69)	0.005944046815324267
  (0, 70)	0.005943228747498432
  (0, 71)	2.3093544246031117
  (0, 72)	1.4353334088202754
  (0, 73)	3.4598079528411945
  (1, 9)	0.04425299476744006
  :	:
  (4706, 21)	0.09282694960762909
  (4706, 22)	1.3513017145015882
  (4706, 23)	8.621583323725488
  (4706, 24)	0.38958441453848924
  (4706, 25)	1.168058185405552
  (4706, 27)	9.183432450769654
  (4706, 29)	0.39692009544148116
  (4706, 30)	0

# Sử dụng mô hình SVC của machine learning 

In [4]:
from sklearn.svm import SVC

model = SVC(probability=True)
model.fit(X_train, y_train)


# Đưa ra độ chính xác + Confusion Matrix + Báo cáo chi tiết các chỉ số (Precision, recall, f1-score)

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
y_pred = model.predict(X_test)

# Đánh giá accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Ma trận nhầm lẫn
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Báo cáo chi tiết các chỉ số
print("Classification Report:\n", classification_report(y_test, y_pred))

svm_metrics={
    'accuracy': accuracy_score(y_test, y_pred),
    'presision': metrics.precision_score(y_test, y_pred, average='weighted'),
    'recall': metrics.recall_score(y_test, y_pred, average='weighted'),
    'f1-score': metrics.f1_score(y_test, y_pred, average='weighted')
}
print(svm_metrics)

Accuracy: 0.9930624380574826
Confusion Matrix:
 [[1352    3]
 [  11  652]]
Classification Report:
               precision    recall  f1-score   support

        -1.0       0.99      1.00      0.99      1355
         1.0       1.00      0.98      0.99       663

    accuracy                           0.99      2018
   macro avg       0.99      0.99      0.99      2018
weighted avg       0.99      0.99      0.99      2018

{'accuracy': 0.9930624380574826, 'presision': 0.9930762746489471, 'recall': 0.9930624380574826, 'f1-score': 0.993051593045452}


# Lưu mô hình

In [6]:
import joblib

# Lưu mô hình
joblib.dump(model, 'js_malware_classifier.pkl')


['js_malware_classifier.pkl']

In [7]:
import numpy as np
from scipy.sparse import csr_matrix
import joblib

# Tải mô hình đã huấn luyện
model = joblib.load('js_malware_classifier.pkl')

# Dữ liệu mẫu
data_str = "+1 10:16 14:3 20:1 21:9 22:4061 23:9.00 24:4.61 25:4061 26:1 28:4.61 30:4061 31:1181 48:1 49:15 55:1 56:287 57:40 60:1 61:8 62:8 63:3 65:13 66:5 67:38 68:3 69:3 72:0.01 73:0.07 74:0.29"


# Tách và xử lý dữ liệu
data = data_str.split()
features = [tuple(map(float, x.split(':'))) for x in data[1:]]

# Tạo một sparse matrix với 77 đặc trưng
total_features = 77
rows = [0]  # Chỉ có một hàng
cols = [int(f[0]) - 1 for f in features]  # Trừ đi 1 để bắt đầu từ 0
values = [f[1] for f in features]

# Tạo một mảng đầy đủ với tất cả các đặc trưng
full_values = np.zeros(total_features)
for col, value in zip(cols, values):
    full_values[col] = value
# Tạo sparse matrix
X_new = csr_matrix(full_values)
scaler = joblib.load('scaler.joblib')
X_new = scaler.transform(X_new)
print("Dữ liệu mẫu sau khi chuẩn hóa:", X_new)
# Dự đoán
y_pred = model.predict(X_new)
y_pred_proba = model.predict_proba(X_new)
print("Dự đoán cho dữ liệu mẫu:", y_pred)
print("Dự đoán cho dữ liệu mẫu:", y_pred[0])


Dữ liệu mẫu sau khi chuẩn hóa:   (0, 9)	0.04425299476744006
  (0, 13)	0.7132675577170564
  (0, 19)	0.000932134850767199
  (0, 20)	0.6387654444616564
  (0, 21)	0.14560457410451208
  (0, 22)	1.2161715430514293
  (0, 23)	9.05364444701014
  (0, 24)	0.6110862523911954
  (0, 25)	1.168058185405552
  (0, 27)	9.643650022334421
  (0, 29)	0.6225927028149305
  (0, 30)	0.13288644978575506
  (0, 47)	0.3727886545632666
  (0, 48)	0.3100479930397591
  (0, 54)	0.2353813130935309
  (0, 55)	0.08135195569357236
  (0, 56)	0.00932047303537967
  (0, 59)	0.0032846338902572584
  (0, 60)	0.01991009068592898
  (0, 61)	0.019891913002036216
  (0, 62)	0.0036981288021930014
  (0, 64)	0.11421930987819066
  (0, 65)	0.007973110166404299
  (0, 66)	0.19511537805347068
  (0, 67)	0.016298018883405267
  (0, 68)	0.01628343781414959
  (0, 71)	0.14433465153769448
  (0, 72)	0.5581852145412183
  (0, 73)	2.508360765809866
Dự đoán cho dữ liệu mẫu: [1.]
Dự đoán cho dữ liệu mẫu: 1.0


# Test lại với dữ liệu mẫu

In [53]:
# Dữ liệu mẫu để kiểm tra
import joblib
sample_data = X_test[4].reshape(1, -1)  # Lấy một mẫu từ tập test
print("Sample data:", sample_data)
model = joblib.load('js_malware_classifier.pkl')
predicted_label = model.predict(sample_data)
print("Predicted label for the sample:", predicted_label)

Sample data:   (0, 9)	0.04425299476744006
  (0, 13)	0.7132675577170564
  (0, 19)	0.000932134850767199
  (0, 20)	0.6387654444616564
  (0, 21)	0.13241016798029132
  (0, 22)	1.2161715430514293
  (0, 23)	9.073283588977622
  (0, 24)	0.5557107929280188
  (0, 25)	1.168058185405552
  (0, 27)	9.664569002860093
  (0, 29)	0.5661745509715681
  (0, 30)	0.09271671009607298
  (0, 47)	0.3727886545632666
  (0, 48)	0.3100479930397591
  (0, 54)	0.2353813130935309
  (0, 55)	0.08305269344326377
  (0, 56)	0.00932047303537967
  (0, 59)	0.0032846338902572584
  (0, 60)	0.01991009068592898
  (0, 61)	0.019891913002036216
  (0, 62)	0.0036981288021930014
  (0, 64)	0.11421930987819066
  (0, 65)	0.007973110166404299
  (0, 66)	0.19511537805347068
  (0, 67)	0.016298018883405267
  (0, 68)	0.01628343781414959
  (0, 71)	0.14433465153769448
  (0, 72)	0.637925959475678
  (0, 73)	1.902894374062657
Predicted label for the sample: [1.]


# Train mô hình với DNN của Deeplearning

In [19]:
#chia dữ liệu thành tập huấn luyện và tập kiểm thử và tập validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Shape of training set: {X_train.shape}")
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print(f"Shape of training set: {X_train.shape}")




Shape of training set: (4707, 77)
Shape of training set: (3765, 77)


In [20]:
#Chuẩn hóa lại dữ liệu
from sklearn.preprocessing import StandardScaler
import joblib
scaler = StandardScaler(with_mean=False)  # with_mean=False vì dữ liệu thưa (sparse)
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
joblib.dump(scaler, 'scaler_Dense.joblib')


['scaler_Dense.joblib']

In [21]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten,Dropout


y_train = (y_train + 1) / 2  # Chuyển đổi từ -1, 1 thành 0, 1
y_val = (y_val + 1) / 2
y_test = (y_test + 1) / 2



# Define the model
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),  # First dense layer
    Dropout(0.5),
    Dense(128, activation='relu'),  # First dense layer
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),   # Second dense layer
    Dense(1, activation='sigmoid') # Output layer with sigmoid activation
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model's architecture
model.summary()

In [26]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001, restore_best_weights=True)

In [27]:
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val), callbacks=[early_stopping])
# Lưu mô hình
model.save('js_malware_classifier_nn.h5')
# model.save('js_malware_classifier_nn.h5')

Epoch 1/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9962 - loss: 0.0121 - val_accuracy: 0.9979 - val_loss: 0.0044
Epoch 2/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9970 - loss: 0.0180 - val_accuracy: 0.9979 - val_loss: 0.0069
Epoch 3/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9982 - loss: 0.0048 - val_accuracy: 0.9968 - val_loss: 0.0053
Epoch 4/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9977 - loss: 0.0064 - val_accuracy: 0.9947 - val_loss: 0.0549
Epoch 5/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9982 - loss: 0.0069 - val_accuracy: 0.9968 - val_loss: 0.0220
Epoch 6/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9973 - loss: 0.0129 - val_accuracy: 0.9968 - val_loss: 0.0077




In [28]:
#predict
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)
# Đánh giá mô hình
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Đánh giá accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Ma trận nhầm lẫn
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Báo cáo chi tiết các chỉ số

print("Classification Report:\n", classification_report(y_test, y_pred,digits=4))


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9985133795837463
Confusion Matrix:
 [[1352    3]
 [   0  663]]
Classification Report:
               precision    recall  f1-score   support

         0.0     1.0000    0.9978    0.9989      1355
         1.0     0.9955    1.0000    0.9977       663

    accuracy                         0.9985      2018
   macro avg     0.9977    0.9989    0.9983      2018
weighted avg     0.9985    0.9985    0.9985      2018



In [45]:
from tensorflow.keras.models import load_model

# Load the trained model
model = load_model('js_malware_classifier_nn.h5')

# Test the model with a sample from X_test
sample_data = X_test[5].reshape(1, -1)  # Take a sample from the test set
print(sample_data)
# Make predictions
predicted_class = model.predict(sample_data)    
print("Predict label for the sample",predicted_class)
if predicted_class > 0.5:
    predicted_class = 1
else:
    predicted_class = -1
print("Predict label for the sample",predicted_class)




  (0, 0)	1.9752583840851738
  (0, 9)	0.7526648861006205
  (0, 19)	0.0008419183616423647
  (0, 20)	0.9235163627185593
  (0, 21)	0.19160271154373543
  (0, 22)	1.7981087485110734
  (0, 23)	9.579399245514036
  (0, 24)	0.8602605408103636
  (0, 25)	1.1578202425422777
  (0, 27)	10.269491759391766
  (0, 29)	0.8791569587651178
  (0, 30)	0.15607339820665628
  (0, 33)	0.5875489484740144
  (0, 48)	0.22414552428039705
  (0, 49)	1.5282143332355211
  (0, 55)	0.20196783098900087
  (0, 56)	0.1852992483522718
  (0, 59)	0.0058770257697514485
  (0, 60)	0.6853621930973061
  (0, 61)	0.6846602179545653
  (0, 62)	0.005842751684060437
  (0, 64)	1.164110448644293
  (0, 65)	0.043120818198534513
  (0, 66)	1.4333778131373576
  (0, 67)	0.005161051479488602
  (0, 68)	0.005156281283317644
  (0, 69)	0.7970063812627514
  (0, 70)	0.796915428496701
  (0, 71)	2.1667559009526443
  (0, 72)	1.0365882433322737
  (0, 73)	2.2420468395317767
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step
Predict label f

In [46]:
# Confusion matrix
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)
#Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

CNN_metrics={
    'accuracy': accuracy_score(y_test, y_pred),
    'presision': metrics.precision_score(y_test, y_pred),
    'recall': metrics.recall_score(y_test, y_pred),
    'f1-score': metrics.f1_score(y_test, y_pred)
}
print(CNN_metrics)





[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.3280475718533201
Confusion Matrix:
 [[   0 1350    5]
 [   0    0    0]
 [   0    1  662]]


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

So Sánh Kết quả của hai phương pháp

In [25]:
from prettytable import PrettyTable
pretty_table = PrettyTable()
pretty_table.field_names = ["Model", "Accuracy","Precision","Recall","F1 Score"]
pretty_table.add_row(["SVM",svm_metrics['accuracy'],svm_metrics['presision'],svm_metrics['recall'],svm_metrics['f1-score']])
pretty_table.add_row(["CNN",CNN_metrics['accuracy'],CNN_metrics['presision'],CNN_metrics['recall'],CNN_metrics['f1-score']])

print(pretty_table)

NameError: name 'svm_metrics' is not defined