In [None]:
import warnings
import os
import shutil  
import pandas as pd
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## <font color = blue>Part A: Pre-processing Binary files to a dataset</font>

### 0. Set paths and random seed

In [None]:
# Seed passed as random_state to the train/test splits and to the tree-based model builders below.
random_seed = 1099
# Input folder handed to proc.generateBlocksFromBinary.
raw_dir = '../raw'
# Folder shared by proc.generateBlocksFromBinary and ex.createTestID.
block_dir = '../blocks'
# Per-class dataset folders consumed by ex.createTestID and ex.extractInstructionsAndOps.
data_good = '../dataset/good'
data_bad = '../dataset/bad'

### 1. Process binary files to basic blocks

In [None]:
import ProcessBinary as proc

In [None]:
# Run the ProcessBinary helper on raw_dir, with block_dir as its second argument
# (presumably the output folder for the generated basic blocks — confirm in ProcessBinary).
proc.generateBlocksFromBinary(raw_dir, block_dir)   

### 2. Create test IDs and move blocks into the "dataset/good" or "dataset/bad" folder

In [None]:
import ExtractInstructions as ex
import ExtractFeatures as ef

In [None]:
# Per the section heading, this creates test IDs and distributes the blocks in block_dir
# into the good/bad dataset folders; the logic lives in ExtractInstructions (not shown here).
ex.createTestID(block_dir,data_good,data_bad)

### 3. Extract instructions and operands from basic blocks

In [None]:
# Load the combined-operand definitions from a text file; the result is passed to both extraction calls.
combine_type = ex.getCombinedOperands("../other_files/Combined_Operands.txt")
# Run the same extraction over the "good" and "bad" folders. Each call returns a pair:
# a filename collection and the extracted instructions (exact types are defined in ExtractInstructions).
# NOTE(review): set_filename_good / set_filename_bad are not used again in the visible notebook.
set_filename_good, good_instr = ex.extractInstructionsAndOps(data_good, combine_type)
set_filename_bad, bad_instr = ex.extractInstructionsAndOps(data_bad, combine_type)

### 4. Create target labels (0/1) and combine good and bad function blocks

In [None]:
# Merge the good and bad instruction sets into three results:
# ids, the text corpus fed to the vectorizer, and the target labels exported later.
id_combined, corpus_combined, target_combined = ef.createTrainSetandLabel(good_instr, bad_instr)

### 5. Extract features using a count vectorizer

In [None]:
# Vectorize the combined corpus; returns the count array and the matching feature names.
counter_arr, features_name = ef.countVectorizer(corpus_combined)

In [None]:
# Report the size of the extracted vocabulary, then dump the feature names and the count array.
# NOTE(review): the last two prints emit the full objects; consider slicing if they are large.
print("\nFeature extraction: ")
print("\nTotal number of features: ",len(features_name))
print(features_name)
print(counter_arr)

### 6. Export count vectors with features to an Excel file: "TrainData.xlsx"

In [None]:
# Write features, counts, ids and targets to the Excel file that Part B reads back in.
ef.exportToExcel("../other_files/TrainData.xlsx", features_name, counter_arr, id_combined, target_combined)

---

## <font color = blue>Part B: Pre-processing dataset</font>

### 1. Import data from an Excel file into a DataFrame

In [None]:
# Load the dataset exported at the end of Part A; the first spreadsheet column becomes the index.
df = pd.read_excel('../other_files/TrainData.xlsx', index_col=0)  

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

### 2. Split the dataset into training, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Label vector, and feature matrix without the id column, the label and 'constant_op3'.
y_c = df['target']
X_c = df.drop(columns=['fname', 'target', 'constant_op3'])

# Stage 1: hold out 10% of all rows as the test set, stratified on the label.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c, y_c,
    test_size=0.10,
    stratify=y_c,
    random_state=random_seed,
)

# Stage 2: carve 30% of the remaining rows out as the validation set, again stratified.
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(
    X_train_c, y_train_c,
    test_size=0.30,
    stratify=y_train_c,
    random_state=random_seed,
)

In [None]:
# Class distribution of each split: absolute counts and normalized fractions.
cnt_train = y_train_c.value_counts()
per_train = y_train_c.value_counts(normalize=True)
cnt_val = y_val_c.value_counts()
per_val = y_val_c.value_counts(normalize=True)
cnt_test = y_test_c.value_counts()
per_test = y_test_c.value_counts(normalize=True)

# One row per class label, one column per split statistic.
split_styler = pd.DataFrame({'Train_Count': cnt_train,'Val_Count': cnt_val,'Test_Count': cnt_test, 'Train_Percent': per_train, 'Val_Percent': per_val, 'Test_Percent': per_test}).style

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; Styler.hide(axis="index")
# is its replacement. Fall back to the old method so the cell still runs on pandas < 1.4.
# NOTE(review): the hidden index is the class label produced by value_counts(), so the rendered
# table no longer shows which row belongs to which class — confirm this is intended.
split_styler.hide(axis="index") if hasattr(split_styler, "hide") else split_styler.hide_index()

### 3. Normalize data with z-score

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the z-score statistics on the training split only, so validation/test data
# never influence the scaling parameters.
ss = StandardScaler().fit(X_train_c)

# Apply the same fitted scaler to all three splits.
X_train_norm_c, X_val_norm_c, X_test_norm_c = (
    ss.transform(split) for split in (X_train_c, X_val_c, X_test_c)
)

### 4. Compute class weights

In [None]:
import numpy as np
from sklearn.utils import class_weight
# Balanced class weights computed from the training labels.
# `classes` and `y` are keyword-only in scikit-learn >= 1.0 (positional use raises TypeError);
# the keyword form is also accepted by older releases.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_c),
    y=y_train_c,
)
print(class_weights)

---

## <font color = blue>Part C: Machine Learning Models</font>

In [None]:
import MLModels as ml
import MLModelsEvaluation as mle

### 0. Set Hyperparameters for tuning models

In [None]:
# Candidate values handed to the grid-search builders in MLModels.
# The abbreviations are not spelled out here — presumably MS = min samples, ET = number of
# estimators (trees), MD = max depth, CW = class weight, CG = SVM C/gamma; confirm in MLModels.
MS_list = [30, 50, 100, 200]
# Only passed to the random-forest builder.
ET_list = [100, 200, 300]
MD_list = [20, 40, 60, 80, 100]
# Single hard-coded class-weight mapping (the same literal is repeated in the SVM and DL cells).
CW_list = [ {0:0.71, 1:1.64} ]
# NOTE(review): CG_list is not referenced anywhere else in the visible notebook.
CG_list = [ 0.0001, 0.001, 0.01, 0.1, 1, 10 ,100, 1000]

### 1. Decision tree

In [None]:
#1.1 Build the decision tree via grid-search CV over the MS/MD/CW candidate lists
# (the search itself is implemented in MLModels.buildDecisionTreeGS, not shown here).
dt = ml.buildDecisionTreeGS(random_seed, X_train_c, y_train_c, MS_list, MD_list, CW_list)
# Display the returned estimator.
dt

In [None]:
#1.2 Fit the model on the un-normalized training features
dt.fit(X_train_c, y_train_c)

In [None]:
#1.2 Predict the target class for the training and validation splits
pred_train = dt.predict(X_train_c)
pred_val = dt.predict(X_val_c)

In [None]:
#1.3 Evaluate the model: train vs. validation accuracy, then validation-set metrics
print("Accuracy Scores (Train): ",mle.computeAccuracy(y_train_c, pred_train))
print("Accuracy Scores (Validation): ",mle.computeAccuracy(y_val_c, pred_val))
mle.showMetrics("Confusion Matrix for Best Decision Tree", y_val_c, pred_val)

In [None]:
#1.4 Compute feature importances from the decision tree
# First argument is 20 — presumably the number of top features to keep; confirm in MLModels.
top_features_dt = ml.getImportantFeatures(20,X_train_c, dt)
print(top_features_dt)
ml.plotImportantFeatures(top_features_dt)

### 2. Random Forest

In [None]:
#2.1 Build the random forest via grid-search CV over the ET/MS/MD/CW candidate lists
# (the search itself is implemented in MLModels.buildRandomForestGS, not shown here).
rf =ml.buildRandomForestGS(random_seed, X_train_c, y_train_c,ET_list, MS_list, MD_list, CW_list)
# Display the returned estimator.
rf

In [None]:
#2.2 Fit the model on the un-normalized training features
rf.fit(X_train_c, y_train_c)

In [None]:
#2.3 Predict the target class for the training and validation splits
pred_train_rf = rf.predict(X_train_c)
pred_val_rf = rf.predict(X_val_c)

In [None]:
#2.3 Evaluate the model: train vs. validation accuracy, then validation-set metrics
print("Accuracy Scores (Train): ",mle.computeAccuracy(y_train_c, pred_train_rf))
print("Accuracy Scores (Validation): ",mle.computeAccuracy(y_val_c, pred_val_rf))
mle.showMetrics("Confusion Matrix for Best Random Forest", y_val_c, pred_val_rf)

In [None]:
#2.4 Compute feature importances from the random forest
# First argument is 20 — presumably the number of top features to keep; confirm in MLModels.
top_features_rf = ml.getImportantFeatures(20,X_train_c, rf)
print(top_features_rf)
ml.plotImportantFeatures(top_features_rf)

### 3. SVM

In [None]:
#3.1 Build the best SVM from fixed hyperparameters (no grid search in this cell)
# Arguments are presumably C=100 and gamma=0.01 — confirm against MLModels.buildCustomSVM.
# The class-weight dict repeats the literal stored in CW_list.
svm = ml.buildCustomSVM(100, 0.01, {0: 0.71, 1: 1.64} )
# Display the returned estimator.
svm

In [None]:
#3.2 Fit the model on the z-score-normalized training features (the tree models use the raw features)
svm.fit(X_train_norm_c, y_train_c)

In [None]:
#3.3 Predict class labels on the normalized training and validation splits
# NOTE(review): pred_train / pred_val overwrite the decision-tree predictions of the same name,
# so re-running the decision-tree evaluation cell after this one would score the SVM instead.
pred_train = svm.predict(X_train_norm_c)
pred_val = svm.predict(X_val_norm_c)

In [None]:
#3.4 Evaluate the model: train vs. validation accuracy, then validation-set metrics
print("Accuracy Scores (Train): ",mle.computeAccuracy(y_train_c, pred_train))
print("Accuracy Scores (Validation): ",mle.computeAccuracy(y_val_c, pred_val))
mle.showMetrics("Confusion Matrix for Best SVM", y_val_c, pred_val)

---

## <font color = blue>Part D: Deep Learning Models</font>

In [None]:
import DLModels as dl
import DLModelsEvaluation as dle
import DLModelsThresholds as dlt

## 1. Reshape the input to 3D

In [None]:
# Reshape the normalized feature arrays for the DL model (3-D, per the section heading).
X_train_DL, X_val_DL, X_test_DL = dl.reshape_X(X_train_norm_c, X_val_norm_c, X_test_norm_c)
# Reshape the label vectors; the target layout is defined in DLModels.reshape_Y (not shown here).
y_train_dl, y_val_dl, y_test_dl = dl.reshape_Y(y_train_c, y_val_c, y_test_c)

In [None]:
# Show the reshaped dimensions of the train / validation / test inputs, one per line.
print(X_train_DL.shape, X_val_DL.shape, X_test_DL.shape, sep="\n")

## 2. Compute class weight

In [None]:
# Compute class_weights
from sklearn.utils import class_weight
# Balanced class weights computed from the training labels.
# `classes` and `y` are keyword-only in scikit-learn >= 1.0 (positional use raises TypeError);
# the keyword form is also accepted by older releases.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_c),
    y=y_train_c,
)
print(class_weights)

## 3. Build Model

In [None]:
# Build the network from the size of the third axis of X_train_DL and two activation names
# (presumably hidden-layer and output-layer activations — confirm in DLModels.buildDLmodel).
model = dl.buildDLmodel(X_train_DL.shape[2], 'tanh', 'sigmoid')

## 4. Compile and Fit model

In [None]:
#4.1 Compile the model
# The second argument (0.001) is presumably the initial learning rate — confirm in DLModels.compileDLModel.
compiled_model = dl.compileDLModel(model,0.001)

In [None]:
#4.2 Define the learning-rate schedule
import tensorflow as tf

def scheduler(epoch, lr):
    """Return the learning rate to use for the given epoch.

    Args:
        epoch: Zero-based epoch index supplied by the scheduler callback.
        lr: Learning rate currently in effect.

    Returns:
        ``lr`` unchanged while ``epoch`` is below 30, and the fixed value 0.001 afterwards.
    """
    # NOTE(review): if the optimizer already starts at 0.001, this schedule never changes
    # the rate — confirm the intended behaviour after epoch 30.
    return lr if epoch < 30 else 0.001
    
# Wrap the schedule in a Keras callback; it is passed to dl.fitDLmodel in the next cell.
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
#4.3 Fit the model and keep the training history
# 32 and 50 are presumably batch size and number of epochs — confirm in DLModels.fitDLmodel.
# The class-weight dict repeats the literal stored in CW_list.
history, mymodel = dl.fitDLmodel(compiled_model, X_train_DL, y_train_dl, X_val_DL, y_val_dl, 32, 50, {0:0.71, 1:1.64},callback)
mymodel.summary()

## 5. Evaluate Models

In [None]:
#5.1 Plot the loss curves from the training history
# 0 and 0.7 are presumably the y-axis limits — confirm in DLModelsEvaluation.plotLoss.
dle.plotLoss(history, 0, 0.7)

In [None]:
#5.2 Plot the accuracy curves from the training history
# 0.5 and 1 are presumably the y-axis limits — confirm in DLModelsEvaluation.plotAccuracy.
dle.plotAccuracy(history, 0.5, 1)

In [None]:
# Score the validation set, then compare accuracy and F1 across candidate decision thresholds.
preds= mymodel.predict(X_val_DL)
Thresholds = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75]
# Uses the original label series (y_val_c), not the reshaped y_val_dl.
dlt.plotAccuracyandF1(Thresholds,y_val_c, preds)

## 6. Evaluate the Best Deep Learning Model (Test Set)

In [None]:
#6.2 Evaluate the best model on the held-out test set
# `preds` is reassigned here and now holds test-set scores instead of validation scores.
preds= mymodel.predict(X_test_DL)
# 0.45 is the decision threshold — presumably picked from the threshold plot above; confirm.
predicted_class  = dlt.getPredictedLabel(preds, 0.45)
print("Accuracy Scores (Test): ",dle.computeAccuracy(y_test_c, predicted_class))
dle.showMetrics("Confusion Matrix for DL", y_test_c, predicted_class)