In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt 
import seaborn as sns
import random
import time
import math

In [2]:
# Load the raw dataset; the path is resolved relative to the working directory.
dataset_path = "dataset.csv"
data = pd.read_csv(dataset_path)

In [3]:
# Keep only the columns used downstream: 55 feature columns followed by the
# three label columns ('binarylabel', 'goal', 'family'). Later cells index the
# features by position, so the order of this list matters.
required_columns = [
    'Core_cyc', 'Ref_cyc', 'Instruct', 'Ins_Retd', 'ILenStal',
    'DTLBLoadMissWD', 'DTLBStoreMissW', 'DTLBStrMiss_SH', 'DTLBStrMiss_WC',
    'DTLBStrMiss_WD', 'FP_Assist_ANY', 'HW_Intrs_Rcvd', 'ICache_Misses',
    'IDQ_All_DSB_C', 'IDQ_AllMite_UO', 'L1D_P_Miss_Oc', 'L3_LAT_C_Miss',
    'M_Ld_LLCH.XS_M', 'M_Ld_LLCH.XS_N', 'MLdULLCM_LDRAM', 'M_Ld_Ret_L1Hit',
    'M_Ld_Ret_L2Hit', 'M_Ld_Ret_L3Hit', 'Loop_uops', 'Dec_uops', 'Cach_uops',
    'Uops', 'Macrofus', 'Uops_F.D.', 'res.stl.', 'uop_p0', 'uop_p1',
    'uop_p2', 'uop_p3', 'uop_p4', 'uop_p5', 'uop_p05', 'BrMispred',
    'Mov_elim', 'BrTaken', 'Mov_elim-', 'L1D_Miss', 'ITLBMissW', 'ITLBMissS',
    'L1D_Rep', 'L2ReqAll', 'L2ReqPFms', 'Load_Hit_Pre', 'BrMispExec_Any',
    'BrMispRetd_All', 'CPL_CYCLES(R0)', 'CPU_CLK_UNH_RF', 'DSB2MIT_SW_CNT',
    'DTLBLoadMiss_W', 'DTLBLoadMissWC',
    'binarylabel', 'goal', 'family',
]
data = data[required_columns]

In [4]:
# Partition the samples on the binary label: 0 selects benign_df, 1 selects malware_df.
is_benign = data['binarylabel'] == 0
is_malware = data['binarylabel'] == 1
benign_df = data.loc[is_benign]
malware_df = data.loc[is_malware]

In [5]:
# Convert each class frame into a plain list of row lists; the cells below
# index these rows by position.
benign_df_rows, malware_df_rows = (
    frame.to_numpy().tolist() for frame in (benign_df, malware_df)
)

In [6]:
#print(benign_df_rows[0])
#print(len(benign_df_rows[0]))

In [7]:
def correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Parameters
    ----------
    x, y : sequence of numbers
        Paired observations. Both sequences must have the same length.

    Returns
    -------
    float or int
        The correlation coefficient. If the denominator is zero (at least one
        input has no variance) the sentinel value ``5`` is returned instead.
        The sentinel is kept on purpose: the feature-ranking code sorts on
        ``abs(correlation(...))``, so these entries land above every genuine
        coefficient, which never exceeds 1 in magnitude.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length.
    ZeroDivisionError
        If the inputs are empty (the mean is undefined).
    """
    if len(x) != len(y):
        raise ValueError(
            "correlation() needs sequences of equal length, got %d and %d"
            % (len(x), len(y))
        )

    mean_x = sum(x)/float(len(x))
    mean_y = sum(y)/float(len(y))
    sub_x = [value - mean_x for value in x]
    sub_y = [value - mean_y for value in y]

    numerator = sum(dx*dy for dx, dy in zip(sub_x, sub_y))

    # Sums of squared deviations (not yet standard deviations).
    sum_sq_x = sum(dx**2.0 for dx in sub_x)
    sum_sq_y = sum(dy**2.0 for dy in sub_y)

    denominator = (sum_sq_x*sum_sq_y)**0.5

    # Explicit zero check instead of a bare ``except``: only the undefined
    # (zero-variance) case maps to the sentinel; any other error propagates.
    if denominator == 0:
        return 5

    return numerator/denominator

In [8]:
# Class vector aligned with "all benign rows, then all malware rows".
c_partition = [0] * len(benign_df_rows) + [1] * len(malware_df_rows)

# Same row order as c_partition, so each column lines up with its labels.
all_rows = benign_df_rows + malware_df_rows

# For each of the first 55 columns, pair |correlation with the class| with the
# column index, then order the pairs ascending by that value.
required_feature = sorted(
    [abs(correlation([row[col] for row in all_rows], c_partition)), col]
    for col in range(55)
)
    
#print(required_feature)

In [9]:
# Take 32 column indices from the ascending ranking, walking from position 52
# down to position 21. Positions 53 and 54 (the two highest-ranked entries) are
# skipped -- presumably those hold the sentinel value returned by correlation()
# for zero-variance columns; TODO confirm.
featurelist = [required_feature[52 - rank][1] for rank in range(32)]

# Project every row onto the selected columns, keeping the ranking order.
X_train_normal = [[row[col] for col in featurelist] for row in benign_df_rows]
X_train_malware = [[row[col] for col in featurelist] for row in malware_df_rows]

In [10]:
# Show the first selected-feature row of each class as a quick sanity check.
for first_row in (X_train_normal[0], X_train_malware[0]):
    print(first_row)

[871, 548376, 1352, 1123, 24271, 12463, 2429, 2736, 25506228, 25950191, 25950287, 8136008, 1539, 90507, 2374455, 37473576, 1253925, 44738745, 525908, 2919079, 5040486, 453, 38575681, 25272689, 9592709, 9050974, 17830, 3793459, 342831, 2005, 13540405, 4756]
[4830, 6796, 1, 1, 638, 605, 15532, 33, 19990591, 56928250, 56928256, 21641, 13101, 344, 6681, 38897292, 4569, 132127456, 571, 8036738, 719, 31241, 40041320, 77143, 1017, 879, 143, 14985, 45356, 0, 103180346, 0]


In [11]:
def Extract(data,k):
    """Return a new list holding the first ``k`` items of every row in ``data``.

    Parameters
    ----------
    data : list of sequences
        Rows to truncate. The list and its rows are left untouched.
    k : int
        Number of leading items to keep from each row.

    Returns
    -------
    list
        One truncated copy per input row, in the same order.
    """
    # Build fresh rows instead of assigning back into ``data``: rewriting the
    # caller's list would make a later call with a larger ``k`` silently
    # operate on already-truncated rows.
    return [row[:k] for row in data]

In [12]:
!pip install mlxtend



You should consider upgrading via the 'C:\Users\Deepthi\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [13]:
def ComputeAccuracy(y_pred, y_test):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        True labels, same shape as ``y_pred``.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by ``len(y_test)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce both sides to arrays so the comparison is element-wise even when
    # both arguments are plain lists (``list == list`` is a single bool).
    predicted = np.asarray(y_pred)
    expected = np.asarray(y_test)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"y_pred and y_test must have the same shape, "
            f"got {predicted.shape} and {expected.shape}"
        )
    match = (expected == predicted).sum()
    accuracy = match/len(expected)
    return accuracy

# BayesNet

#https://analyticsindiamag.com/a-guide-to-inferencing-with-bayesian-network-in-python/


# Naive Bayes
# https://scikit-learn.org/stable/modules/naive_bayes.html
def NaiveBayes(X_train, y_train, X_test, y_test):
    """Fit a Gaussian Naive Bayes model on the training data and return its
    accuracy on the test data, as computed by ComputeAccuracy."""
    model = GaussianNB()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return ComputeAccuracy(predictions, y_test)

# https://www.datacamp.com/tutorial/understanding-logistic-regression-python
def Logistic(X_train, y_train, X_test, y_test):
    """Fit a default LogisticRegression on the training data and return its
    accuracy on the test data, as computed by ComputeAccuracy."""
    # fit() returns the estimator itself, so the calls can be chained.
    predictions = LogisticRegression().fit(X_train, y_train).predict(X_test)
    return ComputeAccuracy(predictions, y_test)

# Multi-layer Perceptron (MLP)
# https://scikit-learn.org/stable/modules/neural_networks_supervised.html#:~:text=Multi%2Dlayer%20Perceptron%20(MLP),number%20of%20dimensions%20for%20output.
def MLP(X_train, y_train, X_test, y_test):
    """Fit a multi-layer perceptron with two hidden layers of 100 units each
    (lbfgs solver, fixed random_state) and return its accuracy on the test data."""
    network = MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        hidden_layer_sizes=(100, 100),
        random_state=1,
    )
    predictions = network.fit(X_train, y_train).predict(X_test)
    return ComputeAccuracy(predictions, y_test)

# SGD : Stochastic Gradient Descent
# https://scikit-learn.org/stable/modules/sgd.html
def SGD(X_train, y_train, X_test, y_test):
    """Fit a linear classifier trained with stochastic gradient descent
    (hinge loss, L2 penalty, at most 100 iterations) and return its accuracy
    on the test data, as computed by ComputeAccuracy.

    ``random_state`` is fixed so repeated runs on the same input give the
    same model; the value 1 matches the seed used by MLP.
    """
    clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=100, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = ComputeAccuracy(y_pred, y_test)
    return accuracy

# Simple Logistic 



# SMO 
# https://www.codeproject.com/Articles/1267445/An-Introduction-to-Support-Vector-Machine-SVM-and


# JRIP


# OneR Classifier
# https://rasbt.github.io/mlxtend/user_guide/classifier/OneRClassifier/
from mlxtend.classifier import OneRClassifier
def OneR(X_train, y_train, X_test, y_test):
    """Fit mlxtend's OneRClassifier on the training data and return its
    accuracy on the test data, as computed by ComputeAccuracy.

    The inputs are passed to the classifier unchanged.
    """
    oner = OneRClassifier()
    oner.fit(X_train, y_train)
    # The earlier predict(X_train) call was dropped: its result was never used.
    y_pred = oner.predict(X_test)
    accuracy = ComputeAccuracy(y_pred, y_test)
    return accuracy

# Decision Tree based classifiers
# https://www.datacamp.com/tutorial/decision-tree-classification-python
from sklearn.tree import DecisionTreeClassifier
def DecisionTree(X_train, y_train, X_test, y_test):
    """Fit a decision tree on the training data and return its accuracy on
    the test data, as computed by ComputeAccuracy.

    ``random_state`` is fixed because the tree builder permutes features at
    each split; without a seed, ties between equally good splits can resolve
    differently from run to run. The value 1 matches the seed used by MLP.
    """
    clf = DecisionTreeClassifier(random_state=1)
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = ComputeAccuracy(y_pred, y_test)
    return accuracy

In [14]:
def ExtractData(X_normal, X_malware, k):
    """Build shuffled train/test sets from the first ``k`` features of each class.

    Each class is split 70% / 30% on its own (``random_state=1``), the two
    training parts and the two test parts are concatenated, and each combined
    set is then shuffled with the global ``random`` module. Call
    ``random.seed(...)`` beforehand for a reproducible row order.

    Parameters
    ----------
    X_normal : list of lists
        Feature rows of the class labelled 0. Not modified.
    X_malware : list of lists
        Feature rows of the class labelled 1. Not modified.
    k : int
        Number of leading features to keep from every row.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as Python lists.
    """
    # Hand Extract shallow copies so the caller's lists are never rewritten;
    # otherwise a call with a small k would truncate the data seen by every
    # later call with a larger k.
    normal_rows = Extract(list(X_normal), k)
    malware_rows = Extract(list(X_malware), k)

    y_normal = [0] * len(normal_rows)
    y_malware = [1] * len(malware_rows)

    # 70% training and 30% test, split per class so both sets keep the class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        normal_rows, y_normal, test_size=0.3, random_state=1)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(
        malware_rows, y_malware, test_size=0.3, random_state=1)
    X_train.extend(X_train2)
    y_train.extend(y_train2)
    X_test.extend(X_test2)
    y_test.extend(y_test2)

    # Shuffle index lists (train first, then test) and reorder features and
    # labels with the same permutation so they stay aligned.
    train_order = list(range(len(X_train)))
    test_order = list(range(len(X_test)))
    random.shuffle(train_order)
    random.shuffle(test_order)

    X_train_new = [X_train[idx] for idx in train_order]
    y_train_new = [y_train[idx] for idx in train_order]
    X_test_new = [X_test[idx] for idx in test_order]
    y_test_new = [y_test[idx] for idx in test_order]

    return X_train_new, y_train_new, X_test_new, y_test_new

In [32]:
def ApplyClassifiers(X_train, y_train, X_test, y_test):
    """Run every enabled classifier on the same split and report its accuracy.

    Each classifier is trained on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)``; one line per classifier is printed.

    Returns
    -------
    list
        Accuracies in evaluation order: Naive Bayes, Logistic Regression,
        Multilayer Perceptron, Stochastic Gradient Descent, Decision Tree.
    """
    # (display name, evaluation function) pairs, in reporting order.
    # OneR is currently disabled and therefore absent from this table.
    classifiers = (
        ("Naive Bayes", NaiveBayes),
        ("Logistic Regression", Logistic),
        ("Multilayer Perceptron", MLP),
        ("Stochastic Gradient Descent", SGD),
        ("Decision Tree", DecisionTree),
    )

    accuracies = []
    for label, evaluate in classifiers:
        score = evaluate(X_train, y_train, X_test, y_test)
        accuracies.append(score)
        print(f"{label}: {score}")

    return accuracies

In [33]:
# Build the split from all 32 selected features, then score every classifier on it.
dataset_split = ExtractData(X_train_normal, X_train_malware, 32)
X_train, y_train, X_test, y_test = dataset_split
accuracies = ApplyClassifiers(*dataset_split)

Naive Bayes: 0.5035466173633116
Logistic Regression: 0.5625091310283679
Multilayer Perceptron: 0.5627836916054992




Stochastic Gradient Descent: 0.6225043954881385
Decision Tree: 0.6179199895213577


In [34]:
# Show the collected accuracies, in the order ApplyClassifiers appended them.
print(accuracies)

[0.5035466173633116, 0.5625091310283679, 0.5627836916054992, 0.6225043954881385, 0.6179199895213577]
