In [6]:
import pandas as pd
from confidence_learning import ConfidenceLearning, rf_proba, dt_proba, knn_proba

In [7]:
# load dataset into pandas dataframe
df = pd.read_csv("Dataset.csv")

In [8]:
df.columns

Index(['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
       'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls',
       'dlllist.avg_dlls_per_proc', 'handles.nhandles',
       'handles.avg_handles_per_proc', 'handles.nport', 'handles.nfile',
       'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
       'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
       'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
       'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
       'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
       'ldrmodules.not_in_mem_avg', 'malfind.ninjections',
       'malfind.commitCharge', 'malfind.protection',
       'malfind.uniqueInjections', 'psxview.not_in_pslist',
       'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
       'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
       'psxview.not_in_session', 'psxview.not_in_deskthrd',
       'psxview.not_in_p

In [9]:
# I want to drop the category column

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

warnings.filterwarnings("ignore")


results_df = pd.DataFrame()
main_df = df.copy()
temp_df = pd.DataFrame()
inverted_df = pd.DataFrame()
testdf = pd.DataFrame()
# change class name from benign and malware to 0 and 1
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]
sampled_benign = df_benign.sample(n=int(len(df_benign) * 0.2))
sampled_malicious = df_malicious.sample(n=int(len(df_malicious) * 0.2))
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
main_df = main_df.drop(testdf.index)
main_df = main_df.reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]
n1stpercentile = int(len(df_malicious) * 0.02)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]
# Initialize logistic regression classifier for self-training
logreg = ConfidenceLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
# Initialize random forest classifier for self-training
rf = ConfidenceLearning(RandomForestClassifier(), "rf", lambda model, x: rf_proba(model, x))
# Initialize decision tree classifier for self-training
dt = ConfidenceLearning(DecisionTreeClassifier(), "dt", lambda model, x: dt_proba(model, x))
# Initialize K-nearest neighbors classifier for self-training
knn = ConfidenceLearning(
    KNeighborsClassifier(n_neighbors=(int)(sqrt(len(df)) / 2)),
    "knn",
    lambda model, x: knn_proba(model, x),
)
# Initialize naive bayes classifier for self-training
nb = ConfidenceLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
# Initialize support vector machine classifier for self-training
svm = ConfidenceLearning(SVC(probability=True), "svm", lambda model, x: model.predict_proba(x))
# Initialize list of models
models = [logreg]
for i in range(0, 40):
    temp_df = pd.DataFrame()
    merged_df = pd.concat([main_df, inverted_df], axis=0)

    # Train and evaluate each model
    for model in models:
        print(model.name)
        model.fit(
            (merged_df.drop("Class", axis=1)).values,
            merged_df["Class"].values,
        )
        poisoned = merged_df["Class"].values.copy()
        poisonedacc = model.accuracymeasure(xtest, ytest)
        print(poisonedacc)
        y_data = model.label_heal(
            merged_df.drop("Class", axis=1).values, merged_df["Class"].values
        )
        model.fit(
            merged_df.drop("Class", axis=1).values,
            y_data,
        )
        print(model.c)

        model.datarecorder(
            old_main_y,
            y_data,
            poisoned,
            xtest,
            ytest,
            (i * n1stpercentile*2),
            poisonedacc,
        )

    sampled_malware = df_malicious.sample(n=n1stpercentile)
    sampled_malware["Class"] = 0
    sampled_benign = df_benign.sample(n=n1stpercentile)
    sampled_benign["Class"] = 1
    temp_df = pd.concat([sampled_malware,sampled_benign], axis=0)
    sampled_indices = sampled_malware.index
    main_df = main_df.drop(sampled_indices)
    df_malicious = df_malicious.drop(sampled_indices)
    sampled_indices = sampled_benign.index
    main_df = main_df.drop(sampled_indices)
    df_benign = df_benign.drop(sampled_indices)
    inverted_df = pd.concat([inverted_df, temp_df], axis=0)
    print("Iteration: ", i)
for model in models:
    model.df.to_csv(model.name + " confidence.csv", index=False)

logreg
0.9959037378392217
[9.97498599e-01 2.19228069e-02 8.81415997e-09 ... 3.39539494e-10
 6.45983090e-08 2.75460830e-03]
[0.0025014  0.03670243 0.01540542 ... 0.00028476 0.00041191 0.00040845]
0.9604740637066247
Iteration:  0
logreg
0.994026284348865
[0.95637427 0.07958085 0.00601555 ... 0.98536681 0.97858221 0.98231757]
[0.04362573 0.12705356 0.11242123 ... 0.99762924 0.98496089 0.82261405]
0.9493819411580672
Iteration:  1
logreg
0.9930022188086705
[0.25893814 0.02120484 0.53546134 ... 0.95520491 0.95272984 0.95161822]
[0.14692377 0.16787886 0.14357815 ... 0.94189472 0.70756514 0.97783945]
0.9527926689590741
Iteration:  2
logreg
0.9910394265232975
[0.26068185 0.05031097 0.63706725 ... 0.9892537  0.88794679 0.90452375]
[0.14820344 0.16046982 0.14531437 ... 0.84955132 0.88471907 0.94709159]
0.9476343684791446
Iteration:  3
logreg
0.9918928144734597
[0.31859402 0.05956698 0.06356502 ... 0.91948708 0.87816638 0.88397387]
[0.17729433 0.1937521  0.17542642 ... 0.9752984  0.6900731  0.9665