In [6]:
import pandas as pd
from pu_learning import PuLearning, rf_proba, dt_proba, knn_proba

In [7]:
# Load the dataset from "Dataset.csv" (path relative to the working directory)
# into a pandas DataFrame. Later cells read the "Class" column from this frame.
df = pd.read_csv("Dataset.csv")

In [8]:
# Display the column labels of the loaded frame.
df.columns

Index(['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
       'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls',
       'dlllist.avg_dlls_per_proc', 'handles.nhandles',
       'handles.avg_handles_per_proc', 'handles.nport', 'handles.nfile',
       'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
       'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
       'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
       'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
       'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
       'ldrmodules.not_in_mem_avg', 'malfind.ninjections',
       'malfind.commitCharge', 'malfind.protection',
       'malfind.uniqueInjections', 'psxview.not_in_pslist',
       'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
       'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
       'psxview.not_in_session', 'psxview.not_in_deskthrd',
       'psxview.not_in_p

In [9]:
# TODO: drop the category column (this cell contains no code yet).

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

# Silence all library warnings so the per-round training log stays readable.
# NOTE(review): this filter is kernel-wide and also hides convergence and
# deprecation warnings; narrow it if those matter.
warnings.filterwarnings("ignore")

# Fixed seed so the hold-out split is identical on every Restart & Run All.
SPLIT_SEED = 42
TEST_FRACTION = 0.2  # share of each class moved into the hold-out set
POISON_STEP_FRACTION = 0.02  # share of training malware relabelled per round

results_df = pd.DataFrame()  # NOTE(review): not written to anywhere in this cell
temp_df = pd.DataFrame()
# Accumulates the malware rows whose label has been flipped to benign.
inverted_df = pd.DataFrame()

# Encode the class label as 0 (benign) / 1 (malware) on a copy, so the raw
# frame `df` is left untouched and this cell can be re-run safely.
main_df = df.copy()
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
if main_df["Class"].isna().any():
    # .map() turns every label missing from the mapping into NaN; fail here
    # with a clear message instead of deep inside a classifier's fit().
    raise ValueError(
        "Class column contains labels other than 'Benign' and 'Malware'"
    )

df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Stratified hold-out: the same fraction is drawn from each class.
sampled_benign = df_benign.sample(
    n=int(len(df_benign) * TEST_FRACTION), random_state=SPLIT_SEED
)
sampled_malicious = df_malicious.sample(
    n=int(len(df_malicious) * TEST_FRACTION), random_state=SPLIT_SEED
)
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]

# Remove the hold-out rows from the training pool and renumber it from 0, so
# the index labels used from here on refer to training rows only.
main_df = main_df.drop(testdf.index).reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Snapshot of the clean training set; its labels are the ground truth that the
# healed labels are later passed alongside.
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]

# Number of malware rows relabelled as benign in each poisoning round.
n1stpercentile = int(len(df_malicious) * POISON_STEP_FRACTION)
# Seed for the estimators that use randomness internally, so repeated runs
# build the same models.
MODEL_SEED = 42

# k for KNN: half the square root of the row count of the full dataset `df`,
# truncated to an int and floored at 1 because n_neighbors must be positive.
knn_k = max(1, int(sqrt(len(df)) / 2))

# Each PuLearning wrapper receives: the estimator, a short name (printed during
# training and used as the CSV file stem), and a callable (model, x) that
# produces the probability output for that estimator.
# NOTE(review): the exact shape expected from the callable is defined in
# pu_learning, which is not visible here.

# Logistic regression
logreg = PuLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
# Random forest
rf = PuLearning(
    RandomForestClassifier(random_state=MODEL_SEED),
    "rf",
    lambda model, x: rf_proba(model, x),
)
# Decision tree
dt = PuLearning(
    DecisionTreeClassifier(random_state=MODEL_SEED),
    "dt",
    lambda model, x: dt_proba(model, x),
)
# K-nearest neighbours
knn = PuLearning(
    KNeighborsClassifier(n_neighbors=knn_k),
    "knn",
    lambda model, x: knn_proba(model, x),
)
# Gaussian naive Bayes
nb = PuLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
# Support vector machine (probability=True enables predict_proba)
svm = PuLearning(
    SVC(probability=True, random_state=MODEL_SEED),
    "svm",
    lambda model, x: model.predict_proba(x),
)

# Models actually trained below; `nb` and `svm` are constructed but not included.
models = [logreg, rf, dt, knn]
# Base seed for the per-round poisoning sample; round i uses POISON_SEED + i so
# every round draws a different but reproducible batch.
POISON_SEED = 42
N_ROUNDS = 40

for i in range(N_ROUNDS):
    # Training set for this round: the still-clean rows followed by every
    # malware row relabelled so far. The accumulator is skipped while empty
    # because concatenating an empty frame is deprecated in pandas.
    round_frames = [main_df] if inverted_df.empty else [main_df, inverted_df]
    merged_df = pd.concat(round_frames, axis=0)

    # Built once per round instead of once per call.
    # NOTE(review): this assumes the PuLearning methods do not modify the
    # feature matrix in place; label arrays are passed as copies to be safe.
    x_train = merged_df.drop("Class", axis=1).values
    y_poisoned = merged_df["Class"].values

    # Ground truth in the same row order as merged_df. merged_df lists clean
    # rows first and flipped rows last, whereas old_main_y is in original row
    # order, so passing old_main_y as-is would pair labels of different rows
    # from round 1 onward.
    y_true = old_main_y.loc[merged_df.index]

    # Train and evaluate each model
    for model in models:
        print(model.name)
        model.fit(x_train, y_poisoned.copy())
        poisoned = y_poisoned.copy()
        poisonedacc = model.accuracymeasure(xtest, ytest)
        print(poisonedacc)
        # Relabel the training set, then refit on the healed labels.
        y_data = model.label_heal(x_train, y_poisoned.copy())
        model.fit(x_train, y_data)
        print(model.c)

        model.datarecorder(
            y_true,
            y_data,
            poisoned,
            xtest,
            ytest,
            (i * n1stpercentile),  # malware rows flipped so far
            poisonedacc,
        )

    # Poison the next round: relabel a fresh batch of still-clean malware rows
    # as benign and move them from the clean pool to the accumulator.
    sampled_malware = df_malicious.sample(
        n=n1stpercentile, random_state=POISON_SEED + i
    ).assign(Class=0)
    temp_df = sampled_malware  # name kept in case other cells reference it
    sampled_indices = sampled_malware.index
    main_df = main_df.drop(sampled_indices)
    df_malicious = df_malicious.drop(sampled_indices)
    if inverted_df.empty:
        inverted_df = sampled_malware
    else:
        inverted_df = pd.concat([inverted_df, sampled_malware], axis=0)
    print("Iteration: ", i)

# Write each model's recorded results to "<model name>.csv".
for model in models:
    model.df.to_csv(model.name + ".csv", index=False)

logreg
0.9953917050691244
[9.97107388e-01 3.11217883e-09 3.83178572e-03 ... 1.13118711e-07
 2.57047624e-03 6.58442950e-14]
0.9598178435527382
rf
0.9998293224099676
[0. 0. 0. ... 0. 0. 0.]
0.9974806515835997
dt
0.9998293224099676
[4.40392443e-07 4.40392443e-07 4.40392443e-07 ... 4.40392443e-07
 4.40392443e-07 4.40392443e-07]
0.9999388387812994
knn
0.9904420549581839
[0. 0. 0. ... 0. 0. 0.]
0.9484889388477387
Iteration:  0
logreg
0.9911247653183137
[9.57481662e-01 2.32926170e-05 5.42591595e-02 ... 9.75334542e-01
 9.76585156e-01 9.29252070e-01]
0.9636761206222969
rf
0.998805256869773
[0.   0.   0.   ... 0.35 0.37 0.42]
0.9943234736534945
dt
0.997610513739546
[4.40392443e-07 4.40392443e-07 4.40392443e-07 ... 9.80392157e-03
 9.80392157e-03 9.80392157e-03]
0.9977897910764273
knn
0.9905273937532002
[0.         0.         0.         ... 0.96694215 0.99173554 0.97520661]
0.9493768860857899
Iteration:  1
logreg
0.9914661204983786
[9.48017625e-01 4.63203130e-05 2.49555744e-02 ... 3.95483515e-01
 

In [11]:
# Display results_df.
# NOTE(review): results_df is created as an empty DataFrame in the training
# cell and is not assigned to again in the visible cells; the per-model
# results are written from model.df instead. Confirm this is intended.
results_df

In [None]:
# Write results_df to "resultstrainingonly2.csv" without the index column.
# NOTE(review): results_df is never populated in the visible cells, so this
# would write an empty file; confirm which frame was meant to be saved.
results_df.to_csv("resultstrainingonly2.csv", index=False)

In [None]:
# (rows, columns) of the loaded dataset, shown as the cell's output value.
df.shape

In [None]:
# Re-read "Dataset.csv" into `df`, replacing the frame loaded earlier.
# NOTE(review): this repeats the load cell near the top of the notebook and
# looks like a leftover; confirm whether it is still needed.
df = pd.read_csv("Dataset.csv")