In [2]:
import pandas as pd
from pu_learning import PuLearning, rf_proba, dt_proba, knn_proba

In [3]:
# Read the raw dataset from the CSV file into a DataFrame named `df`.
df = pd.read_csv(filepath_or_buffer="Dataset.csv")

In [4]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
       'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls',
       'dlllist.avg_dlls_per_proc', 'handles.nhandles',
       'handles.avg_handles_per_proc', 'handles.nport', 'handles.nfile',
       'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
       'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
       'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
       'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
       'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
       'ldrmodules.not_in_mem_avg', 'malfind.ninjections',
       'malfind.commitCharge', 'malfind.protection',
       'malfind.uniqueInjections', 'psxview.not_in_pslist',
       'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
       'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
       'psxview.not_in_session', 'psxview.not_in_deskthrd',
       'psxview.not_in_p

In [5]:
# TODO: drop the category column here. This cell currently contains no code, so nothing is dropped.

In [6]:
# Number of poisoning rounds each experiment loop runs.
n_loops = 40
# Fraction of the training malware rows that is relabelled as benign per round.
percentile = 0.02

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

warnings.filterwarnings("ignore")


# Label-flipping experiment: train on progressively poisoned labels, let
# PuLearning "heal" them, and record the results for the selected model(s).

# Reproducibility: a fixed seed makes every model cell draw the same test split
# and the same poisoning schedule, so the per-model result CSVs are comparable.
RANDOM_STATE = 42
TEST_FRACTION = 0.2  # share of each class held out as the test set
FIT_ARG = 0.10  # third positional argument of PuLearning.fit (semantics live in pu_learning)

results_df = pd.DataFrame()  # unused in this cell; kept in case other cells read it
main_df = df.copy()
inverted_df = pd.DataFrame()  # malware rows relabelled as benign so far

# Encode the string labels as integers: Benign -> 0, Malware -> 1.
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
if main_df["Class"].isna().any():
    # .map() silently produces NaN for labels missing from the mapping.
    raise ValueError('Unexpected label in "Class": expected only "Benign" or "Malware"')

df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Stratified hold-out: the same fraction of each class goes to the test set.
sampled_benign = df_benign.sample(
    n=int(len(df_benign) * TEST_FRACTION), random_state=RANDOM_STATE
)
sampled_malicious = df_malicious.sample(
    n=int(len(df_malicious) * TEST_FRACTION), random_state=RANDOM_STATE
)
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]

# Training pool: everything not in the test set, re-indexed 0..N-1.
main_df = main_df.drop(testdf.index).reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Clean (un-poisoned) copy of the training labels, used as ground truth.
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]

# Number of malware rows flipped to "benign" in each round.
n1stpercentile = int(len(df_malicious) * percentile)

# Candidate classifiers, each wrapped in PuLearning together with the
# callable used to obtain class probabilities.
logreg = PuLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
rf = PuLearning(
    RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE),
    "rf",
    lambda model, x: rf_proba(model, x),
)
dt = PuLearning(
    DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE),
    "dt",
    lambda model, x: dt_proba(model, x),
)
knn = PuLearning(
    KNeighborsClassifier(n_neighbors=int(sqrt(len(df)) / 2)),
    "knn",
    lambda model, x: knn_proba(model, x),
)
nb = PuLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
svm = PuLearning(
    SVC(probability=True, random_state=RANDOM_STATE),
    "svm",
    lambda model, x: model.predict_proba(x),
)

# Models evaluated by this cell.
models = [
    logreg,
    # rf,
    # dt,
    # knn,
    # svm,
]
# percentiles = [0.05, 0.10, 0.15, 0.2]

try:
    for i in range(n_loops):
        # Current training set: untouched rows followed by the flipped rows.
        merged_df = pd.concat([main_df, inverted_df], axis=0)
        x_poisoned = merged_df.drop("Class", axis=1)

        # Ground-truth labels reordered to match merged_df row for row.
        # concat() moves flipped rows to the end, so old_main_y in its original
        # order would be compared against labels belonging to different rows.
        true_y = old_main_y.loc[merged_df.index].reset_index(drop=True)

        # Train and evaluate each model
        for model in models:
            print(model.name)
            # Snapshot the poisoned labels before any model call can touch them.
            poisoned = merged_df["Class"].values.copy()
            model.fit(x_poisoned.values, merged_df["Class"].values, FIT_ARG)
            poisonedacc = model.accuracymeasure(xtest, ytest)
            print(poisonedacc)

            # Let the model relabel the training set, then refit on the result.
            y_data = model.label_heal(x_poisoned.values, merged_df["Class"].values)
            model.fit(x_poisoned.values, y_data, FIT_ARG)
            print(model.c)

            model.datarecorder(
                true_y,
                y_data,
                poisoned,
                xtest,
                ytest,
                (i * n1stpercentile),  # labels flipped before this round
                poisonedacc,
            )

        # Stop before the training set loses its last malware rows: sample()
        # would raise, or the next fit would see a single class.
        if n1stpercentile >= len(df_malicious):
            print("Iteration: ", i)
            print("Stopping early: not enough malware rows left to flip.")
            break

        # Poison the next batch: pick still-correct malware rows, flip their
        # label to benign and move them from main_df to inverted_df.
        sampled_malware = df_malicious.sample(
            n=n1stpercentile, random_state=RANDOM_STATE + i
        ).assign(Class=0)
        sampled_indices = sampled_malware.index
        main_df = main_df.drop(sampled_indices)
        df_malicious = df_malicious.drop(sampled_indices)
        inverted_df = pd.concat([inverted_df, sampled_malware], axis=0)
        print("Iteration: ", i)
finally:
    # Persist whatever was recorded, even if the loop aborted part-way.
    for model in models:
        model.df.to_csv(model.name + ".csv", index=False)

logreg
0.9951356886840758
0.8269466902682825
Iteration:  0
logreg
0.9907834101382489
0.8085187314237225
Iteration:  1
logreg
0.9890766342379246
0.803196079139225
Iteration:  2
logreg
0.9893326506229732
0.8077078071723583
Iteration:  3
logreg
0.9901006997781191
0.8109665893564657
Iteration:  4
logreg
0.9914661204983786
0.8025604479594179
Iteration:  5
logreg
0.9890766342379246
0.7935682499588737
Iteration:  6
logreg
0.9889059566478922
0.806841032885276
Iteration:  7
logreg
0.9894179894179894
0.79969971067661
Iteration:  8
logreg
0.9899300221880867
0.8064925842133327
Iteration:  9
logreg
0.9888206178528759
0.7759441689964551
Iteration:  10
logreg
0.986857825567503
0.7965697173027922
Iteration:  11
logreg
0.98805256869773
0.7787247200181978
Iteration:  12
logreg
0.987625874722649
0.8043227891822253
Iteration:  13
logreg
0.985236388462195
0.7818259643579228
Iteration:  14
logreg
0.9819935142515788
0.795857614585951
Iteration:  15
logreg
0.9805427547363031
0.7945271370667508
Iteration:  16


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

warnings.filterwarnings("ignore")


# Label-flipping experiment: train on progressively poisoned labels, let
# PuLearning "heal" them, and record the results for the selected model(s).

# Reproducibility: a fixed seed makes every model cell draw the same test split
# and the same poisoning schedule, so the per-model result CSVs are comparable.
RANDOM_STATE = 42
TEST_FRACTION = 0.2  # share of each class held out as the test set
FIT_ARG = 0.10  # third positional argument of PuLearning.fit (semantics live in pu_learning)

results_df = pd.DataFrame()  # unused in this cell; kept in case other cells read it
main_df = df.copy()
inverted_df = pd.DataFrame()  # malware rows relabelled as benign so far

# Encode the string labels as integers: Benign -> 0, Malware -> 1.
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
if main_df["Class"].isna().any():
    # .map() silently produces NaN for labels missing from the mapping.
    raise ValueError('Unexpected label in "Class": expected only "Benign" or "Malware"')

df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Stratified hold-out: the same fraction of each class goes to the test set.
sampled_benign = df_benign.sample(
    n=int(len(df_benign) * TEST_FRACTION), random_state=RANDOM_STATE
)
sampled_malicious = df_malicious.sample(
    n=int(len(df_malicious) * TEST_FRACTION), random_state=RANDOM_STATE
)
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]

# Training pool: everything not in the test set, re-indexed 0..N-1.
main_df = main_df.drop(testdf.index).reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Clean (un-poisoned) copy of the training labels, used as ground truth.
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]

# Number of malware rows flipped to "benign" in each round.
n1stpercentile = int(len(df_malicious) * percentile)

# Candidate classifiers, each wrapped in PuLearning together with the
# callable used to obtain class probabilities.
logreg = PuLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
rf = PuLearning(
    RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE),
    "rf",
    lambda model, x: rf_proba(model, x),
)
dt = PuLearning(
    DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE),
    "dt",
    lambda model, x: dt_proba(model, x),
)
knn = PuLearning(
    KNeighborsClassifier(n_neighbors=int(sqrt(len(df)) / 2)),
    "knn",
    lambda model, x: knn_proba(model, x),
)
nb = PuLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
svm = PuLearning(
    SVC(probability=True, random_state=RANDOM_STATE),
    "svm",
    lambda model, x: model.predict_proba(x),
)

# Models evaluated by this cell.
models = [
    # logreg,
    rf,
    # dt,
    # knn,
    # svm,
]
# percentiles = [0.05, 0.10, 0.15, 0.2]

try:
    for i in range(n_loops):
        # Current training set: untouched rows followed by the flipped rows.
        merged_df = pd.concat([main_df, inverted_df], axis=0)
        x_poisoned = merged_df.drop("Class", axis=1)

        # Ground-truth labels reordered to match merged_df row for row.
        # concat() moves flipped rows to the end, so old_main_y in its original
        # order would be compared against labels belonging to different rows.
        true_y = old_main_y.loc[merged_df.index].reset_index(drop=True)

        # Train and evaluate each model
        for model in models:
            print(model.name)
            # Snapshot the poisoned labels before any model call can touch them.
            poisoned = merged_df["Class"].values.copy()
            model.fit(x_poisoned.values, merged_df["Class"].values, FIT_ARG)
            poisonedacc = model.accuracymeasure(xtest, ytest)
            print(poisonedacc)

            # Let the model relabel the training set, then refit on the result.
            y_data = model.label_heal(x_poisoned.values, merged_df["Class"].values)
            model.fit(x_poisoned.values, y_data, FIT_ARG)
            print(model.c)

            model.datarecorder(
                true_y,
                y_data,
                poisoned,
                xtest,
                ytest,
                (i * n1stpercentile),  # labels flipped before this round
                poisonedacc,
            )

        # Stop before the training set loses its last malware rows: sample()
        # would raise, or the next fit would see a single class.
        if n1stpercentile >= len(df_malicious):
            print("Iteration: ", i)
            print("Stopping early: not enough malware rows left to flip.")
            break

        # Poison the next batch: pick still-correct malware rows, flip their
        # label to benign and move them from main_df to inverted_df.
        sampled_malware = df_malicious.sample(
            n=n1stpercentile, random_state=RANDOM_STATE + i
        ).assign(Class=0)
        sampled_indices = sampled_malware.index
        main_df = main_df.drop(sampled_indices)
        df_malicious = df_malicious.drop(sampled_indices)
        inverted_df = pd.concat([inverted_df, sampled_malware], axis=0)
        print("Iteration: ", i)
finally:
    # Persist whatever was recorded, even if the loop aborted part-way.
    for model in models:
        model.df.to_csv(model.name + ".csv", index=False)

rf
0.9998293224099676
0.9757270639151137
Iteration:  0
rf
0.9998293224099676
0.9624566667132775
Iteration:  1
rf
0.9998293224099676
0.957092483952835
Iteration:  2
rf
0.9996586448199352
0.9486681343563882
Iteration:  3
rf
0.9994879672299027
0.9344824310545767
Iteration:  4
rf
0.9996586448199352
0.9317828588130105
Iteration:  5
rf
0.9994026284348865
0.9302374513868839
Iteration:  6
rf
0.9993172896398703
0.9183070622928656
Iteration:  7
rf
0.9993172896398703
0.9058790246810179
Iteration:  8
rf
0.9994879672299027
0.9087032842024848
Iteration:  9
rf
0.9994026284348865
0.8984851908602487
Iteration:  10
rf
0.9991466120498379
0.8933008553107392
Iteration:  11
rf
0.9991466120498379
0.8952093348624652
Iteration:  12
rf
0.998805256869773
0.8975002532937011
Iteration:  13
rf
0.9988905956647892
0.899825279275037
Iteration:  14
rf
0.9982078853046595
0.8988985149131561
Iteration:  15
rf
0.9981225465096433
0.8941125580150258
Iteration:  16
rf
0.9980372077146271
0.8913222761460289
Iteration:  17
rf
0.

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

warnings.filterwarnings("ignore")


# Label-flipping experiment: train on progressively poisoned labels, let
# PuLearning "heal" them, and record the results for the selected model(s).

# Reproducibility: a fixed seed makes every model cell draw the same test split
# and the same poisoning schedule, so the per-model result CSVs are comparable.
RANDOM_STATE = 42
TEST_FRACTION = 0.2  # share of each class held out as the test set
FIT_ARG = 0.10  # third positional argument of PuLearning.fit (semantics live in pu_learning)

results_df = pd.DataFrame()  # unused in this cell; kept in case other cells read it
main_df = df.copy()
inverted_df = pd.DataFrame()  # malware rows relabelled as benign so far

# Encode the string labels as integers: Benign -> 0, Malware -> 1.
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
if main_df["Class"].isna().any():
    # .map() silently produces NaN for labels missing from the mapping.
    raise ValueError('Unexpected label in "Class": expected only "Benign" or "Malware"')

df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Stratified hold-out: the same fraction of each class goes to the test set.
sampled_benign = df_benign.sample(
    n=int(len(df_benign) * TEST_FRACTION), random_state=RANDOM_STATE
)
sampled_malicious = df_malicious.sample(
    n=int(len(df_malicious) * TEST_FRACTION), random_state=RANDOM_STATE
)
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]

# Training pool: everything not in the test set, re-indexed 0..N-1.
main_df = main_df.drop(testdf.index).reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Clean (un-poisoned) copy of the training labels, used as ground truth.
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]

# Number of malware rows flipped to "benign" in each round.
n1stpercentile = int(len(df_malicious) * percentile)

# Candidate classifiers, each wrapped in PuLearning together with the
# callable used to obtain class probabilities.
logreg = PuLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
rf = PuLearning(
    RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE),
    "rf",
    lambda model, x: rf_proba(model, x),
)
# This cell uses a shallower tree (max_depth=5) than the other cells.
dt = PuLearning(
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    "dt",
    lambda model, x: dt_proba(model, x),
)
knn = PuLearning(
    KNeighborsClassifier(n_neighbors=int(sqrt(len(df)) / 2)),
    "knn",
    lambda model, x: knn_proba(model, x),
)
nb = PuLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
svm = PuLearning(
    SVC(probability=True, random_state=RANDOM_STATE),
    "svm",
    lambda model, x: model.predict_proba(x),
)

# Models evaluated by this cell.
models = [
    # logreg,
    # rf,
    dt,
    # knn,
    # svm,
]
# percentiles = [0.05, 0.10, 0.15, 0.2]

try:
    for i in range(n_loops):
        # Current training set: untouched rows followed by the flipped rows.
        merged_df = pd.concat([main_df, inverted_df], axis=0)
        x_poisoned = merged_df.drop("Class", axis=1)

        # Ground-truth labels reordered to match merged_df row for row.
        # concat() moves flipped rows to the end, so old_main_y in its original
        # order would be compared against labels belonging to different rows.
        true_y = old_main_y.loc[merged_df.index].reset_index(drop=True)

        # Train and evaluate each model
        for model in models:
            print(model.name)
            # Snapshot the poisoned labels before any model call can touch them.
            poisoned = merged_df["Class"].values.copy()
            model.fit(x_poisoned.values, merged_df["Class"].values, FIT_ARG)
            poisonedacc = model.accuracymeasure(xtest, ytest)
            print(poisonedacc)

            # Let the model relabel the training set, then refit on the result.
            y_data = model.label_heal(x_poisoned.values, merged_df["Class"].values)
            model.fit(x_poisoned.values, y_data, FIT_ARG)
            print(model.c)

            model.datarecorder(
                true_y,
                y_data,
                poisoned,
                xtest,
                ytest,
                (i * n1stpercentile),  # labels flipped before this round
                poisonedacc,
            )

        # Stop before the training set loses its last malware rows: sample()
        # would raise, or the next fit would see a single class.
        if n1stpercentile >= len(df_malicious):
            print("Iteration: ", i)
            print("Stopping early: not enough malware rows left to flip.")
            break

        # Poison the next batch: pick still-correct malware rows, flip their
        # label to benign and move them from main_df to inverted_df.
        sampled_malware = df_malicious.sample(
            n=n1stpercentile, random_state=RANDOM_STATE + i
        ).assign(Class=0)
        sampled_indices = sampled_malware.index
        main_df = main_df.drop(sampled_indices)
        df_malicious = df_malicious.drop(sampled_indices)
        inverted_df = pd.concat([inverted_df, sampled_malware], axis=0)
        print("Iteration: ", i)
finally:
    # Persist whatever was recorded, even if the loop aborted part-way.
    for model in models:
        model.df.to_csv(model.name + ".csv", index=False)

dt
0.9997439836149513
0.9233045738549397
Iteration:  0
dt
0.9991466120498379
0.9194199217612332
Iteration:  1
dt
0.9991466120498379
0.9194199217612332
Iteration:  2
dt
0.9991466120498379
0.924165150747499
Iteration:  3
dt
0.9990612732548216
0.9256601266787745
Iteration:  4
dt
0.9991466120498379
0.923669517966824
Iteration:  5
dt
0.9991466120498379
0.924285914298925
Iteration:  6
dt
0.9991466120498379
0.903441014154977
Iteration:  7
dt
0.9993172896398703
0.9242626000993657
Iteration:  8
dt
0.999231950844854
0.8527524787686065
Iteration:  9
dt
0.9993172896398703
0.8638386003464257
Iteration:  10
dt
0.999231950844854
0.9726276525564576
Iteration:  11
dt
0.9995733060249189
0.9167145205694806
Iteration:  12
dt
0.998378562894692
0.95731547094685
Iteration:  13
dt
0.9988905956647892
0.919354033320499
Iteration:  14
dt
0.998378562894692
0.9578777613843779
Iteration:  15
dt
0.9955623826591569
0.9058375119133787
Iteration:  16
dt
0.995647721454173
0.9048101269549712
Iteration:  17
dt
0.999317289

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

warnings.filterwarnings("ignore")


# Label-flipping experiment: train on progressively poisoned labels, let
# PuLearning "heal" them, and record the results for the selected model(s).

# Reproducibility: a fixed seed makes every model cell draw the same test split
# and the same poisoning schedule, so the per-model result CSVs are comparable.
RANDOM_STATE = 42
TEST_FRACTION = 0.2  # share of each class held out as the test set
FIT_ARG = 0.10  # third positional argument of PuLearning.fit (semantics live in pu_learning)

results_df = pd.DataFrame()  # unused in this cell; kept in case other cells read it
main_df = df.copy()
inverted_df = pd.DataFrame()  # malware rows relabelled as benign so far

# Encode the string labels as integers: Benign -> 0, Malware -> 1.
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
if main_df["Class"].isna().any():
    # .map() silently produces NaN for labels missing from the mapping.
    raise ValueError('Unexpected label in "Class": expected only "Benign" or "Malware"')

df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Stratified hold-out: the same fraction of each class goes to the test set.
sampled_benign = df_benign.sample(
    n=int(len(df_benign) * TEST_FRACTION), random_state=RANDOM_STATE
)
sampled_malicious = df_malicious.sample(
    n=int(len(df_malicious) * TEST_FRACTION), random_state=RANDOM_STATE
)
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]

# Training pool: everything not in the test set, re-indexed 0..N-1.
main_df = main_df.drop(testdf.index).reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Clean (un-poisoned) copy of the training labels, used as ground truth.
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]

# Number of malware rows flipped to "benign" in each round.
n1stpercentile = int(len(df_malicious) * percentile)

# Candidate classifiers, each wrapped in PuLearning together with the
# callable used to obtain class probabilities.
logreg = PuLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
rf = PuLearning(
    RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE),
    "rf",
    lambda model, x: rf_proba(model, x),
)
dt = PuLearning(
    DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE),
    "dt",
    lambda model, x: dt_proba(model, x),
)
knn = PuLearning(
    KNeighborsClassifier(n_neighbors=int(sqrt(len(df)) / 2)),
    "knn",
    lambda model, x: knn_proba(model, x),
)
nb = PuLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
svm = PuLearning(
    SVC(probability=True, random_state=RANDOM_STATE),
    "svm",
    lambda model, x: model.predict_proba(x),
)

# Models evaluated by this cell.
models = [
    # logreg,
    # rf,
    # dt,
    knn,
    # svm,
]
# percentiles = [0.05, 0.10, 0.15, 0.2]

try:
    for i in range(n_loops):
        # Current training set: untouched rows followed by the flipped rows.
        merged_df = pd.concat([main_df, inverted_df], axis=0)
        x_poisoned = merged_df.drop("Class", axis=1)

        # Ground-truth labels reordered to match merged_df row for row.
        # concat() moves flipped rows to the end, so old_main_y in its original
        # order would be compared against labels belonging to different rows.
        true_y = old_main_y.loc[merged_df.index].reset_index(drop=True)

        # Train and evaluate each model
        for model in models:
            print(model.name)
            # Snapshot the poisoned labels before any model call can touch them.
            poisoned = merged_df["Class"].values.copy()
            model.fit(x_poisoned.values, merged_df["Class"].values, FIT_ARG)
            poisonedacc = model.accuracymeasure(xtest, ytest)
            print(poisonedacc)

            # Let the model relabel the training set, then refit on the result.
            y_data = model.label_heal(x_poisoned.values, merged_df["Class"].values)
            model.fit(x_poisoned.values, y_data, FIT_ARG)
            print(model.c)

            model.datarecorder(
                true_y,
                y_data,
                poisoned,
                xtest,
                ytest,
                (i * n1stpercentile),  # labels flipped before this round
                poisonedacc,
            )

        # Stop before the training set loses its last malware rows: sample()
        # would raise, or the next fit would see a single class.
        if n1stpercentile >= len(df_malicious):
            print("Iteration: ", i)
            print("Stopping early: not enough malware rows left to flip.")
            break

        # Poison the next batch: pick still-correct malware rows, flip their
        # label to benign and move them from main_df to inverted_df.
        sampled_malware = df_malicious.sample(
            n=n1stpercentile, random_state=RANDOM_STATE + i
        ).assign(Class=0)
        sampled_indices = sampled_malware.index
        main_df = main_df.drop(sampled_indices)
        df_malicious = df_malicious.drop(sampled_indices)
        inverted_df = pd.concat([inverted_df, sampled_malware], axis=0)
        print("Iteration: ", i)
finally:
    # Persist whatever was recorded, even if the loop aborted part-way.
    for model in models:
        model.df.to_csv(model.name + ".csv", index=False)

knn
0.9913807817033623
0.7699625532380385
Iteration:  0
knn
0.9913807817033623
0.7713379380113643
Iteration:  1
knn
0.9913807817033623
0.7719090592998914
Iteration:  2
knn
0.9912954429083461
0.7719429471599578
Iteration:  3
knn
0.990868748933265
0.7717196996766829
Iteration:  4
knn
0.990868748933265
0.7716994619611934
Iteration:  5
knn
0.9910394265232975
0.7720697852394209
Iteration:  6
knn
0.9911247653183137
0.7726414932843104
Iteration:  7
knn
0.9912101041133299
0.7746866134215744
Iteration:  8
knn
0.9915514592933948
0.7759155921741494
Iteration:  9
knn
0.9914661204983786
0.7769919883409695
Iteration:  10
knn
0.9913807817033623
0.7780178473930236
Iteration:  11
knn
0.9913807817033623
0.7781948369137947
Iteration:  12
knn
0.9912954429083461
0.7792145763677578
Iteration:  13
knn
0.9913807817033623
0.7812249400395674
Iteration:  14
knn
0.9913807817033623
0.7853401785120214
Iteration:  15
knn
0.991636798088411
0.786127133181461
Iteration:  16
knn
0.9912954429083461
0.7877170530208468
Ite

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from math import sqrt

warnings.filterwarnings("ignore")


# Label-flipping experiment: train on progressively poisoned labels, let
# PuLearning "heal" them, and record the results for the selected model(s).

# Reproducibility: a fixed seed makes every model cell draw the same test split
# and the same poisoning schedule, so the per-model result CSVs are comparable.
RANDOM_STATE = 42
TEST_FRACTION = 0.2  # share of each class held out as the test set
FIT_ARG = 0.10  # third positional argument of PuLearning.fit (semantics live in pu_learning)

results_df = pd.DataFrame()  # unused in this cell; kept in case other cells read it
main_df = df.copy()
inverted_df = pd.DataFrame()  # malware rows relabelled as benign so far

# Encode the string labels as integers: Benign -> 0, Malware -> 1.
main_df["Class"] = main_df["Class"].map({"Benign": 0, "Malware": 1})
if main_df["Class"].isna().any():
    # .map() silently produces NaN for labels missing from the mapping.
    raise ValueError('Unexpected label in "Class": expected only "Benign" or "Malware"')

df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Stratified hold-out: the same fraction of each class goes to the test set.
sampled_benign = df_benign.sample(
    n=int(len(df_benign) * TEST_FRACTION), random_state=RANDOM_STATE
)
sampled_malicious = df_malicious.sample(
    n=int(len(df_malicious) * TEST_FRACTION), random_state=RANDOM_STATE
)
testdf = pd.concat([sampled_benign, sampled_malicious], axis=0)
xtest = testdf.drop("Class", axis=1)
ytest = testdf["Class"]

# Training pool: everything not in the test set, re-indexed 0..N-1.
main_df = main_df.drop(testdf.index).reset_index(drop=True)
df_benign = main_df[main_df["Class"] == 0]
df_malicious = main_df[main_df["Class"] == 1]

# Clean (un-poisoned) copy of the training labels, used as ground truth.
old_main_df = main_df.copy()
old_main_y = old_main_df["Class"]

# Number of malware rows flipped to "benign" in each round.
n1stpercentile = int(len(df_malicious) * percentile)

# Candidate classifiers, each wrapped in PuLearning together with the
# callable used to obtain class probabilities.
logreg = PuLearning(
    LogisticRegression(), "logreg", lambda model, x: model.predict_proba(x)
)
rf = PuLearning(
    RandomForestClassifier(max_depth=10, random_state=RANDOM_STATE),
    "rf",
    lambda model, x: rf_proba(model, x),
)
dt = PuLearning(
    DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE),
    "dt",
    lambda model, x: dt_proba(model, x),
)
knn = PuLearning(
    KNeighborsClassifier(n_neighbors=int(sqrt(len(df)) / 2)),
    "knn",
    lambda model, x: knn_proba(model, x),
)
nb = PuLearning(GaussianNB(), "nb", lambda model, x: model.predict_proba(x))
svm = PuLearning(
    SVC(probability=True, random_state=RANDOM_STATE),
    "svm",
    lambda model, x: model.predict_proba(x),
)

# Models evaluated by this cell.
models = [
    # logreg,
    # rf,
    # dt,
    # knn,
    svm,
]
# percentiles = [0.05, 0.10, 0.15, 0.2]

try:
    for i in range(n_loops):
        # Current training set: untouched rows followed by the flipped rows.
        merged_df = pd.concat([main_df, inverted_df], axis=0)
        x_poisoned = merged_df.drop("Class", axis=1)

        # Ground-truth labels reordered to match merged_df row for row.
        # concat() moves flipped rows to the end, so old_main_y in its original
        # order would be compared against labels belonging to different rows.
        true_y = old_main_y.loc[merged_df.index].reset_index(drop=True)

        # Train and evaluate each model
        for model in models:
            print(model.name)
            # Snapshot the poisoned labels before any model call can touch them.
            poisoned = merged_df["Class"].values.copy()
            model.fit(x_poisoned.values, merged_df["Class"].values, FIT_ARG)
            poisonedacc = model.accuracymeasure(xtest, ytest)
            print(poisonedacc)

            # Let the model relabel the training set, then refit on the result.
            # NOTE(review): SVC refuses to fit labels containing a single class
            # ("The number of classes has to be greater than one"). Presumably
            # that is what aborted the recorded run of this cell, e.g. if
            # label_heal collapsed the labels — TODO confirm where it was raised.
            y_data = model.label_heal(x_poisoned.values, merged_df["Class"].values)
            model.fit(x_poisoned.values, y_data, FIT_ARG)
            print(model.c)

            model.datarecorder(
                true_y,
                y_data,
                poisoned,
                xtest,
                ytest,
                (i * n1stpercentile),  # labels flipped before this round
                poisonedacc,
            )

        # Stop before the training set loses its last malware rows: sample()
        # would raise, or the next fit would see a single class.
        if n1stpercentile >= len(df_malicious):
            print("Iteration: ", i)
            print("Stopping early: not enough malware rows left to flip.")
            break

        # Poison the next batch: pick still-correct malware rows, flip their
        # label to benign and move them from main_df to inverted_df.
        sampled_malware = df_malicious.sample(
            n=n1stpercentile, random_state=RANDOM_STATE + i
        ).assign(Class=0)
        sampled_indices = sampled_malware.index
        main_df = main_df.drop(sampled_indices)
        df_malicious = df_malicious.drop(sampled_indices)
        inverted_df = pd.concat([inverted_df, sampled_malware], axis=0)
        print("Iteration: ", i)
finally:
    # Persist whatever was recorded, even if the loop aborted part-way.
    for model in models:
        model.df.to_csv(model.name + ".csv", index=False)

svm
0.9837856289469192
0.7877373696779354
Iteration:  0
svm
0.9834442737668544
0.7955993473031496
Iteration:  1
svm
0.9833589349718382
0.7945215915961404
Iteration:  2
svm
0.9831029185867896
0.7934946772268437
Iteration:  3
svm
0.9827615634067247
0.7942581828328092
Iteration:  4
svm
0.9824202082266599
0.7971949259283938
Iteration:  5
svm
0.982505547021676
0.7988420444608548
Iteration:  6
svm
0.9822495306366275
0.7995390622521976
Iteration:  7
svm
0.9819081754565625
0.7992413854235378
Iteration:  8
svm
0.9816521590715139
0.7994466298402647
Iteration:  9
svm
0.9814814814814815
0.7993165937422211
Iteration:  10
svm
0.9816521590715139
0.8018996301698363
Iteration:  11
svm
0.9816521590715139
0.8025254130905886
Iteration:  12
svm
0.9813108038914491
0.801116612744184
Iteration:  13
svm
0.9810547875064004
0.8028303008337716
Iteration:  14
svm
0.9809694487113841
0.802641605124599
Iteration:  15
svm
0.980884109916368
0.8018421994310665
Iteration:  16
svm
0.980884109916368
0.7959528394820385
Iter

ValueError: The number of classes has to be greater than one; got 1 class