In [1]:
import os
import json
import pandas as pd
import numpy as np
import statistics
from helpers import *
import re
import pickle

In [2]:
json_file = r"D:\ClassWork\anti_virus\Vigil-Anti\EXE_Dataset\ember2018\train_features_1.jsonl"

with open(json_file, 'r') as f:
    json_ds_list = list(f)

DataSet = []
for i,ds in enumerate(json_ds_list):
    if( i > 15000):
        break
    DataSet.append(json.loads(ds))

# to free some of the precious memory
del json_ds_list

print (DataSet[0])

simple_ds = DataSet[5]

{'sha256': '2ef9a92ee6c955364564b0df75ee3753473014b2ba162b9df90afe6df9dbb256', 'md5': '7e39aeea7bc21d16b8652516a150b282', 'appeared': '2018-01', 'label': 1, 'avclass': 'sivis', 'histogram': [60782, 5895, 2020, 1487, 2075, 1367, 1145, 856, 2037, 725, 2027, 716, 1418, 903, 672, 1014, 1605, 652, 702, 691, 1048, 927, 641, 599, 795, 636, 598, 598, 677, 629, 597, 571, 8564, 738, 921, 600, 1253, 835, 645, 565, 1015, 919, 958, 868, 917, 784, 1435, 1307, 1470, 1081, 903, 1380, 913, 914, 872, 823, 1013, 1048, 1001, 1289, 1063, 1261, 792, 771, 1852, 3074, 928, 1346, 1238, 1786, 1036, 857, 1028, 1149, 902, 749, 1003, 1101, 1014, 883, 2012, 1152, 1374, 1468, 1242, 1374, 1312, 1447, 975, 848, 716, 1067, 940, 1566, 1298, 1468, 897, 3196, 1406, 2574, 2206, 5376, 2771, 1455, 2052, 2923, 1401, 908, 2522, 1562, 3768, 3473, 2336, 813, 3879, 2968, 5270, 2441, 1323, 1398, 1176, 1245, 843, 944, 984, 1172, 878, 851, 1168, 1116, 1029, 2612, 900, 1471, 827, 767, 953, 1479, 908, 4228, 772, 1342, 753, 719, 828, 7

### Exploring the unique section names

In [3]:
"""
all_sectionNames = set()
for ds_obj in DataSet:
    for dic_elm in ds_obj['section']['sections']:
        all_sectionNames.add(dic_elm['name'])

with open('sectionNames.txt', 'w') as f:
    f.write('\n'.join(all_sectionNames))

correct_sec_names = []
for n in all_sectionNames:
    if(n and n[0] == "."):
        correct_sec_names.append(n)

with open('sectionNames_correct.txt', 'w') as f:
    f.write('\n'.join(correct_sec_names))
"""

'\nall_sectionNames = set()\nfor ds_obj in DataSet:\n    for dic_elm in ds_obj[\'section\'][\'sections\']:\n        all_sectionNames.add(dic_elm[\'name\'])\n\nwith open(\'sectionNames.txt\', \'w\') as f:\n    f.write(\'\n\'.join(all_sectionNames))\n\ncorrect_sec_names = []\nfor n in all_sectionNames:\n    if(n and n[0] == "."):\n        correct_sec_names.append(n)\n\nwith open(\'sectionNames_correct.txt\', \'w\') as f:\n    f.write(\'\n\'.join(correct_sec_names))\n'

#### spoiler: there are lots of malicious section names
#### so I just extracted the most common and correct section names and then wrote them into "common_section_names.txt"
#### any other section names will be considered "UNKNOWN"

In [4]:
# Saving the most common section names

with open(os.path.join(os.getcwd(), 'assets', 'common_section_names.txt'), 'r') as f:
    Common_section_names = f.readlines()

Common_section_names = [re.sub(r'\n', '', i) for i in Common_section_names]

# Explore all the possible imports

In [5]:
"""
from tqdm import tqdm
all_imports = set()
for obj in tqdm(DataSet):
    import_DLL_dict = obj['imports']
    DLL_list = list(import_DLL_dict.keys())
    for elm in DLL_list:
        if(elm.endswith('.dll')):
            all_imports.add(elm)
    #all_imports = set(all_imports)

with open('all_imports_cleansed.txt', 'w') as f:
    f.write('\n'.join(all_imports))
"""

"\nfrom tqdm import tqdm\nall_imports = set()\nfor obj in tqdm(DataSet):\n    import_DLL_dict = obj['imports']\n    DLL_list = list(import_DLL_dict.keys())\n    for elm in DLL_list:\n        if(elm.endswith('.dll')):\n            all_imports.add(elm)\n    #all_imports = set(all_imports)\n\nwith open('all_imports_cleansed.txt', 'w') as f:\n    f.write('\n'.join(all_imports))\n"


### Same problem with DLL imports, there are numerous different DLLs
### and I cannot really filter all of them, so I will just grab the most common DLLs that are associated with most malwares
### and another feature which will be the number of imported DLLs

# Let's just cleanse the data

In [6]:
new_Dataset = []

for simple_ds in tqdm(DataSet, desc='cleansing the dataset'):
    try:
        # add reduced features of byteentropy distribution
        simple_ds.update(Interpret_Histogram(simple_ds['byteentropy'], 'byteentropy'))

        # add reduced features of byte histogram distribution
        simple_ds.update(Interpret_Histogram(simple_ds['histogram'], 'bytehistogram'))

        # reduce strings field
        simple_ds = extract_subfields_from_fields(simple_ds, 'strings', normalize_names=True, delete_field=True)

        # flatten the strings printables distribution field
        simple_ds = flatten_strings_printable_distribution(simple_ds, delete_field=True)

        # reduce general field
        simple_ds = extract_subfields_from_fields(simple_ds, 'general', normalize_names=True, delete_field=True)

        # reduce header field
        simple_ds = extract_subfields_from_fields(simple_ds, 'header', normalize_names=True, delete_field=True)
        simple_ds = extract_subfields_from_fields(simple_ds, 'header_optional', normalize_names=False, delete_field=True)
        simple_ds = extract_subfields_from_fields(simple_ds, 'header_coff', normalize_names=False, delete_field=True)


        # handle data directories field
        simple_ds = handle_data_directories_field(simple_ds)


        # handle sections fields
        simple_ds = handle_section_names(simple_ds, Common_section_names, delete_field=True)

        # handle imports fields
        simple_ds = handle_DLL_imports(simple_ds, delete_field=False)

        # Remove the useless columns for now (they are not entirely useless but they will make the training process very complex for me :(( )
        useless_columns = ['sha256'
            ,'md5'
            ,'appeared'
            ,'avclass'
            ,'histogram'
            ,'byteentropy'
            ,'imports'
            ,'exports'
            ,'dll_characteristics'
            ,'characteristics']

        for useless_col in useless_columns:
            del simple_ds[useless_col]
        
        new_Dataset.append(simple_ds)
    except:
        continue


# Finally, free the original dataset from our precious memory
del DataSet


cleansing the dataset:   1%|          | 112/15001 [00:00<00:13, 1108.94it/s]

cleansing the dataset:  50%|████▉     | 7448/15001 [00:05<00:06, 1123.50it/s]

Error inside handle_DLL_imports()


cleansing the dataset:  91%|█████████▏| 13690/15001 [00:10<00:01, 1196.13it/s]

Error inside handle_DLL_imports()


cleansing the dataset: 100%|██████████| 15001/15001 [00:12<00:00, 1247.59it/s]


# Let's prepare our Pandas DataFrame

In [7]:
# df= pd.DataFrame()
# i = 0
# for dic in new_Dataset:
#     df = pd.concat([df, pd.DataFrame([0]*len(df.columns))], axis=0)
#     for k in dic.keys():
#         if k in df.columns:
#             try:
#                 df.loc[i, k] = dic[k]
#             except:
#                 print(k)
#                 print(df)
#         else:
#             dummy_list = pd.DataFrame([0]*len(df) if len(df) > 0 else [0])
#             df.insert(0, k, dummy_list)
#             #print(df.columns)
#             df.loc[i, k] = dic[k]
    
#     #print(df.head())
#     i+=1


# df.fillna(0)
# print(df)

# df.to_csv('lol.csv')

In [9]:
# Just make a very big dictionary by joining keys together
# then just casting this very big dictionary to a very big dataframe :)

all_keys = set().union(*new_Dataset)

merged_dict = {}

for d in tqdm(new_Dataset, desc="contructing a very big Dictionary"):
    for key in all_keys:
        if key in d.keys():
            if key in merged_dict:
                merged_dict[key].append(d[key])
            else:
                merged_dict[key] = [d[key]]
        else:
            if key not in merged_dict:
                merged_dict[key] = []

df = pd.DataFrame().from_dict(merged_dict, orient='index').transpose()


df.fillna(0, inplace=True)
df.to_csv('Dataset.csv')
df.describe()

contructing a very big Dictionary:   0%|          | 0/15001 [00:00<?, ?it/s]

contructing a very big Dictionary: 100%|██████████| 15001/15001 [00:01<00:00, 11253.30it/s]


Unnamed: 0,.idata_size,strings_printabledist_25,.aspack_vsize,sizeof_headers,strings_printabledist_9,strings_printabledist_82,strings_printabledist_44,strings_printabledist_95,general_imports,strings_registry,...,strings_printabledist_51,.idat_vsize,CLR_RUNTIME_HEADER_virtual_address,.xdata_size,strings_printabledist_14,.textbss_vsize,strings_avlength,.xdata_props_len,mean_of_first_tertile_byteentropy,strings_numstrings
count,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,...,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0,15001.0
mean,1373.918205,2108.124,126.421439,1856.527432,390.415706,2481.147,803.10186,357.667889,114.269249,0.302646,...,1737.138257,73.888407,99652.69,485.454836,958.848077,749.2201,36.876985,0.108259,3121.380347,8029.616
std,6502.073311,21982.31,6694.649364,3649.037775,1632.070865,13381.22,3351.127516,1610.162114,201.690491,6.564754,...,5231.488967,1874.95817,11446560.0,8252.815492,7626.328708,37474.66,378.607564,0.995061,13167.978979,28554.83
min,0.0,0.0,0.0,512.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,34.0,0.0,1024.0,14.0,246.0,57.0,9.0,8.0,0.0,...,112.0,0.0,0.0,0.0,65.0,0.0,7.697155,0.0,20.576471,645.0
50%,0.0,197.0,0.0,1024.0,72.0,743.0,241.0,66.0,67.0,0.0,...,368.0,0.0,0.0,0.0,205.0,0.0,12.005911,0.0,162.576471,2659.0
75%,0.0,666.0,0.0,4096.0,226.0,2184.0,680.0,210.0,163.0,0.0,...,1143.0,0.0,0.0,0.0,802.0,0.0,17.862069,0.0,1145.882353,6624.0
max,433664.0,1179314.0,774144.0,207360.0,56245.0,1050041.0,149959.0,61444.0,9024.0,710.0,...,234986.0,51200.0,1401809000.0,502784.0,672431.0,3489435.0,35348.421466,14.0,559351.576471,1158922.0


In [11]:


with open(os.path.join(os.getcwd(), 'assets', 'suspicious_imports.txt'), 'r') as f:
    sus_imports = f.readlines()
sus_imports = [re.sub(r'\n', '', i) for i in sus_imports]

boolean_columns = sus_imports + []
categorical_columns = ["subsystem", "magic", "machine"]


for col in df.columns:
    if col in boolean_columns:
        df[col] = df[col].astype(bool)
        df[col].fillna(False)
        continue

    if col in categorical_columns:
        df[col].replace(0, 'UNKNOWN', inplace=True)
        continue
    df[col].fillna(0)
    df[col] = df[col].astype(np.int64)
    df[col].fillna(0)

for col in df.columns:
    print(f"{col}:        {df[col].dtype}")


df.to_csv('Dataset_big.csv', index=False)

# Save our feature list
feature_columns = list(df.columns)
feature_columns.pop(feature_columns.index('label'))
print(f"total number of Features: {len(feature_columns)}")
with open(os.path.join(os.getcwd(), 'assets', 'features.pkl'), 'wb') as f:
    pickle.dump(feature_columns, f)



.idata_size:        int64
strings_printabledist_25:        int64
.aspack_vsize:        int64
sizeof_headers:        int64
strings_printabledist_9:        int64
strings_printabledist_82:        int64
strings_printabledist_44:        int64
strings_printabledist_95:        int64
general_imports:        int64
strings_registry:        int64
.code_props_len:        int64
.code_entropy:        int64
strings_printabledist_89:        int64
minor_subsystem_version:        int64
strings_printabledist_10:        int64
strings_printabledist_88:        int64
strings_printabledist_32:        int64
general_symbols:        int64
.data_vsize:        int64
.pdata_props_len:        int64
strings_printabledist_17:        int64
mean_of_second_tertile_bytehistogram:        int64
RESOURCE_TABLE_size:        int64
strings_printabledist_56:        int64
strings_printabledist_15:        int64
strings_printabledist_42:        int64
.aspack_props_len:        int64
strings_printabledist_3:        int64
strings_prin

# Remove the -1 tuples!

In [12]:
df = df[df['label'] != -1]

# Encoding Categorical Columns

In [13]:
from sklearn.preprocessing import LabelEncoder

df_train = df.copy()
array_of_Label_Encoders = []
for col in categorical_columns:
    new_LE = LabelEncoder().fit(df_train[col])
    df_train[col] = new_LE.transform(df_train[col])
    array_of_Label_Encoders.append(new_LE)

with open(os.path.join(os.getcwd(), 'models', 'enc.pkl'), 'wb') as f:
    pickle.dump(array_of_Label_Encoders, f)

# Let's make our classifier

In [14]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


df_train_1 = df_train.copy()

feature_columns = list(df_train_1.columns)
feature_columns.pop(feature_columns.index("label"))
if(re.findall('Unnamed', feature_columns[0], re.IGNORECASE)):
    feature_columns.pop(0)


x_train, x_test, y_train, y_test = train_test_split(df_train_1[feature_columns], df_train_1['label'], test_size=0.3, shuffle=True)

svm_model = SVC(kernel='poly', degree= 3, verbose=True).fit(x_train, y_train)

y_pred = svm_model.predict(x_test)
print(classification_report(y_test, y_pred))


with open(os.path.join(os.getcwd(), 'models', 'svm.pkl'), 'wb') as f:
    pickle.dump(svm_model, f)

[LibSVM]              precision    recall  f1-score   support

           0       0.77      0.01      0.01      1526
           1       0.53      1.00      0.70      1747

    accuracy                           0.54      3273
   macro avg       0.65      0.50      0.35      3273
weighted avg       0.64      0.54      0.38      3273



In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

df_train_2 = df.copy()

for col in categorical_columns:
    df_train_2[col] = LabelEncoder().fit_transform(df_train_2[col])

feature_columns = list(df_train_2.columns)
feature_columns.pop(feature_columns.index("label"))

if(re.findall('Unnamed', feature_columns[0], re.IGNORECASE)):
    feature_columns.pop(0)


x_train, x_test, y_train, y_test = train_test_split(df_train_2[feature_columns], df_train_2['label'], test_size=0.3, shuffle=True)

rf_model = RandomForestClassifier().fit(x_train, y_train)

y_pred = rf_model.predict(x_test)
print(classification_report(y_test, y_pred))


with open(os.path.join(os.getcwd(), 'models', 'rf.pkl'), 'wb') as f:
    pickle.dump(rf_model, f)


              precision    recall  f1-score   support

           0       0.90      0.95      0.93      1553
           1       0.95      0.91      0.93      1720

    accuracy                           0.93      3273
   macro avg       0.93      0.93      0.93      3273
weighted avg       0.93      0.93      0.93      3273



# Pytorch's Neural Network

In [16]:
import torch
import torch.nn as nn


class MyNet(nn.Module):
    def __init__(self, num_features = 120):
        super(MyNet, self).__init__()
        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, 512)
        self.batch_norm2 = nn.BatchNorm1d(512)
        self.dense2 = nn.Linear(512, 128)
        self.batch_norm3 = nn.BatchNorm1d(128)
        self.dense4 = nn.Linear(128, 128)
        self.batch_norm4 = nn.BatchNorm1d(128)
        self.dense5 = nn.Linear(128, 8)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.batch_norm1(x.float())
        x = torch.tanh(self.dense1(x))
        x = self.batch_norm2(x.float())
        x = torch.tanh(self.dense2(x))
        x = self.batch_norm3(x.float())
        #x = torch.tanh(self.dense3(x))
        x = torch.tanh(self.dense4(x))
        x = self.batch_norm4(x.float())
        x = torch.tanh(self.dense5(x))
        x = self.softmax(x)
        return x

# Create an instance of the network
net = MyNet()

# Print the network architecture
print(net)

MyNet(
  (batch_norm1): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense1): Linear(in_features=120, out_features=512, bias=True)
  (batch_norm2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense2): Linear(in_features=512, out_features=128, bias=True)
  (batch_norm3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense4): Linear(in_features=128, out_features=128, bias=True)
  (batch_norm4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense5): Linear(in_features=128, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# Define a custom dataset class
class MyDataset(Dataset):
    def __init__(self, dataframe_train, dataframe_labels):
        self.data = dataframe_train
        self.labels = dataframe_labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        x = self.data[index, :]
        y = self.labels[index]
        return x, y

# Load the data from a pandas DataFrame
#df = pd.read_csv('Dataset_1.csv') 


df_train_3 = df.copy()

for i, col in enumerate(categorical_columns):
    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col]).astype(int)

feature_columns = list(df_train_3.columns)
feature_columns.pop(feature_columns.index("label"))

if(re.findall('Unnamed', feature_columns[0], re.IGNORECASE)):
    feature_columns.pop(0)
    
num_features = len(feature_columns)


x_train, x_test, y_train, y_test = train_test_split(df_train_3.drop(columns=['label']), df_train_3['label'], test_size=0.3, shuffle=True)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, shuffle=True)


# Split the data into features and labels
X = torch.from_numpy(np.asarray(x_train, dtype=np.int64))
#X = torch.tensor(torch.from_numpy(np.asarray(x_train, dtype=np.int64)), dtype=torch.int64)
#y = torch.tensor(torch.from_numpy(np.asarray(y_train, dtype=bool)), dtype=torch.int64)
y = torch.tensor(torch.from_numpy(np.asarray(y_train, dtype=bool)), dtype= torch.int64)


# Split the data into features and labels
X_val = torch.from_numpy(np.asarray(x_val, dtype=np.int64))
y_val = torch.tensor(torch.from_numpy(np.asarray(y_val, dtype=bool)), dtype=torch.int64)


# Create instances of the dataset and data loader
dataset = MyDataset(X, y)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)


val_dataset = MyDataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=256, shuffle=False)


# Create an instance of the network
net = MyNet(num_features)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0.0001)

# Training loop
for epoch in range(130):  # Replace 10 with the desired number of epochs
    running_loss = 0.0
    running_corrects = 0  # Counter for correct predictions
    
    for inputs, labels in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = net(inputs)
        
        # Compute the loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Accumulate the loss
        running_loss += loss.item()
        
        # Calculate the predictions and accuracy
        _, preds = torch.max(outputs, 1)  # Get the predicted labels
        corrects = torch.sum(preds == labels)  # Count the number of correct predictions
        running_corrects += corrects.item()
    
    # Calculate the running accuracy on the training set
    train_accuracy = running_corrects / len(dataset)
     
    # Calculate the validation accuracy
    val_corrects = 0
    val_total = 0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            outputs = net(inputs)
            _, preds = torch.max(outputs, 1)
            val_corrects += torch.sum(preds == labels).item()
            val_total += labels.size(0)
    val_accuracy = val_corrects / val_total
    
    # Print the average loss and accuracy for the epoch
    print(f'Epoch {epoch+1}: Loss = {running_loss/len(dataloader):.4f}, Train Accuracy = {train_accuracy:.4f}, Val Accuracy = {val_accuracy:.4f}')

# Save the trained model
torch.save(net.state_dict(), 'trained_model.pt')

  y = torch.tensor(torch.from_numpy(np.asarray(y_train, dtype=bool)), dtype= torch.int64)
  y_val = torch.tensor(torch.from_numpy(np.asarray(y_val, dtype=bool)), dtype=torch.int64)


Epoch 1: Loss = 1.9762, Train Accuracy = 0.5331, Val Accuracy = 0.5969
Epoch 2: Loss = 1.8950, Train Accuracy = 0.6431, Val Accuracy = 0.6545
Epoch 3: Loss = 1.8472, Train Accuracy = 0.7103, Val Accuracy = 0.7238
Epoch 4: Loss = 1.8177, Train Accuracy = 0.7595, Val Accuracy = 0.7762
Epoch 5: Loss = 1.8041, Train Accuracy = 0.7742, Val Accuracy = 0.7565
Epoch 6: Loss = 1.7977, Train Accuracy = 0.7778, Val Accuracy = 0.7932
Epoch 7: Loss = 1.7928, Train Accuracy = 0.7908, Val Accuracy = 0.7997
Epoch 8: Loss = 1.7945, Train Accuracy = 0.7825, Val Accuracy = 0.7919
Epoch 9: Loss = 1.7881, Train Accuracy = 0.8024, Val Accuracy = 0.8010
Epoch 10: Loss = 1.7865, Train Accuracy = 0.8010, Val Accuracy = 0.7971
Epoch 11: Loss = 1.7847, Train Accuracy = 0.8050, Val Accuracy = 0.8050
Epoch 12: Loss = 1.7835, Train Accuracy = 0.8082, Val Accuracy = 0.7958
Epoch 13: Loss = 1.7840, Train Accuracy = 0.8116, Val Accuracy = 0.7919
Epoch 14: Loss = 1.7843, Train Accuracy = 0.8063, Val Accuracy = 0.7919
E

In [18]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Define the testing function
def test_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    device = next(model.parameters()).device  # Get the device of the model
    
    all_labels = []
    all_preds = []
    
    with torch.no_grad():
        for inputs, labels in dataloader:
            
            # Forward pass
            outputs = model(inputs)
            # Get the predicted labels
            #preds = torch.round(torch.sigmoid(outputs))
            _, preds = torch.max(outputs, 1)  # Get the predicted labels
            
            # Collect the labels and predictions
            all_labels += list(labels.numpy().reshape((-1,1)))
            all_preds += list(preds.numpy().reshape((-1,1)))
    
    return np.asarray(all_labels), np.asarray(all_preds)

# Create an instance of the network
net = MyNet(num_features)

# Load the trained weights
net.load_state_dict(torch.load('trained_model.pt'))

# Set the model to evaluation mode
net.eval()

# Create the testing dataset and data loader
X = torch.from_numpy(np.asarray(x_test, dtype=np.int64))
y = torch.from_numpy(np.asarray(y_test, dtype=np.int64))
test_dataset = MyDataset(X, y)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Test the model
true_labels, predicted_labels = test_model(net, test_dataloader)

# Import the necessary libraries for classification report
from sklearn.metrics import classification_report

# Print the classification report
target_names = ['class_0', 'class_1']  # Replace with appropriate class names
print(classification_report(true_labels, predicted_labels, labels=np.unique(true_labels)))

              precision    recall  f1-score   support

           0       0.88      0.81      0.84      1553
           1       0.84      0.90      0.87      1720

    accuracy                           0.86      3273
   macro avg       0.86      0.85      0.86      3273
weighted avg       0.86      0.86      0.86      3273



# End-to-End Testing

In [19]:
from subprocess import run
Good_file = r"D:\win32diskimager-1.0.0-install.exe"
Bad_file = r"D:\hackSF\filtered_dataset\Win32_EXE\186"
end_script = r"D:\ClassWork\anti_virus\Vigil-Anti\Source\EXEs\run.py"
model_path= r"D:\ClassWork\anti_virus\Vigil-Anti\Source\EXEs\models\rf.pkl"

result1 = run(['python', end_script, Good_file, model_path], capture_output=True)
resutl2 = run(['python', end_script, Bad_file, model_path], capture_output=True)
print(result1.stdout)
print(resutl2.stdout)



# Neural Networks suck

In [20]:
"""
from sklearn.neural_network import MLPClassifier

MLP_classifier = MLPClassifier(hidden_layer_sizes=[120, 120, 30], solver='sgd', alpha=1, random_state=1)

df_train_3 = df.copy()

for col in categorical_columns:
    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col])

feature_columns = list(df_train_3.columns)
feature_columns.pop(feature_columns.index("label"))

x_train, x_test, y_train, y_test = train_test_split(df_train_3[feature_columns], df_train_3['label'], test_size=0.3, shuffle=True)

for i in range(10):
    MLP_classifier.fit(x_train, y_train)

y_pred = MLP_classifier.predict(x_test)
print(classification_report(y_test, y_pred, zero_division=1))
"""

'\nfrom sklearn.neural_network import MLPClassifier\n\nMLP_classifier = MLPClassifier(hidden_layer_sizes=[120, 120, 30], solver=\'sgd\', alpha=1, random_state=1)\n\ndf_train_3 = df.copy()\n\nfor col in categorical_columns:\n    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col])\n\nfeature_columns = list(df_train_3.columns)\nfeature_columns.pop(feature_columns.index("label"))\n\nx_train, x_test, y_train, y_test = train_test_split(df_train_3[feature_columns], df_train_3[\'label\'], test_size=0.3, shuffle=True)\n\nfor i in range(10):\n    MLP_classifier.fit(x_train, y_train)\n\ny_pred = MLP_classifier.predict(x_test)\nprint(classification_report(y_test, y_pred, zero_division=1))\n'