In [1]:
import os
import json
import pandas as pd
import numpy as np
import statistics
from helpers import *
import re
import pickle

In [2]:
json_file = r"D:\ClassWork\anti_virus\Vigil-Anti\EXE_Dataset\ember2018\train_features_1.jsonl"

with open(json_file, 'r') as f:
    json_ds_list = list(f)

DataSet = []
for i,ds in enumerate(json_ds_list):
    #if( i > 12000):
    #    break
    DataSet.append(json.loads(ds))

# to free some of the precious memory
del json_ds_list

print (DataSet[0])

simple_ds = DataSet[5]

{'sha256': '2ef9a92ee6c955364564b0df75ee3753473014b2ba162b9df90afe6df9dbb256', 'md5': '7e39aeea7bc21d16b8652516a150b282', 'appeared': '2018-01', 'label': 1, 'avclass': 'sivis', 'histogram': [60782, 5895, 2020, 1487, 2075, 1367, 1145, 856, 2037, 725, 2027, 716, 1418, 903, 672, 1014, 1605, 652, 702, 691, 1048, 927, 641, 599, 795, 636, 598, 598, 677, 629, 597, 571, 8564, 738, 921, 600, 1253, 835, 645, 565, 1015, 919, 958, 868, 917, 784, 1435, 1307, 1470, 1081, 903, 1380, 913, 914, 872, 823, 1013, 1048, 1001, 1289, 1063, 1261, 792, 771, 1852, 3074, 928, 1346, 1238, 1786, 1036, 857, 1028, 1149, 902, 749, 1003, 1101, 1014, 883, 2012, 1152, 1374, 1468, 1242, 1374, 1312, 1447, 975, 848, 716, 1067, 940, 1566, 1298, 1468, 897, 3196, 1406, 2574, 2206, 5376, 2771, 1455, 2052, 2923, 1401, 908, 2522, 1562, 3768, 3473, 2336, 813, 3879, 2968, 5270, 2441, 1323, 1398, 1176, 1245, 843, 944, 984, 1172, 878, 851, 1168, 1116, 1029, 2612, 900, 1471, 827, 767, 953, 1479, 908, 4228, 772, 1342, 753, 719, 828, 7

### Exploring the unique section names

In [3]:
"""
all_sectionNames = set()
for ds_obj in DataSet:
    for dic_elm in ds_obj['section']['sections']:
        all_sectionNames.add(dic_elm['name'])

with open('sectionNames.txt', 'w') as f:
    f.write('\n'.join(all_sectionNames))

correct_sec_names = []
for n in all_sectionNames:
    if(n and n[0] == "."):
        correct_sec_names.append(n)

with open('sectionNames_correct.txt', 'w') as f:
    f.write('\n'.join(correct_sec_names))
"""

'\nall_sectionNames = set()\nfor ds_obj in DataSet:\n    for dic_elm in ds_obj[\'section\'][\'sections\']:\n        all_sectionNames.add(dic_elm[\'name\'])\n\nwith open(\'sectionNames.txt\', \'w\') as f:\n    f.write(\'\n\'.join(all_sectionNames))\n\ncorrect_sec_names = []\nfor n in all_sectionNames:\n    if(n and n[0] == "."):\n        correct_sec_names.append(n)\n\nwith open(\'sectionNames_correct.txt\', \'w\') as f:\n    f.write(\'\n\'.join(correct_sec_names))\n'

#### spoiler: there are lots of malicious section names
#### so I just extracted the most common and correct section names and then wrote them into "common_section_names.txt"
#### any other section names will be considered "UNKNOWN"

In [4]:
# Saving the most common section names

with open(os.path.join(os.getcwd(), 'assets', 'common_section_names.txt'), 'r') as f:
    Common_section_names = f.readlines()

Common_section_names = [re.sub(r'\n', '', i) for i in Common_section_names]

# Explore all the possible imports

In [5]:
"""
from tqdm import tqdm
all_imports = set()
for obj in tqdm(DataSet):
    import_DLL_dict = obj['imports']
    DLL_list = list(import_DLL_dict.keys())
    for elm in DLL_list:
        if(elm.endswith('.dll')):
            all_imports.add(elm)
    #all_imports = set(all_imports)

with open('all_imports_cleansed.txt', 'w') as f:
    f.write('\n'.join(all_imports))
"""

"\nfrom tqdm import tqdm\nall_imports = set()\nfor obj in tqdm(DataSet):\n    import_DLL_dict = obj['imports']\n    DLL_list = list(import_DLL_dict.keys())\n    for elm in DLL_list:\n        if(elm.endswith('.dll')):\n            all_imports.add(elm)\n    #all_imports = set(all_imports)\n\nwith open('all_imports_cleansed.txt', 'w') as f:\n    f.write('\n'.join(all_imports))\n"


### Same problem with DLL imports, there are numerous different DLLs
### and I cannot really filter all of them, so I will just grab the most common DLLs that are associated with most malwares
### and another feature which will be the number of imported DLLs

# Let's just cleanse the data

In [6]:
new_Dataset = []

for simple_ds in tqdm(DataSet, desc='cleansing the dataset'):
    try:
        # add reduced features of byteentropy distribution
        simple_ds.update(Interpret_Histogram(simple_ds['byteentropy'], 'byteentropy'))

        # add reduced features of byte histogram distribution
        simple_ds.update(Interpret_Histogram(simple_ds['histogram'], 'bytehistogram'))

        # reduce strings field
        simple_ds = extract_subfields_from_fields(simple_ds, 'strings', normalize_names=True, delete_field=True)

        # flatten the strings printables distribution field
        simple_ds = flatten_strings_printable_distribution(simple_ds, delete_field=True)

        # reduce general field
        simple_ds = extract_subfields_from_fields(simple_ds, 'general', normalize_names=True, delete_field=True)

        # reduce header field
        simple_ds = extract_subfields_from_fields(simple_ds, 'header', normalize_names=True, delete_field=True)
        simple_ds = extract_subfields_from_fields(simple_ds, 'header_optional', normalize_names=False, delete_field=True)
        simple_ds = extract_subfields_from_fields(simple_ds, 'header_coff', normalize_names=False, delete_field=True)


        # handle data directories field
        simple_ds = handle_data_directories_field(simple_ds)


        # handle sections fields
        simple_ds = handle_section_names(simple_ds, Common_section_names, delete_field=True)

        # handle imports fields
        simple_ds = handle_DLL_imports(simple_ds, delete_field=False)

        # Remove the useless columns for now (they are not entirely useless but they will make the training process very complex for me :(( )
        useless_columns = ['sha256'
            ,'md5'
            ,'appeared'
            ,'avclass'
            ,'histogram'
            ,'byteentropy'
            ,'imports'
            ,'exports'
            ,'dll_characteristics'
            ,'characteristics']

        for useless_col in useless_columns:
            del simple_ds[useless_col]
        
        new_Dataset.append(simple_ds)
    except:
        continue


# Finally, free the original dataset from our precious memory
del DataSet

#print(simple_ds)

with open('lol.json', 'w') as f:
     json.dump(new_Dataset[5], f, indent=6)


cleansing the dataset:   1%|          | 77/12001 [00:00<00:16, 740.36it/s]

cleansing the dataset:  62%|██████▏   | 7474/12001 [00:07<00:03, 1278.77it/s]

Error inside handle_DLL_imports()


cleansing the dataset: 100%|██████████| 12001/12001 [00:11<00:00, 1033.39it/s]


# Let's prepare our Pandas DataFrame

In [7]:
# df= pd.DataFrame()
# i = 0
# for dic in new_Dataset:
#     df = pd.concat([df, pd.DataFrame([0]*len(df.columns))], axis=0)
#     for k in dic.keys():
#         if k in df.columns:
#             try:
#                 df.loc[i, k] = dic[k]
#             except:
#                 print(k)
#                 print(df)
#         else:
#             dummy_list = pd.DataFrame([0]*len(df) if len(df) > 0 else [0])
#             df.insert(0, k, dummy_list)
#             #print(df.columns)
#             df.loc[i, k] = dic[k]
    
#     #print(df.head())
#     i+=1


# df.fillna(0)
# print(df)

# df.to_csv('lol.csv')

In [8]:
#df = pd.DataFrame().from_dict(DataSet_Dict)

#print(df.head())



df = pd.DataFrame()

for dictionary_obj in tqdm(new_Dataset, desc="constructing a pandas dataframe..."):
    df_row = pd.DataFrame().from_dict(dictionary_obj, orient='index').transpose()
    df = pd.concat([df, df_row], axis=0, join='outer',ignore_index=False)

df.fillna(0, inplace=True)
df.to_csv('Dataset.csv')
df.describe()

constructing a pandas dataframe...: 100%|██████████| 12001/12001 [09:28<00:00, 21.11it/s]


Unnamed: 0,.code_size,.code_entropy,.code_vsize,.code_props_len,.text_size,.text_entropy,.text_vsize,.text_props_len,.rdata_size,.rdata_entropy,...,Dbgcore.dll_num_funcs,ncrypt.dll_num_funcs,.UPX_size,.UPX_entropy,.UPX_vsize,.UPX_props_len,.upx_size,.upx_entropy,.upx_vsize,.upx_props_len
count,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,...,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0,12001.0
mean,258.213482,0.062253,254.584618,0.035414,373157.3,4.920905,391254.7,2.649029,72844.97,2.851109,...,0.012832,0.000917,21.502208,0.000658,21.502208,0.000333,1.407883,0.00038,1.706524,0.000417
std,3889.317044,0.584017,3868.117829,0.328881,1491375.0,2.762939,1537916.0,2.155105,599443.5,2.690136,...,1.041154,0.068914,2355.54703,0.072054,2355.54703,0.036513,154.232246,0.041637,186.948177,0.045642
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,2560.0,4.594207,2084.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,69632.0,6.338444,72320.0,3.0,512.0,2.299104,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,262144.0,6.640903,275024.0,3.0,28672.0,5.262078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,168448.0,7.469426,168242.0,6.0,62112260.0,7.999931,62112080.0,17.0,44892670.0,7.999782,...,101.0,7.0,258048.0,7.89343,258048.0,4.0,16896.0,4.561308,20480.0,5.0


In [9]:


with open(os.path.join(os.getcwd(), 'assets', 'suspicious_imports.txt'), 'r') as f:
    sus_imports = f.readlines()
sus_imports = [re.sub(r'\n', '', i) for i in sus_imports]

boolean_columns = sus_imports + []
categorical_columns = ["subsystem", "magic", "machine"]


for col in df.columns:
    if col in boolean_columns:
        df[col] = df[col].astype(bool)
        df[col].fillna(False)
        continue

    if col in categorical_columns:
        df[col].replace(0, 'UNKNOWN', inplace=True)
        continue
    df[col].fillna(0)
    df[col] = df[col].astype(np.int64)
    df[col].fillna(0)

for col in df.columns:
    print(f"{col}:        {df[col].dtype}")


df.to_csv('Dataset_big.csv', index=False)

# Save our feature list
feature_columns = list(df.columns)
feature_columns.pop(feature_columns.index('label'))
print(f"total number of Features: {len(feature_columns)}")
with open(os.path.join(os.getcwd(), 'assets', 'features.pkl'), 'wb') as f:
    pickle.dump(feature_columns, f)



label:        int64
zero_bytes_byteentropy:        int64
full_bytes_byteentropy:        int64
mean_of_bytes_byteentropy:        int64
standard_dev_byteentropy:        int64
total_bytes_byteentropy:        int64
mean_of_first_tertile_byteentropy:        int64
mean_of_second_tertile_byteentropy:        int64
mean_of_third_tertile_byteentropy:        int64
zero_bytes_bytehistogram:        int64
full_bytes_bytehistogram:        int64
mean_of_bytes_bytehistogram:        int64
standard_dev_bytehistogram:        int64
total_bytes_bytehistogram:        int64
mean_of_first_tertile_bytehistogram:        int64
mean_of_second_tertile_bytehistogram:        int64
mean_of_third_tertile_bytehistogram:        int64
strings_numstrings:        int64
strings_avlength:        int64
strings_printables:        int64
strings_entropy:        int64
strings_paths:        int64
strings_urls:        int64
strings_registry:        int64
strings_MZ:        int64
strings_printabledist_0:        int64
strings_printabl

# Remove the -1 tuples!

In [10]:
df = df[df['label'] != -1]

# Encoding Categorical Columns

In [11]:
from sklearn.preprocessing import LabelEncoder

df_train = df.copy()
array_of_Label_Encoders = []
for col in categorical_columns:
    new_LE = LabelEncoder().fit(df_train[col])
    df_train[col] = new_LE.transform(df_train[col])
    array_of_Label_Encoders.append(new_LE)

with open(os.path.join(os.getcwd(), 'models', 'enc.pkl'), 'wb') as f:
    pickle.dump(array_of_Label_Encoders, f)

# Let's make our classifier

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


df_train_1 = df_train.copy()

feature_columns = list(df_train_1.columns)
feature_columns.pop(feature_columns.index("label"))
if(re.findall('Unnamed', feature_columns[0], re.IGNORECASE)):
    feature_columns.pop(0)


x_train, x_test, y_train, y_test = train_test_split(df_train_1[feature_columns], df_train_1['label'], test_size=0.3, shuffle=True)

svm_model = SVC(kernel='poly', degree= 3, verbose=True).fit(x_train, y_train)

y_pred = svm_model.predict(x_test)
print(classification_report(y_test, y_pred))


with open(os.path.join(os.getcwd(), 'models', 'svm.pkl'), 'wb') as f:
    pickle.dump(svm_model, f)

[LibSVM]              precision    recall  f1-score   support

           0       0.88      0.01      0.01      1286
           1       0.51      1.00      0.68      1334

    accuracy                           0.51      2620
   macro avg       0.69      0.50      0.34      2620
weighted avg       0.69      0.51      0.35      2620



In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

df_train_2 = df.copy()

for col in categorical_columns:
    df_train_2[col] = LabelEncoder().fit_transform(df_train_2[col])

feature_columns = list(df_train_2.columns)
feature_columns.pop(feature_columns.index("label"))

if(re.findall('Unnamed', feature_columns[0], re.IGNORECASE)):
    feature_columns.pop(0)


x_train, x_test, y_train, y_test = train_test_split(df_train_2[feature_columns], df_train_2['label'], test_size=0.3, shuffle=True)

rf_model = RandomForestClassifier().fit(x_train, y_train)

y_pred = rf_model.predict(x_test)
print(classification_report(y_test, y_pred))


with open(os.path.join(os.getcwd(), 'models', 'rf.pkl'), 'wb') as f:
    pickle.dump(rf_model, f)


              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1253
           1       0.94      0.91      0.93      1367

    accuracy                           0.92      2620
   macro avg       0.92      0.92      0.92      2620
weighted avg       0.92      0.92      0.92      2620



# Pytorch's Neural Network

In [24]:
import torch
import torch.nn as nn


class MyNet(nn.Module):
    def __init__(self, num_features = 120):
        super(MyNet, self).__init__()
        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, 512)
        self.batch_norm2 = nn.BatchNorm1d(512)
        self.dense2 = nn.Linear(512, 128)
        self.batch_norm3 = nn.BatchNorm1d(128)
        self.dense3 = nn.Linear(128, 128)
        self.dense4 = nn.Linear(128, 8)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.batch_norm1(x.float())
        x = torch.tanh(self.dense1(x))
        x = self.batch_norm2(x.float())
        x = torch.tanh(self.dense2(x))
        x = self.batch_norm3(x.float())
        x = torch.tanh(self.dense3(x))
        x = torch.tanh(self.dense4(x))
        x = self.softmax(x)
        return x

# Create an instance of the network
net = MyNet()

# Print the network architecture
print(net)

MyNet(
  (batch_norm1): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense1): Linear(in_features=120, out_features=512, bias=True)
  (batch_norm2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense2): Linear(in_features=512, out_features=128, bias=True)
  (batch_norm3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense3): Linear(in_features=128, out_features=128, bias=True)
  (dense4): Linear(in_features=128, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)


In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# Define a custom dataset class
class MyDataset(Dataset):
    def __init__(self, dataframe_train, dataframe_labels):
        self.data = dataframe_train
        self.labels = dataframe_labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        x = self.data[index, :]
        y = self.labels[index]
        return x, y

# Load the data from a pandas DataFrame
#df = pd.read_csv('Dataset_1.csv') 


df_train_3 = df.copy()

for i, col in enumerate(categorical_columns):
    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col]).astype(int)

feature_columns = list(df_train_3.columns)
feature_columns.pop(feature_columns.index("label"))

if(re.findall('Unnamed', feature_columns[0], re.IGNORECASE)):
    feature_columns.pop(0)
    
num_features = len(feature_columns)


x_train, x_test, y_train, y_test = train_test_split(df_train_3.drop(columns=['label']), df_train_3['label'])


# Split the data into features and labels
X = torch.from_numpy(np.asarray(x_train, dtype=np.int64))
#X = torch.tensor(torch.from_numpy(np.asarray(x_train, dtype=np.int64)), dtype=torch.int64)
#y = torch.tensor(torch.from_numpy(np.asarray(y_train, dtype=bool)), dtype=torch.int64)
y = torch.tensor(torch.from_numpy(np.asarray(y_train, dtype=bool)), dtype= torch.int64)

# Create instances of the dataset and data loader
dataset = MyDataset(X, y)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

# Create an instance of the network
net = MyNet(num_features)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0005)

# Training loop
for epoch in range(130):  # Replace 10 with the desired number of epochs
    running_loss = 0.0
    running_corrects = 0  # Counter for correct predictions
    
    for inputs, labels in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = net(inputs)
        
        # Compute the loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Accumulate the loss
        running_loss += loss.item()
        
        # Calculate the predictions and accuracy
        _, preds = torch.max(outputs, 1)  # Get the predicted labels
        corrects = torch.sum(preds == labels)  # Count the number of correct predictions
        running_corrects += corrects.item()
    
    # Calculate the running accuracy
    accuracy = running_corrects / len(dataset)
    
    # Print the average loss and accuracy for the epoch
    print(f'Epoch {epoch+1}: Loss = {running_loss/len(dataloader):.4f}, Accuracy = {accuracy:.4f}')

# Save the trained model
torch.save(net.state_dict(), 'trained_model.pt')

  y = torch.tensor(torch.from_numpy(np.asarray(y_train, dtype=bool)), dtype= torch.int64)


Epoch 1: Loss = 1.9294, Accuracy = 0.7154
Epoch 2: Loss = 1.8375, Accuracy = 0.7522
Epoch 3: Loss = 1.8035, Accuracy = 0.7969
Epoch 4: Loss = 1.7894, Accuracy = 0.8029
Epoch 5: Loss = 1.7814, Accuracy = 0.8163
Epoch 6: Loss = 1.7752, Accuracy = 0.8262
Epoch 7: Loss = 1.7764, Accuracy = 0.8291
Epoch 8: Loss = 1.7736, Accuracy = 0.8290
Epoch 9: Loss = 1.7719, Accuracy = 0.8355
Epoch 10: Loss = 1.7710, Accuracy = 0.8389
Epoch 11: Loss = 1.7683, Accuracy = 0.8415
Epoch 12: Loss = 1.7689, Accuracy = 0.8415
Epoch 13: Loss = 1.7684, Accuracy = 0.8439
Epoch 14: Loss = 1.7671, Accuracy = 0.8449
Epoch 15: Loss = 1.7653, Accuracy = 0.8542
Epoch 16: Loss = 1.7635, Accuracy = 0.8554
Epoch 17: Loss = 1.7648, Accuracy = 0.8520
Epoch 18: Loss = 1.7604, Accuracy = 0.8632
Epoch 19: Loss = 1.7608, Accuracy = 0.8595
Epoch 20: Loss = 1.7618, Accuracy = 0.8562
Epoch 21: Loss = 1.7576, Accuracy = 0.8694
Epoch 22: Loss = 1.7558, Accuracy = 0.8699
Epoch 23: Loss = 1.7578, Accuracy = 0.8688
Epoch 24: Loss = 1.7

In [29]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Define the testing function
def test_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    device = next(model.parameters()).device  # Get the device of the model
    
    all_labels = []
    all_preds = []
    
    with torch.no_grad():
        for inputs, labels in dataloader:
            
            # Forward pass
            outputs = model(inputs)
            # Get the predicted labels
            #preds = torch.round(torch.sigmoid(outputs))
            _, preds = torch.max(outputs, 1)  # Get the predicted labels
            
            # Collect the labels and predictions
            all_labels += list(labels.numpy().reshape((-1,1)))
            all_preds += list(preds.numpy().reshape((-1,1)))
    
    return np.asarray(all_labels), np.asarray(all_preds)

# Create an instance of the network
net = MyNet(num_features)

# Load the trained weights
net.load_state_dict(torch.load('trained_model.pt'))

# Set the model to evaluation mode
net.eval()

# Create the testing dataset and data loader
X = torch.from_numpy(np.asarray(x_test, dtype=np.int64))
y = torch.from_numpy(np.asarray(y_test, dtype=np.int64))
test_dataset = MyDataset(X, y)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Test the model
true_labels, predicted_labels = test_model(net, test_dataloader)

# Import the necessary libraries for classification report
from sklearn.metrics import classification_report

# Print the classification report
target_names = ['class_0', 'class_1']  # Replace with appropriate class names
print(classification_report(true_labels, predicted_labels, labels=np.unique(true_labels)))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1023
           1       0.92      0.93      0.92      1160

    accuracy                           0.92      2183
   macro avg       0.92      0.92      0.92      2183
weighted avg       0.92      0.92      0.92      2183



# End-to-End Testing

In [None]:
from subprocess import run
Good_file = r"D:\win32diskimager-1.0.0-install.exe"
Bad_file = r"D:\hackSF\filtered_dataset\Win32_EXE\186"
end_script = r"D:\ClassWork\anti_virus\Vigil-Anti\Source\EXEs\run.py"
model_path= r"D:\ClassWork\anti_virus\Vigil-Anti\Source\EXEs\models\rf.pkl"

result1 = run(['python', end_script, Good_file, model_path], capture_output=True)
resutl2 = run(['python', end_script, Bad_file, model_path], capture_output=True)
print(result1.stdout)
print(resutl2.stdout)



# Neural Networks suck

In [None]:
"""
from sklearn.neural_network import MLPClassifier

MLP_classifier = MLPClassifier(hidden_layer_sizes=[120, 120, 30], solver='sgd', alpha=1, random_state=1)

df_train_3 = df.copy()

for col in categorical_columns:
    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col])

feature_columns = list(df_train_3.columns)
feature_columns.pop(feature_columns.index("label"))

x_train, x_test, y_train, y_test = train_test_split(df_train_3[feature_columns], df_train_3['label'], test_size=0.3, shuffle=True)

for i in range(10):
    MLP_classifier.fit(x_train, y_train)

y_pred = MLP_classifier.predict(x_test)
print(classification_report(y_test, y_pred, zero_division=1))
"""

'\nfrom sklearn.neural_network import MLPClassifier\n\nMLP_classifier = MLPClassifier(hidden_layer_sizes=[120, 120, 30], solver=\'sgd\', alpha=1, random_state=1)\n\ndf_train_3 = df.copy()\n\nfor col in categorical_columns:\n    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col])\n\nfeature_columns = list(df_train_3.columns)\nfeature_columns.pop(feature_columns.index("label"))\n\nx_train, x_test, y_train, y_test = train_test_split(df_train_3[feature_columns], df_train_3[\'label\'], test_size=0.3, shuffle=True)\n\nfor i in range(10):\n    MLP_classifier.fit(x_train, y_train)\n\ny_pred = MLP_classifier.predict(x_test)\nprint(classification_report(y_test, y_pred, zero_division=1))\n'