In [2]:
#######################################################################
# This code is for processing the data used in the zero-day attack experiments
# Author: Pal, Anibrata
# Date: 20/12/2023
#######################################################################
import sys

import pandas as pd
import numpy as np


import gc
from time import sleep

import psutil
from fastai.tabular.core import df_shrink

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier


def read_parquet(path):
    """Load the parquet file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_parquet(path)


def create_zero_main(df, random_state=None):
    """Split ``df`` into a zero-day evaluation set and a training pool.

    Args:
        df: DataFrame with a string-like ``Label`` column.
        random_state: Optional integer seed. When given, the benign sampling
            and both shuffles are repeatable. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        tuple: ``(cdf, rdf)``. ``cdf`` holds every row whose label starts with
        ``"Zero"`` plus an equal number of sampled rows whose label starts
        with ``"Ben"``, shuffled. ``rdf`` holds all remaining rows, shuffled.
        Both frames have a fresh ``0..n-1`` index.
    """
    print(df.shape)

    labels = df['Label']
    # Evaluate the "Zero*" mask once; both output frames need it.
    is_zero = labels.str.startswith('Zero')

    # One generator shared by every draw, so a single seed fixes the whole split.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # All rows whose label looks like "Zero..."
    zero_rows = df[is_zero]

    # Same number of "Ben..." rows, drawn at random.
    # NOTE(review): replace=True means a benign row can be drawn more than once,
    # so cdf may contain duplicated benign rows - confirm this is intended.
    ben_rows = df[labels.str.startswith('Ben')].sample(
        len(zero_rows), replace=True, random_state=rng
    )

    # Everything that is neither a "Zero" row nor one of the sampled benign rows
    rdf = df[~is_zero & ~df.index.isin(ben_rows.index)]

    # Shuffle both frames and reset their indexes
    cdf = (
        pd.concat([zero_rows, ben_rows])
        .sample(frac=1, random_state=rng)
        .reset_index(drop=True)
    )
    rdf = rdf.sample(frac=1, random_state=rng).reset_index(drop=True)

    # gc.collect() is synchronous, so no delay is needed before calling it.
    del zero_rows, ben_rows
    gc.collect()

    return cdf, rdf


def balance_dataset(df):
    """Balance benign vs. non-benign rows of ``df`` when they differ noticeably.

    The classes count as balanced when their sizes differ by less than 1% of
    the total row count. Otherwise every non-``'Benign'`` label is mapped to
    ``'Malicious'`` and the data is passed to ``balance_data``.

    The caller's ``df`` is not modified; the relabelled target is built as a
    separate Series.

    Args:
        df: DataFrame with a ``Label`` column; ``'Benign'`` marks benign rows.

    Returns:
        tuple: ``(X_balanced, y_balanced)`` from ``balance_data``, or two empty
        lists when no balancing is needed.
        NOTE(review): returning empty lists (rather than the untouched data) in
        the balanced case is kept for compatibility - confirm callers expect it.
    """
    # Count on the label column only; filtering the whole frame would copy it.
    is_benign = df['Label'] == 'Benign'
    total_rows = df.shape[0]
    benign_rows = int(is_benign.sum())
    non_benign_rows = total_rows - benign_rows
    difference = abs(benign_rows - non_benign_rows)

    if difference < (total_rows * 0.01):
        print("The difference between the number of rows with Label Benign and "
              "number of rows without the same Label is less than 1% of the total "
              "number of rows. balancing not needed")
        return [], []

    print("The difference between the number of rows with Label Benign and "
          "number of rows without the same Label is greater than or equal to 1% "
          "of the total number of rows.")
    print("Balancing the data")
    print(benign_rows, non_benign_rows)

    # Binary target: 'Benign' stays, everything else becomes 'Malicious'.
    y = pd.Series(np.where(is_benign, 'Benign', 'Malicious'),
                  index=df.index, name='Label')
    # Features are every column except the label.
    X = df.drop('Label', axis=1)
    print(y.value_counts())

    print("% of Benign rows: ", benign_rows / total_rows * 100)
    s_values = [0.83, 0.86, 0.9, 0.93, 0.96, 0.99]
    k_values = [1, 2, 3, 4, 5]

    if benign_rows > non_benign_rows:
        print(" Benign rows are more than Malicious rows")
        print("Use RandomUndersampler to reduce the number of benign rows to be "
              "similar to the number of non-benign rows.")
        bal_flag = 0  # benign majority: SMOTE followed by random under-sampling
    else:
        print(" Malicious rows are more than Benign rows")
        bal_flag = 1  # malicious majority cannot be reduced: SMOTE only

    return balance_data(X, y, s_values, k_values, bal_flag)


""" Implement the SMOTE and RandomUnderSampling to balance the dataset """


def balance_data(X, y, s_values, k_values, bal_flag):
    """Grid-search resampling settings, then resample ``X``/``y`` with the best.

    Every ``(s, k)`` pair is scored by the mean recall of a decision tree
    trained behind the resampling pipeline, using 10-fold stratified
    cross-validation repeated 3 times. The best pair is then applied to the
    full data through ``apply_smote_under``.

    Args:
        X: Feature matrix.
        y: Target labels.
        s_values: Candidate ``sampling_strategy`` ratios.
        k_values: Candidate SMOTE ``k_neighbors`` values.
        bal_flag: ``0`` uses SMOTE followed by RandomUnderSampler; any other
            value uses SMOTE only.

    Returns:
        tuple: resampled ``(X, y)``.

    Raises:
        ValueError: if no ``(s, k)`` pair produced a usable score.
    """
    best_score = float('-inf')
    best_s = None
    best_k = None

    # The splitter has a fixed seed, so one instance yields the same folds for
    # every candidate and can be built once outside the loops.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    for s in s_values:
        for k in k_values:
            # A decision tree is the probe model used to judge each setting.
            model = DecisionTreeClassifier()
            over = SMOTE(sampling_strategy=s, k_neighbors=k)
            if bal_flag == 0:
                print("Using both SMOTE and Random Undersampling")
                under = RandomUnderSampler(sampling_strategy=s, random_state=42)
                steps = [('over', over), ('under', under), ('model', model)]
            else:
                print("Using only SMOTE")
                steps = [('over', over), ('model', model)]
            pipeline = Pipeline(steps=steps)

            # NOTE(review): scoring='recall' assumes the scorer's default
            # positive label is present in y; with string labels the scores may
            # come back as NaN - confirm with the label encoding the caller uses.
            scores = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
            score = np.mean(scores)
            # A NaN score never compares greater, so it cannot be selected.
            if score > best_score:
                best_score = score
                best_s = s
                best_k = k
                print('HIGHEST SCORE: > k=%d, s=%.1f, Mean Recall: %.3f' % (k, s, score))
            print('> k=%d, s=%.1f, Mean Recall: %.3f' % (k, s, score))
            print('*******************************************')

    if best_s is None:
        # Fail clearly instead of passing placeholder parameters to SMOTE.
        raise ValueError("No (sampling_strategy, k_neighbors) combination produced "
                         "a valid recall score; cannot balance the dataset.")

    # Apply the winning setting to the complete dataset
    return apply_smote_under(X, y, best_s, best_k, bal_flag)


# Resample (X, y) with SMOTE, optionally followed by random under-sampling
def apply_smote_under(X, y, s, k, bal_flag):
    """Resample ``X``/``y`` with the given ratio ``s`` and neighbour count ``k``.

    When ``bal_flag`` is ``0`` SMOTE is followed by RandomUnderSampler (both
    with ``sampling_strategy=s``); for any other value only SMOTE is applied.
    Returns the resampled ``(X, y)`` pair.
    """
    smote = SMOTE(sampling_strategy=s, k_neighbors=k)
    undersampler = RandomUnderSampler(sampling_strategy=s, random_state=42)

    with_undersampling = bal_flag == 0
    if with_undersampling:
        print("Using both SMOTE and Random Undersampling")
    else:
        print("Using only SMOTE")

    X_res, y_res = smote.fit_resample(X, y)
    if with_undersampling:
        X_res, y_res = undersampler.fit_resample(X_res, y_res)
    return X_res, y_res


def create_zero_trojan_main(df):
    """Placeholder for the combined zero-day + trojan split.

    The intent is an attack frame that covers both zero-day and trojan samples,
    to check that the model generalises to both. For now it only prints the
    shape of ``df`` and returns the very same object twice - no copy and no
    filtering - so both returned names alias the input.
    """
    shape = df.shape
    print(shape)
    return df, df


def balance_dataset_specific(df):
    """Encode labels as 1 / -1 and balance the classes by random over-sampling.

    ``'Benign'`` becomes ``1`` and every other label ``-1``. Labels that are
    already encoded (only ``1`` and ``-1`` present) are used unchanged, so
    calling this twice on the same data does not collapse everything to ``-1``.
    The caller's ``df`` is not modified.

    Args:
        df: DataFrame with a ``Label`` column plus feature columns.

    Returns:
        tuple: ``(X_over, y_over)`` from
        ``RandomOverSampler(sampling_strategy=0.99, random_state=42)``.
    """
    s = 0.99  # sampling_strategy handed to the over-sampler
    over = RandomOverSampler(sampling_strategy=s, random_state=42)

    labels = df['Label']

    # Diagnostics on the incoming label column and the frame's dtypes
    print("Type of Data in Label column: ", labels.dtypes)
    print("Unique values in Label column: ", labels.unique())
    print(df.dtypes)

    # Build the target without writing back into df.
    if labels.isin([1, -1]).all():
        y = labels
    else:
        y = pd.Series(np.where(labels == 'Benign', 1, -1),
                      index=df.index, name='Label')

    print("Type of Data in Label column: ", y.dtypes)
    print(y)

    # Print the total number of records in RDF
    print("Total number of records in RDF: ", len(df))

    # Features are every column except the label, shrunk with df_shrink
    X = df_shrink(df.drop('Label', axis=1))

    print("Total memory usage: ", psutil.virtual_memory().used / (1024 ** 2), "MB")

    print(X)
    print(y)

    print("Test/Before Oversampling")
    print("Total memory usage: ", psutil.virtual_memory().used / (1024 ** 2), "MB")

    # Apply over-sampling
    X_over, y_over = over.fit_resample(X, y)

    # Drop the pre-resampling objects; gc.collect() needs no preceding delay.
    del X, y, labels
    gc.collect()

    return X_over, y_over


def create_data(val, model, path='../data/final_df.parquet', return_scaler=False):
    """Load the dataset and build balanced training data plus the held-out frame.

    Args:
        val: ``"zero"`` selects the zero-day split (``create_zero_main``); any
            other value uses the not-yet-implemented zero-day + trojan path.
        model: Model identifier. For anything other than ``"rf"`` the training
            features are min-max scaled (zero-day path only).
        path: Parquet file to read. Defaults to the previously hard-coded path.
        return_scaler: When true, also return the ``MinMaxScaler`` fitted on
            the training features (``None`` if no scaling happened), so test
            data can be transformed with the same scaling.

    Returns:
        tuple: ``(X, y, cdf)``, or ``(X, y, cdf, scaler)`` when
        ``return_scaler`` is true.
    """
    print("Total memory usage: ", psutil.virtual_memory().used / (1024 ** 2), "MB")

    # Read the parquet file and shrink it with df_shrink
    df = df_shrink(read_parquet(path))

    print("Total memory usage: ", psutil.virtual_memory().used / (1024 ** 2), "MB")

    scaler = None
    if val == "zero":
        cdf, rdf = create_zero_main(df)

        # The full frame is no longer needed; gc.collect() needs no delay.
        del df
        gc.collect()

        print("Total memory usage: ", psutil.virtual_memory().used / (1024 ** 2), "MB")

        print("Training Dataset Shape: ", rdf.shape)
        print("Zero Dataset Shape: ", cdf.shape)
        print("**********************")
        X, y = balance_dataset_specific(rdf)

        if model != "rf":
            # Normalise the features; the fitted scaler is kept for the caller.
            scaler = MinMaxScaler()
            X = scaler.fit_transform(X)

        print("**********************")
    else:
        # TODO: develop this branch so that trojan data is moved from the training
        # data into the test data, to test both zero-day and trojan attacks.
        # For now create_zero_trojan_main returns the full frame for both roles.
        # NOTE(review): no scaling is applied here, unlike the "zero" branch.
        # NOTE(review): cdf is the same object that is passed to
        # balance_dataset_specific, so any in-place relabelling done there is
        # visible in the returned cdf.
        cdf, rdf = create_zero_trojan_main(df)
        X, y = balance_dataset_specific(cdf)

    if return_scaler:
        return X, y, cdf, scaler
    return X, y, cdf


def zeroday_create_data(df, model, scaler=None):
    """Turn a labelled frame into a feature matrix and a 1 / -1 target array.

    ``'Benign'`` becomes ``1`` and every other label ``-1``; labels already
    encoded as ``1`` / ``-1`` are used unchanged. The caller's ``df`` is not
    modified.

    Args:
        df: DataFrame with a ``Label`` column plus feature columns.
        model: Model identifier; features are min-max scaled unless it is
            ``"rf"``.
        scaler: Optional fitted scaler. When given, the features are
            transformed with it, so test data shares the training scaling.
            When ``None`` a new ``MinMaxScaler`` is fitted on ``df``, as before.

    Returns:
        tuple: ``(features, target)``. ``target`` is a NumPy array; ``features``
        is a DataFrame for ``"rf"`` and the scaler's output otherwise.
    """
    print("Total memory usage: ", psutil.virtual_memory().used / (1024 ** 2), "MB")

    labels = df['Label']
    if labels.isin([1, -1]).all():
        # to_numpy() replaces the deprecated Series.ravel().
        target = labels.to_numpy()
    else:
        target = np.where(labels == 'Benign', 1, -1)

    # Features are every column except the label.
    features = df.drop('Label', axis=1)

    # Drop the local references; gc.collect() needs no preceding delay.
    del df, labels
    gc.collect()

    if model != "rf":
        if scaler is None:
            # NOTE(review): fitting on the evaluation data gives it a different
            # scaling from the training data; pass the training scaler instead.
            scaler = MinMaxScaler()
            features = scaler.fit_transform(features)
        else:
            features = scaler.transform(features)

    print(features, target)

    return features, target

# The commented part below is to test the program separately.



In [3]:
# Main program to train the model and test it
# The training and validation would be done by using a 5/10-fold cross validation

# Import statements
# import pandas as pd
# import numpy as np
# import os
import sys
import time
from datetime import *
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay, classification_report
from sklearn import metrics
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
# from fastcore.basics import *
# from fastcore.parallel import *
# from os import cpu_count
import pickle
# import subprocess

# from fastai.tabular.all import df_shrink
# import gc
# from time import sleep

#from zerodaydata import *
#from preprocess import *

# Group 5: Custom imports Qboost Library
sys.path.append('/home/anibrata/Anibrata/PROJECTS/CODE/ZERO_DAY/binary')
from qboost.qboost import *
from dwave.system import DWaveSampler, EmbeddingComposite


def save_model(model):
    """Pickle ``model`` to ``rf_model.pkl`` in the current working directory."""
    model_p = 'rf_model.pkl'
    # The context manager closes (and flushes) the file even if dump() raises.
    with open(model_p, 'wb') as fh:
        pickle.dump(model, fh)


def reportResult(y, predictions, model, train_time, predict_time):
    """Append a classification report and timings to a timestamped text file.

    The file is ``../results/<YYYYmmddHHMMSS>_<model>_report.txt``. It receives
    the training time, the ``classification_report`` for ``y`` against
    ``predictions``, and the prediction time, in that order.
    """
    print("Reporting Result ...")
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    report_text = classification_report(y, predictions)

    target = ''.join(['../results/', str(stamp), '_', str(model), '_report.txt'])
    chunks = [
        'Training time :' + str(train_time) + '\n',
        report_text,
        'Prediction time :' + str(predict_time) + '\n',
    ]
    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(target, 'a') as handle:
        handle.writelines(chunks)
    print("Saved...")


def train_classify_RF(X, y):
    """Cross-validate, fit and evaluate a random forest on the training data.

    A 50-tree gini forest is scored with shuffled 5-fold stratified
    cross-validation (accuracy) and then fitted on all of ``X`` / ``y``. The
    fitted model is pickled through ``save_model`` and evaluated on the same
    training data: confusion matrices are printed, plots are written under
    ``../results/`` and the AUC is printed.

    Args:
        X: Training feature matrix.
        y: Training labels (pandas Series or NumPy array).

    Returns:
        tuple: ``(rf_classifier, train_time)``; ``train_time`` is the duration
        in seconds of the final ``fit`` call only.
    """
    # Random forest with 50 decision trees (as per Sarhan et al. 2023)
    rf_classifier = RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=None)

    # 5-fold stratified cross-validation to estimate accuracy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print("Start cross-validation ...")
    scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='accuracy')
    print("Cross-validation complete ...")
    average_accuracy = np.mean(scores)
    accuracy_std = np.std(scores)

    print('Average Accuracy (Cross-Val): %.3f' % average_accuracy)
    print('Average Standard Deviation (Cross-Val): %.3f' % accuracy_std)

    # Time the fit on the full training set (cross-validation is not included)
    start = timer()
    rf_classifier.fit(X, y)
    train_time = timer() - start
    print("Training Time  (Training Data): ", train_time, " seconds")

    # Time the prediction on the full training set
    start = timer()
    y_train_pred = rf_classifier.predict(X)
    pred_time = timer() - start
    print("Prediction Time (Training Data): ", pred_time, " seconds")

    # Save the model using Pickle
    save_model(rf_classifier)

    # Compute the confusion matrix
    confusion_matrix = metrics.confusion_matrix(y, y_train_pred)
    print(confusion_matrix)

    # Axis labels must follow the sorted label order used for the matrix rows
    # and columns; order-of-appearance labels could swap them. np.unique also
    # works when y is a plain NumPy array.
    class_names = np.unique(np.concatenate([np.asarray(y), np.asarray(y_train_pred)]))

    titles_options = [
        ("Confusion matrix, without normalization", None),
        ("Normalized confusion matrix", "true"),
    ]
    for title, normalize in titles_options:
        # Reuse the predictions already computed instead of predicting again.
        disp = ConfusionMatrixDisplay.from_predictions(
            y,
            y_train_pred,
            display_labels=class_names,
            cmap='Blues',
            normalize=normalize,
        )
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

    # NOTE(review): savefig writes the current figure only, i.e. the last
    # (normalized) matrix created in the loop above.
    plt.savefig('../results/conf_train_matrix.png')

    # ROC curve.
    # NOTE(review): built from hard class predictions, not scores, so the curve
    # has a single operating point - confirm this is intended.
    RocCurveDisplay.from_predictions(y, y_train_pred)
    plt.savefig('../results/roc_train_curve.png')

    # Precision-recall curve
    display = PrecisionRecallDisplay.from_predictions(
        y, y_train_pred, name="RF-gini", plot_chance_level=True
    )
    _ = display.ax_.set_title("2-class Precision-Recall curve")
    plt.savefig('../results/prc_train_curve.png')

    auc_score = roc_auc_score(y, y_train_pred)
    print("Training AUC-Score:", auc_score)

    return rf_classifier, train_time


# Test the model on the test dataset (Zero day dataset)
def test_classify_RF(model, X, y):
    """Evaluate a fitted classifier on held-out data and save diagnostic plots.

    Prints the confusion matrix (raw and normalized), writes confusion-matrix,
    ROC and precision-recall plots under ``../results/`` and prints the AUC.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Test feature matrix.
        y: True test labels (pandas Series or NumPy array).

    Returns:
        tuple: ``(y_test_pred, pred_time)`` with ``pred_time`` in seconds.
    """
    # `import time` is followed in this file by `from datetime import *`, which
    # rebinds the name `time` to the datetime.time class, so `time.time()`
    # fails. Use the timeit timer, as the training code does.
    start_time = timer()
    y_test_pred = model.predict(X)
    pred_time = timer() - start_time

    # Compute the confusion matrix
    confusion_matrix = metrics.confusion_matrix(y, y_test_pred)
    print(confusion_matrix)

    # Sorted labels match the row/column order of the matrix; this also works
    # when y is a NumPy array (which has no .unique()).
    class_names = np.unique(np.concatenate([np.asarray(y), np.asarray(y_test_pred)]))

    # Each title is paired with the normalize option it describes.
    titles_options = [
        ("Confusion matrix, without normalization", None),
        ("Normalized confusion matrix", "true"),
    ]
    for title, normalize in titles_options:
        # Reuse the predictions already computed instead of predicting again.
        disp = ConfusionMatrixDisplay.from_predictions(
            y,
            y_test_pred,
            display_labels=class_names,
            cmap='Blues',
            normalize=normalize,
        )
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

    # NOTE(review): savefig writes the current figure only, i.e. the last
    # matrix created in the loop above.
    plt.savefig('../results/conf_test_matrix_rf.png')

    # ROC curve.
    # NOTE(review): built from hard class predictions rather than scores.
    RocCurveDisplay.from_predictions(y, y_test_pred)
    plt.savefig('../results/roc_test_curve_rf.png')

    # Precision-recall curve
    display = PrecisionRecallDisplay.from_predictions(
        y, y_test_pred, name="RF-gini", plot_chance_level=True
    )
    _ = display.ax_.set_title("2-class Precision-Recall curve")
    plt.savefig('../results/prc_test_curve_rf.png')

    auc_score = roc_auc_score(y, y_test_pred)
    print("Test AUC-Score:", auc_score)

    print("Test Time taken: ", pred_time, "seconds")

    return y_test_pred, pred_time


""" START Insert methods for QBoost Cross Validation and QBoost Test """

""" Training Qboost and prediction with QBoost """
""" This model has returned the best lambda value as  lam = 0.08506944444444445 """
""" Once the lambda value is fixed, the model can be used for test """
""" So, validation is not needed every time. It will be run once and then 
 the best lambda value would be used for the test """


def train_classify_Qboost(X, y, crossval):
    """Train a QBoost classifier, optionally sweeping lambda first.

    Args:
        X: Training feature matrix; ``np.size(X, 1)`` is taken as the number
            of features.
        y: Training labels.
        crossval: ``1`` or ``"1"`` runs ``qboost_lambda_sweep`` over ten lambda
            values to pick lambda. Any other value skips the sweep and uses
            the fixed ``lam = 0.001``.

    Returns:
        tuple: ``(qboost, train_time)``, where ``train_time`` is the time in
        seconds taken to construct ``QBoostClassifier(X, y, lam)``.
    """
    n_features = np.size(X, 1)
    print('Number of features:', n_features)
    print('Number of training samples:', len(X))

    # Compare as strings so both the integer 1 and the string "1" enable the
    # sweep; previously an integer 1 silently skipped it.
    if str(crossval) == '1':
        print('Carrying out cross validation. Crossval: ', crossval)
        # See Boyda et al. (2017), Eq. (17) regarding normalization
        normalized_lambdas = np.linspace(0.0, 1.75, 10)
        lambdas = normalized_lambdas / n_features
        print('Performing cross-validation using {} '
              'values of lambda, this make take several minutes...'.format(len(lambdas)))
        clf_qboost, lam, bfeatures = qboost_lambda_sweep(X, y, lambdas, verbose=True)
        print('Best Classifier: ', clf_qboost)
        print('Best lambda value: ', lam)
        print('Best features: ', bfeatures)
    else:
        # Fixed lambda used when the sweep is skipped
        lam = 0.001

    # Only the lambda from the sweep is reused; a new classifier is built and
    # timed here.
    start = timer()
    qboost = QBoostClassifier(X, y, lam)
    train_time = timer() - start
    print('QBoost Training time in seconds :', train_time)

    return qboost, train_time


def test_classify_Qboost(model, X, y):
    """Evaluate a QBoost model on held-out data and save diagnostic plots.

    Prints the confusion matrix (raw and normalized), writes confusion-matrix,
    ROC and precision-recall plots under ``../results/`` and prints the AUC.

    Args:
        model: Object exposing ``predict_class(X)``.
        X: Test feature matrix.
        y: True test labels (pandas Series or NumPy array).

    Returns:
        tuple: ``(y_pred, pred_time)`` with ``pred_time`` in seconds.
    """
    print('Number of test samples:', len(X))

    # Time the prediction call only; the diagnostic prints come afterwards.
    start = timer()
    y_pred = model.predict_class(X)
    pred_time = timer() - start

    print(type(y_pred))
    print('y_pred: ', y_pred)
    print('QBoost Prediction time in seconds :', pred_time)

    # Compute the confusion matrix
    confusion_matrix = metrics.confusion_matrix(y, y_pred)
    print(confusion_matrix)

    # Sorted labels match the row/column order of the matrix; this also works
    # when y is a NumPy array (which has no .unique()).
    class_names = np.unique(np.concatenate([np.asarray(y), np.asarray(y_pred)]))

    # Each title is paired with the normalize option it describes.
    titles_options = [
        ("Confusion matrix, without normalization", None),
        ("Normalized confusion matrix", "true"),
    ]
    for title, normalize in titles_options:
        # Plot from the predictions already in hand. from_estimator needs a
        # scikit-learn classifier with predict(); this model is only known to
        # offer predict_class().
        disp = ConfusionMatrixDisplay.from_predictions(
            y,
            y_pred,
            display_labels=class_names,
            cmap='Blues',
            normalize=normalize,
        )
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

    # NOTE(review): savefig writes the current figure only, i.e. the last
    # matrix created in the loop above.
    plt.savefig('../results/conf_test_matrix_QB.png')

    # ROC curve.
    # NOTE(review): built from hard class predictions rather than scores.
    RocCurveDisplay.from_predictions(y, y_pred)
    plt.savefig('../results/roc_test_curve_QB.png')

    # Precision-recall curve
    display = PrecisionRecallDisplay.from_predictions(
        y, y_pred, name="QBoost", plot_chance_level=True
    )
    _ = display.ax_.set_title("2-class Precision-Recall curve")
    plt.savefig('../results/prc_test_curve_QB.png')

    auc_score = roc_auc_score(y, y_pred)
    print("Test AUC-Score:", auc_score)

    print("Test Time taken: ", pred_time, "seconds")

    return y_pred, pred_time

In [4]:
pwd

'/home/anibrata/Anibrata/PROJECTS/CODE/ZERO_DAY/notebooks'

In [5]:
# Dataset location, relative to the notebooks/ working directory shown by `pwd` above
path = '../data/final_df.parquet'

In [6]:
# Load the full dataset and pass it through fastai's df_shrink
# (presumably to downcast column dtypes and reduce memory - confirm)
df = df_shrink(read_parquet(path))

In [17]:
df

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F9495,F9496,F9497,F9498,F9499,F9500,F9501,F9502,F9503,Label
0,127,0,0,50,1,0,9,36,7,1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
1,67,0,0,11,13,1,0,12,1,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
2,23,0,0,12,5,2,1,30,5,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
3,5,0,0,9,1,4,1,27,18,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
4,23,0,0,12,5,2,1,30,5,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357781,122,1,0,18,1,0,1,4,2,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
357782,0,0,0,8,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
357783,1,0,0,184,1,2,0,17,7,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
357784,1,0,0,148,0,0,0,6,3,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1


In [16]:
# Encode labels as 1 (Benign) / -1 (everything else).
# Guarded so that re-running the cell on already-encoded labels is a no-op;
# unguarded, a second run maps every row (including benign ones) to -1.
if not df['Label'].isin([1, -1]).all():
    df['Label'] = df['Label'].apply(lambda x: 1 if x == 'Benign' else -1)

In [None]:
# Correlation of every column with the encoded label, sorted ascending.
# The column is named 'Label' (capital L); 'label' raises KeyError.
# corrwith correlates each column with the label directly, avoiding the full
# column-by-column matrix that df.corr() would build for this very wide frame.
correlations = df.corrwith(df['Label']).sort_values(ascending=True)

In [18]:
# Release the feature matrix and label series.
# NOTE(review): the execution counts show this cell (In [18]) ran after the
# cells below that define X and y; in top-to-bottom order it raises NameError,
# and it is why the later `y` display (In [19]) failed. Move or remove it.
del X, y

In [8]:
# Feature matrix: every column except the label, passed through df_shrink
X = df_shrink(df.drop('Label', axis=1))

In [9]:
# Target: the label column of df
y = df['Label']

In [19]:
y

NameError: name 'y' is not defined

In [12]:
# Encode the target as 1 (Benign) / -1 (everything else).
# Guarded so that applying it to an already-encoded y does not turn every
# value into -1 (the all -1 output recorded below is consistent with that).
if not y.isin([1, -1]).all():
    y = y.apply(lambda x: 1 if x == 'Benign' else -1)

In [15]:
y

0        -1
1        -1
2        -1
3        -1
4        -1
         ..
357781   -1
357782   -1
357783   -1
357784   -1
357785   -1
Name: Label, Length: 357786, dtype: int64

In [14]:
# Split into the zero-day evaluation frame (cdf) and the remaining training pool (rdf).
# NOTE(review): create_zero_main matches labels by string prefix, so df must
# still carry its original string labels when this cell runs.
cdf, rdf = create_zero_main(df)

(357786, 9418)


In [16]:
# Release the full frame now that cdf and rdf exist.
# NOTE(review): the recorded output is a NameError, i.e. df did not exist in
# the session when this cell last ran - check the execution order.
del df
gc.collect()

NameError: name 'df' is not defined

In [17]:
cdf

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F9495,F9496,F9497,F9498,F9499,F9500,F9501,F9502,F9503,Label
0,448,0,0,63,27,6,0,4,1,1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
1,472,0,0,101,3,5,2,7,5,2,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
2,376,0,0,38,30,0,1,11,1,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
3,237,52,0,22,4,5,1,5,2,1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
4,1031,0,0,126,15,6,24,15,6,10,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26673,982,0,0,63,19,6,7,10,6,2,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
26674,227,0,0,6,3,0,0,0,3,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
26675,16,0,0,2,3,0,1,10,5,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Zeroday
26676,722,0,0,68,32,2,3,21,2,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Zeroday


In [26]:
rdf

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F9495,F9496,F9497,F9498,F9499,F9500,F9501,F9502,F9503,Label
0,84,0,0,14,22,11,2,28,12,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
1,125,0,0,47,2,2,1,7,1,1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
2,84,0,0,15,22,13,3,31,13,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
3,304,0,0,14,5,7,2,18,6,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
4,291,11,0,81,2,2,1,2,1,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331655,84,0,0,15,22,13,3,31,13,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
331656,3,0,0,1,2,0,1,2,2,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
331657,14,0,0,3,3,5,0,14,7,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1
331658,23,0,0,12,5,2,1,30,5,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,-1


In [22]:
# Inspect the dtype of rdf's label column (int64 in the recorded output, so it is already encoded)
print("Type of Data in Label column: ", rdf['Label'].dtypes)

Type of Data in Label column:  int64


In [24]:
# Inspect the distinct label values in rdf (both -1 and 1 in the recorded output)
print("Unique values in Label column: ", rdf['Label'].unique())

Unique values in Label column:  [-1  1]


In [25]:
# Encode labels as 1 (Benign) / -1 (everything else).
# rdf's labels are already -1 / 1 here (see the output above); re-applying the
# string mapping unguarded would set every row to -1, so skip it in that case.
if not rdf['Label'].isin([1, -1]).all():
    rdf['Label'] = rdf['Label'].apply(lambda x: 1 if x == 'Benign' else -1)

In [None]:
# Correlation of every column with the encoded label, sorted ascending.
# The column is named 'Label' (capital L); 'label' raises KeyError.
# corrwith correlates each column with the label directly, avoiding the full
# column-by-column matrix that rdf.corr() would build for this very wide frame.
correlations = rdf.corrwith(rdf['Label']).sort_values(ascending=True)