In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)"

In [2]:
def reduce_attack_class(classCol):
    """Map raw attack labels onto the reduced set of attack classes.

    Parameters
    ----------
    classCol : pandas.Series
        Raw attack labels. Every value must be one of the keys of the
        lookup table below (matching is exact and case-sensitive).

    Returns
    -------
    pandas.Series
        The reduced class name for each input row, with the same index and
        name as ``classCol``.

    Raises
    ------
    KeyError
        If ``classCol`` contains a value (including a missing value) that
        has no entry in the lookup table. The message lists every offending
        label.
    """
    mapping = {
            'Benign': 'Benign',
            'DoS': 'DoS',
            'DDoS': 'DDoS',
            'scanning': 'Phishing',
            'Reconnaissance':'Phishing',
            'xss': 'XSS',
            'password': 'Password',
            'injection': 'SQL Injection',
            'Bot': 'Botnet',
            'Brute Force': 'Brute Force',
            'Infilteration': 'Phishing',
            'Exploits': 'Zero Day Exploit',
            'Fuzzers': 'Phishing',
            'Backdoor': 'Malware',
            'Generic': 'Malware',
            'mitm': 'MITM',
            'ransomware': 'Malware',
            'Analysis': 'Phishing',
            'Theft': 'Phishing',
            'Shellcode': 'Zero Day Exploit',
            'Worms': 'Malware',
           }
    # Series.map with a dict is a vectorized lookup; labels without an entry
    # come back as NaN instead of raising, so they are checked explicitly.
    reduced = classCol.map(mapping)
    unmapped = reduced.isna()
    if unmapped.any():
        unknown = classCol[unmapped].unique().tolist()
        raise KeyError(f"No attack class mapping for label(s): {unknown}")
    return reduced

In [3]:
def dataset_class_distribution(df,chunk_size=1000000):
    """Count how often each reduced attack class occurs across all chunks.

    Parameters
    ----------
    df : iterable of pandas.DataFrame, or a single pandas.DataFrame
        Chunks to tally, e.g. the reader returned by
        ``pd.read_csv(..., chunksize=...)``. Each chunk must have an
        'Attack' column holding raw labels understood by
        ``reduce_attack_class``. An iterator is consumed by this call.
        The chunks themselves are not modified.
    chunk_size : int, optional
        Not used by the function; the chunk size is whatever ``df`` was
        created with. Kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per reduced class with the columns 'Unique Value' (class
        name), 'Count' (integer number of rows) and 'Percentage' (share of
        all rows seen, 0-100).
    """
    # Iterating a DataFrame yields column labels, not chunks, so wrap a lone
    # frame to treat it as a single chunk.
    if isinstance(df, pd.DataFrame):
        df = [df]

    attack_counts = pd.Series(dtype=int)
    total_rows = 0

    for chunk in df:
        # Reduce into a temporary Series rather than a new column so the
        # caller's chunk is left untouched.
        chunk_attack_counts = reduce_attack_class(chunk['Attack']).value_counts()
        # add() aligns on class name; fill_value=0 covers classes that are
        # missing from one side, at the cost of turning the totals into floats.
        attack_counts = attack_counts.add(chunk_attack_counts, fill_value=0)
        total_rows += len(chunk)

    # Every label in the aligned index has a value on at least one side, so
    # there are no NaNs left and the float totals convert back to integers.
    attack_counts = attack_counts.astype('int64')

    attack_percentages = (attack_counts / total_rows) * 100

    results_df = pd.DataFrame({
        'Unique Value': attack_counts.index,
        'Count': attack_counts.values,
        'Percentage': attack_percentages.values
    })

    return results_df

In [4]:
# Source CSV to analyse, and how many rows read_csv should hand back per chunk.
file_path = '/kaggle/input/nf-uq-nids-v2/NF-UQ-NIDS-v2.csv'
chunksize = 1_000_000

In [5]:
# With chunksize set, read_csv returns an iterator of DataFrames (at most
# `chunksize` rows each) instead of loading the whole file; it can be
# iterated only once.
df=pd.read_csv(file_path, chunksize=chunksize)

In [6]:
# Tally the reduced attack classes over every chunk of the reader and print
# the resulting table (class, count, percentage). This consumes `df`.
results_df = dataset_class_distribution(df)
print(results_df)

        Unique Value       Count  Percentage
0             Benign  25165295.0   33.117470
1             Botnet    143097.0    0.188315
2        Brute Force    123982.0    0.163160
3               DDoS  21748351.0   28.620779
4                DoS  17875585.0   23.524228
5               MITM      7723.0    0.010163
6            Malware     39127.0    0.051491
7           Password   1153323.0    1.517770
8           Phishing   6558598.0    8.631100
9      SQL Injection    684897.0    0.901323
10               XSS   2455020.0    3.230801
11  Zero Day Exploit     32978.0    0.043399


In [7]:
# IPython shell escape: recursively delete everything the glob matches under
# /kaggle/working (dotfiles are not matched by `*`). Presumably clears output
# left over from earlier runs -- destructive, run deliberately.
!rm -rf /kaggle/working/*

In [8]:
# Parse only the header line of the CSV (nrows=0 loads no data rows).
df_cols = pd.read_csv(file_path, nrows=0)

# Extra column(s) to place after the file's own columns.
items_to_add = ['Attack_Class']

# Full header: the columns found in the file followed by the extra ones.
column_names = list(df_cols.columns) + items_to_add

print(column_names)

['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL', 'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES', 'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS', 'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS', 'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT', 'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES', 'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES', 'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT', 'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE', 'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Label', 'Attack', 'Dataset', 'Attack_Class']


In [9]:
# Create an empty (zero-row) DataFrame whose columns are `column_names`,
# i.e. the source file's columns plus 'Attack_Class'.
empty_df = pd.DataFrame(columns=column_names)
# Displays just the header row, since the frame has no data.
empty_df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack,Dataset,Attack_Class


In [10]:
# Write the zero-row frame to both output files so each one starts out as a
# header line only (to_csv's default write mode replaces any existing file).
for csv_name in ('train.csv', 'test.csv'):
    empty_df.to_csv(csv_name, index=False)

In [11]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Stream the source CSV chunk by chunk and append a stratified train/test
# split of every chunk to train.csv / test.csv.
#
# Rows are appended WITHOUT a header, so both files must already contain the
# header line (source columns followed by 'Attack_Class'). Re-running this
# cell without recreating those files appends the same rows a second time.
x = 1        # 1-based number of the chunk being processed (progress output only)
size = 0     # running total of rows appended to train.csv
splits = 10  # folds per chunk; only the first fold is used, so about
             # 1/splits of each chunk goes to test.csv and the rest to train.csv
df = pd.read_csv(file_path, chunksize=chunksize)

# Without shuffling (the default) the splitter keeps no state between calls,
# so a single instance can serve every chunk.
skfolds = StratifiedKFold(n_splits=splits)

# Iterate through the dataset in chunks
for chunk in df:
    # Derive the reduced class; it becomes the last column of the chunk and
    # is the label the split is stratified on.
    chunk['Attack_Class'] = reduce_attack_class(chunk['Attack'])

    # split() is a generator over all folds; take only the first one.
    train_index, test_index = next(skfolds.split(chunk, chunk['Attack_Class']))

    # Positional row selection keeps every column (features + 'Attack_Class')
    # in its original order.
    train_dataset = chunk.iloc[train_index].reset_index(drop=True)
    test_dataset = chunk.iloc[test_index].reset_index(drop=True)

    print("Chunck",x)
    size += len(train_dataset)
    x += 1

    # Let pandas open the files itself in append mode; this avoids handing it
    # a text handle opened without newline='', which pandas advises against.
    train_dataset.to_csv('train.csv', mode='a', header=False, index=False)
    test_dataset.to_csv('test.csv', mode='a', header=False, index=False)

Chunck 1
Chunck 2
Chunck 3
Chunck 4
Chunck 5
Chunck 6
Chunck 7
Chunck 8
Chunck 9
Chunck 10
Chunck 11
Chunck 12
Chunck 13
Chunck 14
Chunck 15
Chunck 16
Chunck 17
Chunck 18
Chunck 19
Chunck 20
Chunck 21
Chunck 22
Chunck 23
Chunck 24
Chunck 25
Chunck 26
Chunck 27
Chunck 28
Chunck 29
Chunck 30
Chunck 31
Chunck 32
Chunck 33
Chunck 34
Chunck 35
Chunck 36
Chunck 37
Chunck 38
Chunck 39
Chunck 40
Chunck 41
Chunck 42
Chunck 43
Chunck 44
Chunck 45


In [16]:
# Re-point the shared settings at the generated test split (presumably the
# test.csv written earlier, assuming the working directory is /kaggle/working).
# NOTE: this rebinds `file_path` and `chunksize`; cells above that read them
# will see these values if re-run afterwards.
file_path = '/kaggle/working/test.csv'
chunksize = 2_000_000

In [17]:
# Open a fresh chunked reader on the current `file_path`; with chunksize set,
# read_csv returns a one-shot iterator of DataFrames rather than a full frame.
df=pd.read_csv(file_path, chunksize=chunksize)

In [18]:
# Recompute the class distribution for the file `df` currently reads and
# print it, so it can be compared with the distribution of the full dataset.
# This consumes `df`.
results_df = dataset_class_distribution(df)
print(results_df)

        Unique Value      Count  Percentage
0             Benign  2516544.0   33.117659
1             Botnet    14304.0    0.188240
2        Brute Force    12394.0    0.163105
3               DDoS  2174843.0   28.620882
4                DoS  1787566.0   23.524326
5               MITM      753.0    0.009909
6            Malware     3905.0    0.051390
7           Password   115338.0    1.517845
8           Phishing   655862.0    8.631128
9      SQL Injection    68492.0    0.901353
10               XSS   245506.0    3.230853
11  Zero Day Exploit     3291.0    0.043309
