# Imports

In [None]:
import numpy as np
import pandas as pd
import socket as sk
import struct as st
import datetime as dt
import ipaddress as ip
from time import perf_counter

# Configurations

In [None]:
# Attacker address ranges, stored as base-10 integers (network byte order).
# generateLabelColumn compares with `>= low` and `< high`, so every range is
# half-open and the shared boundaries (.50, .100) belong to the later range.

# slowloris
slowloris_low, slowloris_high = (
    st.unpack('!I', sk.inet_aton(address))[0]
    for address in ('10.128.0.1', '10.128.0.50')
)

# slowhttptest
slowhttptest_low, slowhttptest_high = (
    st.unpack('!I', sk.inet_aton(address))[0]
    for address in ('10.128.0.50', '10.128.0.100')
)

# slowloris_ng
slowloris_ng_low, slowloris_ng_high = (
    st.unpack('!I', sk.inet_aton(address))[0]
    for address in ('10.128.0.100', '10.128.0.150')
)

# TCP flag values (decimal) that each get their own `flag_<value>` counter
# column in features().
# NOTE(review): presumably the standard TCP flag byte (2 = SYN, 16 = ACK,
# 18 = SYN+ACK, ...) - confirm against the capture format.
tcp_flags = [
    2, 4, 16, 17, 18, 20,
    24, 25, 82, 144, 152, 194,
]

# Prequel Preprocessing
* This function casts the raw string fields (IP addresses, hexadecimal TCP flags, date and time) into base-10 integers and datetime objects.

In [None]:
def prequelProcessing(dataset):
    """Cast the raw packet columns into numeric / datetime form.

    The input frame is NOT modified: a new frame is returned, so the function
    can safely be re-run on the same raw data (the previous in-place version
    failed on a second call because the columns were no longer strings).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns `source_ip` and `dest_ip` (dotted-quad IPv4
        strings), `tcp_flag` (hexadecimal string), `date` and `time` (strings
        that together match '%Y%m%d %H:%M:%S') and `data`.

    Returns
    -------
    pandas.DataFrame
        Same rows and column order, with `source_ip` / `dest_ip` as unsigned
        32-bit integer values, `tcp_flag` as a base-10 integer, `date` as a
        UTC-aware timestamp (date and time combined), and the `data` and
        `time` columns removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    OSError, ValueError
        If an address or flag string cannot be parsed.
    """

    def ipv4_to_int(address):
        # '!I' = network byte order, unsigned 32-bit
        return st.unpack('!I', sk.inet_aton(address))[0]

    # Parsing string to datetime object (date and time are joined first)
    timestamps = pd.to_datetime(
        dataset['date'] + ' ' + dataset['time'],
        format='%Y%m%d %H:%M:%S',
        utc=True,
    )

    # assign() returns a new frame and keeps the existing column order
    converted = dataset.assign(
        # Casting IP to a single integer
        source_ip=dataset['source_ip'].apply(ipv4_to_int),
        dest_ip=dataset['dest_ip'].apply(ipv4_to_int),
        # Casting Hexa to decimal base
        tcp_flag=dataset['tcp_flag'].apply(lambda raw: int(raw, 16)),
        date=timestamps,
    )

    # Getting rid of useless columns
    # NOTE(review): 'data' looks like it could be a typo for 'date'; it is kept
    # as in the original, so the raw file must provide a 'data' column.
    return converted.drop(columns=['data', 'time'])

# Feature Engineering

In [None]:
def features(grouped_data):
    """Summarise one group of packets as a single-row feature frame.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Packets of one group; the columns `dest_port`, `dest_ip`,
        `frame_length` and `tcp_flag` are read.

    Returns
    -------
    pandas.DataFrame
        One row with `number_requisitions` (packets sent to port 80 or 443),
        `number_different_destinations` (distinct `dest_ip` values),
        `mean_frame_length`, and one `flag_<value>` count for every entry of
        the module-level `tcp_flags` list.
    """
    ports = grouped_data['dest_port']
    flags = grouped_data['tcp_flag']

    row = {
        'number_requisitions': [np.sum(ports == 80) + np.sum(ports == 443)],
        'number_different_destinations': [len(np.unique(grouped_data['dest_ip']))],
        'mean_frame_length': [grouped_data['frame_length'].mean()],
    }

    # one counter column per monitored TCP flag value
    row.update({
        'flag_' + str(flag): [np.sum(flags == flag)]
        for flag in tcp_flags
    })

    return pd.DataFrame(row)

In [None]:
def turnToPercentil(dataset, summary, column_name):
    """Replace a column by each row's share of its time period's total.

    Works in place on `dataset` and returns None. For every row, the value in
    `column_name` is divided by the total that `summary` holds for the row's
    first index level; rows whose period total is not strictly positive are
    left unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose first index level identifies the time period.
    summary : pandas.DataFrame
        Per-period totals, indexed by the same period labels (unique index).
    column_name : str
        Column present in both frames.

    Notes
    -----
    The previous row-block loop used `summary[column_name][i]`, i.e. an integer
    treated as a position on a label-indexed Series, which pandas has
    deprecated. Totals are now looked up by label and applied in one vectorised
    step, which also avoids writing floats into integer columns slice by slice.
    """
    # period label of every row (first index level)
    period_labels = dataset.index.get_level_values(0)

    # total of the row's period, aligned row by row; NaN when the period is
    # missing from `summary`, which fails the `> 0` test below
    totals = summary[column_name].reindex(period_labels).to_numpy()

    current = dataset[column_name]
    has_total = totals > 0

    # keep the original value where there is nothing to divide by
    dataset[column_name] = current.where(~has_total, current / totals)

In [None]:
def normalizationPerTimePeriod(dataset, skip_columns=('mean_frame_length',)):
    """Turn every count column into a per-time-period share, in place.

    The per-period totals are obtained by grouping `dataset` on 'date' and
    summing; each column not listed in `skip_columns` is then handed to
    turnToPercentil together with those totals.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame that can be grouped by 'date'.
    skip_columns : collection of str, optional
        Columns left untouched. Defaults to the mean frame length, which is an
        average rather than a count. The previous version removed "the column
        at position 2", which only matched `mean_frame_length` as long as
        features() kept its exact column order.

    Returns
    -------
    None
    """
    summary = dataset.groupby('date').sum()

    # snapshot the names first: the columns are rewritten while looping
    for column in list(dataset.columns):
        if column in skip_columns:
            continue
        turnToPercentil(dataset, summary, column)

In [None]:
def generateLabelColumn(grouped):
    """Attach the ground-truth label column `y` to the feature frame.

    Labels are derived from `source_ip` using the module-level address ranges
    (each one half-open, `low <= ip < high`):
    0 = none of the ranges, 1 = slowloris, 2 = slowhttptest, 3 = slowloris_ng.

    The caller's frame is left untouched; the previous version added the `y`
    column to `grouped` itself before copying it.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Feature frame whose index levels are `date`, `source_ip` and a third,
        unnamed level (the one reset_index exposes as `level_2`; in this
        notebook it comes from the groupby/apply in preprocessing).

    Returns
    -------
    pandas.DataFrame
        New frame indexed by (`date`, `source_ip`) with the extra column `y`.

    Raises
    ------
    KeyError
        If `date`, `source_ip` or `level_2` is missing after the index reset.
    """
    # resetting the index gives a new frame, so `grouped` is not modified
    dataset = grouped.reset_index()

    # every source starts without an intruder type
    dataset['y'] = 0

    # finding the bad guys and labeling them
    source = dataset['source_ip']
    attack_ranges = (
        (1, slowloris_low, slowloris_high),
        (2, slowhttptest_low, slowhttptest_high),
        (3, slowloris_ng_low, slowloris_ng_high),
    )
    for label, low, high in attack_ranges:
        dataset.loc[(source >= low) & (source < high), 'y'] = label

    # resuming the original index
    dataset = dataset.set_index(['date', 'source_ip'])

    # getting rid of the leftover positional index level
    return dataset.drop(columns=['level_2'])

In [None]:
def preprocessing(packages, frequency):
    """Run the whole pipeline: cast, aggregate, normalise and label.

    Parameters
    ----------
    packages : pandas.DataFrame
        Raw packet table, as expected by prequelProcessing.
    frequency : str
        Width of the time windows, passed as `freq` to pandas.Grouper
        (for example '10s').

    Returns
    -------
    pandas.DataFrame
        The frame produced by generateLabelColumn: one row per time window and
        source address, holding the normalised features and the label `y`.
    """
    casted = prequelProcessing(packages)

    grouping_keys = [
        # one bucket per time window of the requested width ...
        pd.Grouper(key='date', freq=frequency),
        # ... and, inside each window, one bucket per source address
        pd.Grouper(key='source_ip'),
    ]

    # features() builds the new feature row of every bucket
    per_window = casted.groupby(grouping_keys).apply(features)

    # counts become shares of their time window (in place)
    normalizationPerTimePeriod(per_window)

    # attach the true label column and hand the result back
    return generateLabelColumn(per_window)

# Running it all together

In [None]:
# Stream the capture in fixed-size chunks, preprocess each chunk with
# 10-second windows and keep the per-chunk results in `collection`.
# NOTE(review): every chunk is grouped on its own, so a time window that
# straddles a chunk boundary ends up as separate rows in two chunks.
csv_path = 'Complements/TCP1.csv'
number_of_lines = 10000

# Count the data rows in pure Python instead of shelling out to `wc | grep`,
# which only works inside IPython on a Unix-like shell. The header line is
# excluded, and the floor of 1 keeps the progress division safe.
with open(csv_path, 'rb') as csv_file:
    total_lines = max(sum(1 for _ in csv_file) - 1, 1)

collection = list()

total_start = perf_counter()

for i, chunk in enumerate(pd.read_csv(csv_path,
                                      low_memory=False,
                                      index_col=[0],
                                      chunksize=number_of_lines)):

    # share of rows already handed to preprocessing before this chunk
    porcentage = min((i * number_of_lines) / total_lines, 1.0)

    print(round(porcentage * 100, 2), '% complete')

    preprocessed_chunk = preprocessing(chunk, '10s')
    collection.append(preprocessed_chunk)

total_stop = perf_counter()

print('#'*50, '\nTotal time:', total_stop - total_start)

In [1]:
import pandas as pd

In [2]:
# Load the whole capture at once; the first CSV column becomes the index.
packages = pd.read_csv(
    'Complements/TCP1.csv',
    index_col=[0],
    low_memory=False,
)

  mask |= (ar1 == a)


In [3]:
from modules.preprocessing import preprocessing

In [4]:
# Build the feature table from the full capture, passing '20s' as the
# frequency argument.
# NOTE(review): this calls the `preprocessing` imported from
# modules.preprocessing, presumably the packaged version of the notebook
# function defined earlier - confirm the two are in sync.
dataset = preprocessing(packages, '20s')

Applying prequel processing
Generating the features
Normalizing the features
Creating the true label array
