In [1]:
import pandas as pd
import numpy as np
import boto3

In [2]:
# Question 1

# Reading the data into pandas dataframe.
df = pd.read_csv("KDDTrain+.txt", header=None)
df.columns = ['Header' + str(i) for i in range(len(df.columns))]

In [3]:
df['id'] = np.arange(1, df.shape[0]+1) # Adding id column with unique increasing integers for each record.

In [4]:
df.head()

Unnamed: 0,Header0,Header1,Header2,Header3,Header4,Header5,Header6,Header7,Header8,Header9,...,Header34,Header35,Header36,Header37,Header38,Header39,Header40,Header41,Header42,id
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,1
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,2
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,3
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,4
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,5


In [5]:
# Question 2. Used docker container to run DynamoDB locally.

print('creating dynamodb resource')
dynamodb = boto3.resource(
    'dynamodb',
    endpoint_url='http://localhost:8000', ### This is the URL for my DynamoDB docker container
    region_name='us-east-1', ### This is a bit ugly but you have to specify valid AWS region for this code to work!
    aws_access_key_id='access_key',
    aws_secret_access_key='secret_key',
    verify=False)

print ('got resource:', dynamodb) ## This is a confirmation that I'm able to connect to DynamoDB without problems 

creating dynamodb resource
got resource: dynamodb.ServiceResource()


In [12]:
# Question 3.

table = dynamodb.Table('NSL-KDD')

#delete the table if it exists
try:
    table.delete() 
except: 
    print('Table does not exist already!')


In [13]:
### Let's see if we can create the table or retrieve it if it is already created.

result = dynamodb.create_table( ### Now, I'm creating the table
    TableName='NSL-KDD',
    KeySchema=[
        {
            'AttributeName': 'ID',
            'KeyType': 'HASH'  # Partition key
        }
    ],
    AttributeDefinitions=[
        {
                'AttributeName': 'ID',
                'AttributeType': 'S'
        }
    ],
    ProvisionedThroughput={
        'ReadCapacityUnits': 1,
        'WriteCapacityUnits': 1
    }
)
print('Created table:', result)

Created table: dynamodb.Table(name='NSL-KDD')


In [14]:
table = dynamodb.Table('NSL-KDD')
for j in range(1000): # used a subset of first 1000 records after long wait-time and crashes for Q3,4 and 5.
    response = table.put_item(
        Item={
            'ID': str(df['id'][j]),
            'Duration': str(df['Header0'][j]),
            'Protocol Type': df['Header1'][j],
            'Service': df['Header2'][j],
            'Flag': df['Header3'][j],
            'Src Bytes': str(df['Header4'][j]),
            'Dst Bytes': str(df['Header5'][j]),
            'Land': str(df['Header6'][j]),
            'Wrong Fragment': str(df['Header7'][j]), 
            'Urgent': str(df['Header8'][j]),
            'Hot': str(df['Header9'][j]),
            'Logins': str(df['Header10'][j]),
            'Logged In': str(df['Header11'][j]),
            'Compromised': str(df['Header12'][j]),
            'Root Shell': str(df['Header13'][j]),
            'Su Attempted': str(df['Header14'][j]),
            'Num Root': str(df['Header15'][j]),
            'Creations': str(df['Header16'][j]),
            'Num Shells': str(df['Header17'][j]),
            'Files': str(df['Header18'][j]),
            'Cmds': str(df['Header19'][j]),
            'Is Hot Logins': str(df['Header20'][j]),
            'Is Guest Login': str(df['Header21'][j]),
            'Count': str(df['Header22'][j]),
            'Srv Count': str(df['Header23'][j]),
            'Serror Rate': str(df['Header24'][j]),
            'Srv Serror Rate': str(df['Header25'][j]),
            'Rerror Rate': str(df['Header26'][j]),
            'Srv Rerror Rate': str(df['Header27'][j]),
            'Same Srv Rate': str(df['Header28'][j]),
            'Diff Srv Rate': str(df['Header29'][j]),
            'Rate1': str(df['Header30'][j]),
            'Dst Host Count': str(df['Header31'][j]),
            'Count': str(df['Header32'][j]),
            'Srv Rate': str(df['Header33'][j]),
            'Rate2': str(df['Header34'][j]),
            'Src Port Rate': str(df['Header35'][j]),
            'Host Rate': str(df['Header36'][j]),
            'Rate3': str(df['Header37'][j]),
            'Serror Rate': str(df['Header38'][j]),
            'Rate4': str(df['Header39'][j]),
            'Rerror Rate': str(df['Header40'][j]),
            'Class': df['Header41'][j],
            'Difficulty Level': str(df['Header42'][j]),       
        }
    )

print("Put Item succeeded")
print(response)

Put Item succeeded
{'ResponseMetadata': {'RequestId': '5d254f47-008f-4bd2-b657-bfd3be414f43', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Fri, 23 Sep 2022 01:10:14 GMT', 'content-type': 'application/x-amz-json-1.0', 'x-amz-crc32': '2745614147', 'x-amzn-requestid': '5d254f47-008f-4bd2-b657-bfd3be414f43', 'content-length': '2', 'server': 'Jetty(9.4.48.v20220622)'}, 'RetryAttempts': 0}}


In [15]:
from boto3.dynamodb.conditions import Key

#Getting 10 items from the table.
table = dynamodb.Table('NSL-KDD')
response = table.scan(
    Limit=10 
)

print (response['Items'])

[{'Rate1': '0.0', 'Num Root': '0', 'Rate3': '0.0', 'Rate2': '0.0', 'Rate4': '0.0', 'Num Shells': '0', 'Srv Serror Rate': '0.0', 'Protocol Type': 'tcp', 'Creations': '0', 'Count': '38', 'Dst Bytes': '0', 'Is Guest Login': '0', 'Serror Rate': '0.0', 'Is Hot Logins': '0', 'Logged In': '1', 'Dst Host Count': '4', 'ID': '49', 'Src Bytes': '334', 'Files': '0', 'Src Port Rate': '1.0', 'Srv Rerror Rate': '0.0', 'Urgent': '0', 'Srv Rate': '1.0', 'Service': 'ftp_data', 'Duration': '0', 'Rerror Rate': '0.0', 'Hot': '0', 'Diff Srv Rate': '0.0', 'Flag': 'SF', 'Land': '0', 'Compromised': '0', 'Srv Count': '2', 'Host Rate': '0.18', 'Logins': '0', 'Root Shell': '0', 'Wrong Fragment': '0', 'Cmds': '0', 'Class': 'warezclient', 'Su Attempted': '0', 'Same Srv Rate': '1.0', 'Difficulty Level': '12'}, {'Rate1': '0.25', 'Num Root': '0', 'Rate3': '0.0', 'Rate2': '0.0', 'Rate4': '0.0', 'Num Shells': '0', 'Srv Serror Rate': '0.0', 'Protocol Type': 'tcp', 'Creations': '0', 'Count': '255', 'Dst Bytes': '4482', 'I

In [31]:
# Question 4.

from boto3.dynamodb.conditions import Attr

scan_kwargs = {
    'FilterExpression': Attr('Class').eq('neptune') # Filters only the neptune class attacks.
}
response = table.scan(**scan_kwargs)
print(response.get('Items', []))

[{'Rate1': '0.0', 'Num Root': '0', 'Rate3': '1.0', 'Rate2': '0.06', 'Rate4': '0.0', 'Num Shells': '0', 'Srv Serror Rate': '1.0', 'Protocol Type': 'tcp', 'Creations': '0', 'Count': '7', 'Dst Bytes': '0', 'Is Guest Login': '0', 'Serror Rate': '1.0', 'Is Hot Logins': '0', 'Logged In': '0', 'Dst Host Count': '255', 'ID': '762', 'Src Bytes': '0', 'Files': '0', 'Src Port Rate': '0.0', 'Srv Rerror Rate': '0.0', 'Urgent': '0', 'Srv Rate': '0.03', 'Service': 'ssh', 'Duration': '0', 'Rerror Rate': '0.0', 'Hot': '0', 'Diff Srv Rate': '0.05', 'Flag': 'S0', 'Land': '0', 'Compromised': '0', 'Srv Count': '7', 'Host Rate': '0.0', 'Logins': '0', 'Root Shell': '0', 'Wrong Fragment': '0', 'Cmds': '0', 'Class': 'neptune', 'Su Attempted': '0', 'Same Srv Rate': '0.07', 'Difficulty Level': '20'}, {'Rate1': '0.0', 'Num Root': '0', 'Rate3': '1.0', 'Rate2': '0.06', 'Rate4': '0.0', 'Num Shells': '0', 'Srv Serror Rate': '1.0', 'Protocol Type': 'tcp', 'Creations': '0', 'Count': '3', 'Dst Bytes': '0', 'Is Guest Log

In [26]:
# Question 5.

from boto3.dynamodb.conditions import Attr

# Creating a list with the DoS attacks to filter.
DoS = ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm', 'worm']
scan_kwargs = {
    'FilterExpression': Attr('Class').is_in(DoS) # Filters only the items with DoS attacks.
}
response = table.scan(**scan_kwargs)
print(response.get('Items', []))


[{'Rate1': '0.0', 'Num Root': '0', 'Rate3': '1.0', 'Rate2': '0.06', 'Rate4': '0.0', 'Num Shells': '0', 'Srv Serror Rate': '1.0', 'Protocol Type': 'tcp', 'Creations': '0', 'Count': '7', 'Dst Bytes': '0', 'Is Guest Login': '0', 'Serror Rate': '1.0', 'Is Hot Logins': '0', 'Logged In': '0', 'Dst Host Count': '255', 'ID': '762', 'Src Bytes': '0', 'Files': '0', 'Src Port Rate': '0.0', 'Srv Rerror Rate': '0.0', 'Urgent': '0', 'Srv Rate': '0.03', 'Service': 'ssh', 'Duration': '0', 'Rerror Rate': '0.0', 'Hot': '0', 'Diff Srv Rate': '0.05', 'Flag': 'S0', 'Land': '0', 'Compromised': '0', 'Srv Count': '7', 'Host Rate': '0.0', 'Logins': '0', 'Root Shell': '0', 'Wrong Fragment': '0', 'Cmds': '0', 'Class': 'neptune', 'Su Attempted': '0', 'Same Srv Rate': '0.07', 'Difficulty Level': '20'}, {'Rate1': '0.0', 'Num Root': '0', 'Rate3': '1.0', 'Rate2': '0.06', 'Rate4': '0.0', 'Num Shells': '0', 'Srv Serror Rate': '1.0', 'Protocol Type': 'tcp', 'Creations': '0', 'Count': '3', 'Dst Bytes': '0', 'Is Guest Log

Going off what has been discussed in class and the lecture notes, I think a Key-Value database model works fairly well for this task. However, I think a column-based model might be interesting to try, it could be useful to organize the data in this way, and maybe make use of a super column for the last 2 features that represent the activity type, while the other columns for the traffic input can remain in their own columns or even have their own super column, in order to categorize the features as a whole in: traffic input features and activity type features.