## Step 0
Import dependencies

In [1]:
import numpy as np
import pandas as pd
import csv
from IPython.display import display

## Step 1
Compute a pandas dataframe with the payloads from the different collections  
dataframe columns:  
`<is_malicious> | <injection_type> | <payload>`

example: 

`1 | SQL | ' OR 1=1 LIMIT 1 #`

In [2]:
def convert_txt_to_data(src_file,is_malicious,injection_type):
    """Load one payload collection into a labelled dataframe.

    Reads 'dataset/<src_file>.txt' line by line; every line becomes one row
    in the 'payload' column (the trailing newline of each line is kept).
    A preview of the first five rows is printed and displayed.

    Parameters
    ----------
    src_file : str
        File name without the '.txt' extension, relative to 'dataset/'.
    is_malicious : int
        Value written to the 'is_malicious' column of every row.
    injection_type : str
        Value written to the 'injection_type' column of every row; also
        used in the printed preview header.

    Returns
    -------
    pandas.DataFrame
        Columns 'payload', 'is_malicious', 'injection_type'; one row per line.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but drops a leading byte-order
    # mark so it does not end up glued to the first payload of the file.
    # The context manager closes the handle even if reading fails.
    with open('dataset/{}.txt'.format(src_file), 'r', encoding='utf-8-sig') as payload_file:
        payloads_txt = payload_file.readlines()

    # Create dataframe; the scalar labels are broadcast to every row
    payloads_df = pd.DataFrame(payloads_txt, columns=['payload'])
    payloads_df['is_malicious'] = is_malicious
    payloads_df['injection_type'] = injection_type

    print('First 5 lines of ' + injection_type)
    display(payloads_df.head())

    return payloads_df
    
# Source collections, in load order:
# (file stem under dataset/, injection label, malicious flag)
collection_specs = (
    ('SQLCollection', 'SQL', 1),
    ('XSSCollection', 'XSS', 1),
    ('ShellCollection', 'SHELL', 1),
    ('non-maliciousCollection', 'LEGAL', 0),
)

# Load every collection into its own labelled frame
payload_list = []
for collection, injection_type, is_malicious in collection_specs:
    collection_df = convert_txt_to_data(collection, is_malicious, injection_type)
    payload_list.append(collection_df)

# Stack the per-collection frames into one frame with a fresh 0..n-1 index
data = pd.concat(payload_list, ignore_index=True)

First 5 lines of SQL


Unnamed: 0,payload,is_malicious,injection_type
0,﻿'\n,1,SQL
1,a' or 1=1-- \n,1,SQL
2,"""a"""" or 1=1--""\n",1,SQL
3,or a = a\n,1,SQL
4,a' or 'a' = 'a\n,1,SQL


First 5 lines of XSS


Unnamed: 0,payload,is_malicious,injection_type
0,"﻿data:text/html;alert(1)/*,<svg%20onload=eval(...",1,XSS
1,"'"">*/--></title></style></textarea></script%0A...",1,XSS
2,""" onclick=alert(1)//<button ‘ onclick=alert(1)...",1,XSS
3,"';alert(String.fromCharCode(88,83,83))//';aler...",1,XSS
4,""">><marquee><img src=x onerror=confirm(1)></ma...",1,XSS


First 5 lines of SHELL


Unnamed: 0,payload,is_malicious,injection_type
0,() { 0; }; touch /tmp/blns.shellshock1.fail;\n,1,SHELL
1,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,1,SHELL
2,<<< %s(un='%s') = %u\n,1,SHELL
3,'+++ATH0\n,1,SHELL
4,/dev/null; touch /tmp/blns.fail ; echo\n,1,SHELL


First 5 lines of LEGAL


Unnamed: 0,payload,is_malicious,injection_type
0,569993989\n,0,LEGAL
1,46201\n,0,LEGAL
2,Indianapolis\n,0,LEGAL
3,20354328\n,0,LEGAL
4,A8Cyj4uzrSgkGg4szKuHeI\n,0,LEGAL


## Step 2
Data cleaning
- Remove trailing `'\n'` and surrounding whitespace
- Remove duplicates
- Remove empty data points
- Remove malicious data points of length 1
- Shuffle

Run this cell multiple times

In [5]:
# Fixed seed for the shuffle so a fresh Restart & Run All reproduces the same row order
SHUFFLE_SEED = 42


def _unwrap_bytes_repr(payload):
    """Return the text inside a b'...' / b"..." wrapper, or the payload unchanged.

    A payload is unwrapped only when it opens with b' or b" AND closes with
    the same quote character. A payload that merely starts with b' (and has no
    matching closing quote) is returned as is, instead of losing its first two
    and its last character.
    """
    if not isinstance(payload, str) or len(payload) < 3:
        return payload
    quote = payload[1]
    if payload[0] == 'b' and quote in ('"', "'") and payload[-1] == quote:
        return payload[2:-1]
    return payload


# Work on a private copy so the column assignments below never write into a
# view of another frame (avoids SettingWithCopyWarning)
data = data.copy()

# Remove ending \n and white spaces (str.strip() without arguments covers both)
data['payload'] = data['payload'].str.strip()

# Reformat rows that have the format b'<payload>' into <payload>.
# This runs BEFORE the filters below so that empty, one-character or duplicate
# payloads produced by the unwrapping are removed in this same pass.
data['payload'] = data['payload'].map(_unwrap_bytes_repr)

# Remove any empty data points
rows_initial = len(data)
data = data[data['payload'].str.len() != 0]
print('Empty data points removed: ' + str(rows_initial - len(data)))

# Remove any suspicious data points of size 1 (only malicious rows are filtered;
# one-character non-malicious payloads are kept)
rows_initial = len(data)
data = data[(data['is_malicious'] == 0) | ((data['is_malicious'] == 1) & (data['payload'].str.len() > 1))]
print('Malicious data points of size 1 removed: ' + str(rows_initial - len(data)))

# Remove duplicates, keeping the last occurrence of each payload
rows_initial = len(data)
data = data.drop_duplicates(subset='payload', keep='last')
print('Duplicate data points removed: ' + str(rows_initial - len(data)))

# Shuffle dataset (seeded, hence reproducible) and reset indices again
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
data.index.name = 'index'

# Remove payloads that can't be round-tripped through .csv using pandas,
# i.e. strings that pandas reads back as null/NA/NaN
data.to_csv('dataset/payloads.csv',encoding='UTF-8')

# Reload dataframe from saved .csv. The dataframe may contain a few null values.
# dtype=str keeps numeric-looking payloads as text; missing values still load as NaN.
data = pd.read_csv("dataset/payloads.csv",index_col='index',encoding='UTF-8',dtype={'payload': str})
rows_initial = len(data)
data = data[~data['payload'].isnull()]
print('null/NaN data points removed: ' + str(rows_initial - len(data)))

# Save to .csv
data.to_csv('dataset/payloads.csv',encoding='UTF-8')

Empty data points removed: 0
Malicious data points of size 1 removed: 0
Duplicate data points removed: 0
null/NaN data points removed: 0
