In [8]:
import numpy as np
import pandas as pd

In [9]:
csv_list = [
    r'UNSW-NB15 - CSV Files\UNSW-NB15_1.csv',
    r'UNSW-NB15 - CSV Files\UNSW-NB15_2.csv',
    r'UNSW-NB15 - CSV Files\UNSW-NB15_3.csv',
    r'UNSW-NB15 - CSV Files\UNSW-NB15_4.csv',
]
feature_name_df = pd.read_csv(r'UNSW-NB15 - CSV Files\NUSW-NB15_features.csv')
feature_name_list = list(feature_name_df['Name'])
feature_name_df

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


### Read Dataset from CSV Files

In [10]:
data = pd.concat([pd.read_csv(csv,names=feature_name_list) for csv in csv_list],ignore_index=True)
data.head()

  data = pd.concat([pd.read_csv(csv,names=feature_name_list) for csv in csv_list],ignore_index=True)
  data = pd.concat([pd.read_csv(csv,names=feature_name_list) for csv in csv_list],ignore_index=True)


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


### 去除与ip和时间等非描述Session的列

In [11]:
_data = data.drop(columns=['srcip','dstip','Stime','Ltime'])

### 整理列的数据格式

In [12]:
def to_intger(x):
    if isinstance(x, int):
        return x
    elif isinstance(x, str):
        try:
            if x[:2] == '0b': return int(x,2)
            elif x[:2] == '0o': return int(x,8)
            elif x[:2] == '0x': return int(x,16)
            else: return int(x)
        except ValueError as error:
            if x in '- ':
                return 0
            print(error)
            assert(False)
    elif isinstance(x, float):
        if x.is_integer():
            return int(x)
        else:
            return int(x)
    else:
        print("Unknown Dtype {}".format(type(x)))
        assert(False)

# clean up
_data['sport'] = _data['sport'].apply(to_intger)
_data['dsport'] = _data['dsport'].apply(to_intger)
_data['ct_ftp_cmd'] = _data['ct_ftp_cmd'].apply(to_intger)
print(_data['sport'].dtype, _data['dsport'].dtype,_data['ct_ftp_cmd'].dtype)

int64 int64 int64


### 数据清理

In [13]:
for i,c in enumerate(_data.columns):
    _l = len(_data[_data[c].isna()])
    if _l > 0:
        print(_data.columns[i],_l)

ct_flw_http_mthd 1348145
is_ftp_login 1429879
attack_cat 2218764


In [14]:
print(feature_name_df[feature_name_df['Name']=='ct_flw_http_mthd']['Description'])
print(_data['ct_flw_http_mthd'].unique())
_data['ct_flw_http_mthd'] = _data['ct_flw_http_mthd'].fillna(-1.0)
print(_data['ct_flw_http_mthd'].unique())

37    No. of flows that has methods such as Get and ...
Name: Description, dtype: object
[ 0.  1.  2.  4. 14.  8.  6. 12. 10.  3.  5. 36.  9. nan 16. 25. 30.]
[ 0.  1.  2.  4. 14.  8.  6. 12. 10.  3.  5. 36.  9. -1. 16. 25. 30.]


In [15]:
print(feature_name_df[feature_name_df['Name']=='is_ftp_login']['Description'])
print(_data['is_ftp_login'].unique())
_data['is_ftp_login'] = _data['is_ftp_login'].fillna(-1.0)
print(_data['is_ftp_login'].unique())

38    If the ftp session is accessed by user and pas...
Name: Description, dtype: object
[ 0.  1. nan  2.  4.]
[ 0.  1. -1.  2.  4.]


### 端口处理

In [16]:
print("Num of sport: {}".format(len(_data['sport'].unique())))
print("Num of dsport: {}".format(len(_data['dsport'].unique())))

Num of sport: 64599
Num of dsport: 64627


In [17]:
#信息熵
def entropy(data):
    a = pd.value_counts(data) / len(data)
    return sum(np.log2(a) * a * (-1))

#str1|str2的信息增益
def information_gain(data,str1,str2):
    e1 = data.groupby(str1).apply(lambda x:entropy(x[str2]))
    p1 = pd.value_counts(data[str1]) / len(data[str1])
    
    e2 = sum(e1 * p1)
    return entropy(data[str2]) - e2
    
# dsport对proto的信息增益率
information_gain(_data,'proto','dsport')/entropy(_data['proto'])

0.8070112169392398

In [18]:
_data = _data.drop(columns=['sport','dsport'])

### one-hot编码

In [19]:
print(_data['state'].unique())
print(_data['proto'].unique())
print(_data['service'].unique())


['CON' 'INT' 'FIN' 'URH' 'REQ' 'ECO' 'RST' 'CLO' 'TXD' 'URN' 'no' 'ACC'
 'PAR' 'MAS' 'TST' 'ECR']
['udp' 'arp' 'tcp' 'ospf' 'icmp' 'igmp' 'sctp' 'udt' 'sep' 'sun-nd'
 'swipe' 'mobile' 'pim' 'rtp' 'ipnip' 'ip' 'ggp' 'st2' 'egp' 'cbt' 'emcon'
 'nvp' 'igp' 'xnet' 'argus' 'bbn-rcc' 'chaos' 'pup' 'hmp' 'mux' 'dcn'
 'prm' 'trunk-1' 'xns-idp' 'trunk-2' 'leaf-1' 'leaf-2' 'irtp' 'rdp'
 'iso-tp4' 'netblt' 'mfe-nsp' 'merit-inp' '3pc' 'xtp' 'idpr' 'tp++' 'ddp'
 'idpr-cmtp' 'ipv6' 'il' 'idrp' 'ipv6-frag' 'sdrp' 'ipv6-route' 'gre'
 'rsvp' 'mhrp' 'bna' 'esp' 'i-nlsp' 'narp' 'ipv6-no' 'tlsp' 'skip'
 'ipv6-opts' 'any' 'cftp' 'sat-expak' 'kryptolan' 'rvd' 'ippc' 'sat-mon'
 'ipcv' 'visa' 'cpnx' 'cphb' 'wsn' 'pvp' 'br-sat-mon' 'wb-mon' 'wb-expak'
 'iso-ip' 'secure-vmtp' 'vmtp' 'vines' 'ttp' 'nsfnet-igp' 'dgp' 'tcf'
 'eigrp' 'sprite-rpc' 'larp' 'mtp' 'ax.25' 'ipip' 'micp' 'aes-sp3-d'
 'encap' 'etherip' 'pri-enc' 'gmtp' 'pnni' 'ifmp' 'aris' 'qnx' 'a/n'
 'scps' 'snp' 'ipcomp' 'compaq-peer' 'ipx-n-ip' 'vrrp' 

In [20]:
_data = pd.get_dummies(_data,columns=['state','proto','service'],dtype=bool)
_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2540047 entries, 0 to 2540046
Columns: 204 entries, dur to service_ssl
dtypes: bool(164), float64(12), int64(27), object(1)
memory usage: 1.1+ GB


### 整理Label和类别

In [21]:
_data['Label'] = _data['Label'].astype(bool)
print(_data['Label'].dtype)
print(_data['Label'].value_counts())

bool
False    2218764
True      321283
Name: Label, dtype: int64


In [22]:
_data.columns

Index(['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'Sload',
       'Dload', 'Spkts',
       ...
       'service_ftp', 'service_ftp-data', 'service_http', 'service_irc',
       'service_pop3', 'service_radius', 'service_smtp', 'service_snmp',
       'service_ssh', 'service_ssl'],
      dtype='object', length=204)

In [23]:
print(_data['attack_cat'].unique())
_data['attack_cat'] = _data['attack_cat'].fillna('Normal')
_data['attack_cat'] = _data['attack_cat'].apply(str.strip)
print(_data['attack_cat'].value_counts())
print(_data['attack_cat'].unique())

[nan 'Exploits' 'Reconnaissance' 'DoS' 'Generic' 'Shellcode' ' Fuzzers'
 'Worms' 'Backdoors' 'Analysis' ' Reconnaissance ' 'Backdoor' ' Fuzzers '
 ' Shellcode ']
Normal            2218764
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             1795
Shellcode            1511
Backdoors             534
Worms                 174
Name: attack_cat, dtype: int64
['Normal' 'Exploits' 'Reconnaissance' 'DoS' 'Generic' 'Shellcode'
 'Fuzzers' 'Worms' 'Backdoors' 'Analysis' 'Backdoor']


In [24]:
attack_cat_dict = {'Normal':0, 
                   'Exploits':1,
                   'Reconnaissance':2,
                   'DoS':3, 
                   'Generic':4,
                   'Shellcode':5,
                   'Fuzzers':6,
                   'Worms':7, 
                   'Backdoors':8, 
                   'Analysis':9,
                   'Backdoor':10}
_data['attack_cat'] = _data['attack_cat'].apply(lambda x:attack_cat_dict[x])
print(_data['attack_cat'].value_counts())
print(_data['attack_cat'].unique())

0     2218764
4      215481
1       44525
6       24246
3       16353
2       13987
9        2677
10       1795
5        1511
8         534
7         174
Name: attack_cat, dtype: int64
[ 0  1  2  3  4  5  6  7  8  9 10]


### 输出数据文件为npy文件

In [25]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(_data,test_size=0.02,shuffle=True)
test,dev = train_test_split(test,test_size=0.5)

In [26]:
feature_col_list = list(_data.columns)
feature_col_list.remove('Label')
feature_col_list.remove('attack_cat')

In [27]:
Y_train_bin = np.asarray(train['Label']).astype(np.float32)
Y_train_mul = np.asarray(train['attack_cat']).astype(np.float32)
X_train = np.asarray(train[feature_col_list]).astype(np.float32)

Y_test_bin = np.asarray(test['Label']).astype(np.float32)
Y_test_mul = np.asarray(test['attack_cat']).astype(np.float32)
X_test = np.asarray(test[feature_col_list]).astype(np.float32)

Y_dev_bin = np.asarray(dev['Label']).astype(np.float32)
Y_dev_mul = np.asarray(dev['attack_cat']).astype(np.float32)
X_dev = np.asarray(dev[feature_col_list]).astype(np.float32)

In [30]:
import os
if not os.path.exists("npy_file"):
    os.mkdir("npy_file")
np.save(r'npy_file/Y_train_bin.npy',Y_train_bin)
np.save(r'npy_file/Y_train_mul.npy',Y_train_mul)
np.save(r'npy_file/X_train.npy',X_train)


np.save(r'npy_file/Y_test_bin.npy',Y_test_bin)
np.save(r'npy_file/Y_test_mul.npy',Y_test_mul)
np.save(r'npy_file/X_test.npy',X_test)

np.save(r'npy_file/Y_dev_bin.npy',Y_dev_bin)
np.save(r'npy_file/Y_dev_mul.npy',Y_dev_mul)
np.save(r'npy_file/X_dev.npy',X_dev)

#### 输出数据文件约1.92GB