Atividade 2 TÓPICOS COMPUTACIONAIS EM CIÊNCIA DE DADOS - Ciências da Computação
Bernardo Gontijo Vaz Guimarães
Dados escolhidos UNSW_NB15

In [1]:
# Importando as bibliotecas a serem utilizadas

import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Carregando os dados a serem utilizados

In [2]:
# Carregando a lista de Features
features = pd.read_csv('csv_files/NUSW-NB15_features.csv',
                       encoding_errors='ignore',
                       index_col='No.'
                      )

In [3]:
# Informações das colunas do Dataset
features

Unnamed: 0_level_0,Name,Type,Description
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,srcip,nominal,Source IP address
2,sport,integer,Source port number
3,dstip,nominal,Destination IP address
4,dsport,integer,Destination port number
5,proto,nominal,Transaction protocol
6,state,nominal,Indicates to the state and its dependent proto...
7,dur,Float,Record total duration
8,sbytes,Integer,Source to destination transaction bytes
9,dbytes,Integer,Destination to source transaction bytes
10,sttl,Integer,Source to destination time to live value


In [4]:
# Lista para ser usada como header dos dados
header = list(features['Name'])
header

['srcip',
 'sport',
 'dstip',
 'dsport',
 'proto',
 'state',
 'dur',
 'sbytes',
 'dbytes',
 'sttl',
 'dttl',
 'sloss',
 'dloss',
 'service',
 'Sload',
 'Dload',
 'Spkts',
 'Dpkts',
 'swin',
 'dwin',
 'stcpb',
 'dtcpb',
 'smeansz',
 'dmeansz',
 'trans_depth',
 'res_bdy_len',
 'Sjit',
 'Djit',
 'Stime',
 'Ltime',
 'Sintpkt',
 'Dintpkt',
 'tcprtt',
 'synack',
 'ackdat',
 'is_sm_ips_ports',
 'ct_state_ttl',
 'ct_flw_http_mthd',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_srv_src',
 'ct_srv_dst',
 'ct_dst_ltm',
 'ct_src_ ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'attack_cat',
 'Label']

In [5]:
# Selecionando colunas a serem usadas
columns = [
     'proto',
     'state',
     'dur',
     'sbytes',
     'dbytes',
     'sttl',
     'dttl',
     'sloss',
     'dloss',
     'service',
     'Sload',
     'Dload',
     'Spkts',
     'Dpkts',
     'swin',
     'dwin',
     'stcpb',
     'dtcpb',
     'smeansz',
     'dmeansz',
     'trans_depth',
     'res_bdy_len',
     'Sjit',
     'Djit',
     'tcprtt',
     'synack',
     'ackdat',
     'is_sm_ips_ports',
     'ct_state_ttl',
     'ct_flw_http_mthd',
     'is_ftp_login',
     'ct_ftp_cmd',
     'ct_srv_src',
     'ct_srv_dst',
     'ct_dst_ltm',
     'ct_src_ ltm',
     'ct_src_dport_ltm',
     'ct_dst_sport_ltm',
     'ct_dst_src_ltm',
     'attack_cat',
     'Label'
    ]

**Colunas não usadas:** <br>
srcip e dstip -> outros atributos já trabalham as relações entre a origem e o destino <br>
stime e ltime -> colunas dur já trabalha o tratamento de tempo necessário

In [6]:
converter = {
    'is_ftp_login': lambda x: np.int64(1) if x=='1' else np.int64(0),
    'is_sm_ips_ports': lambda x: np.int64(1) if x=='1' else np.int64(0),
    'ct_ftp_cmd': lambda x: np.int64(x) if x != ' ' else np.int64(0)
}

In [7]:
dtype = {
    'attack_cat': np.str_
}

In [8]:
data1 = pd.read_csv('csv_files/data/UNSW-NB15_1.csv',names=header, usecols=columns, dtype=dtype, converters = converter)

In [9]:
data2 = pd.read_csv('csv_files/data/UNSW-NB15_2.csv',names=header, usecols=columns, dtype=dtype, converters = converter)

In [10]:
data3 = pd.read_csv('csv_files/data/UNSW-NB15_3.csv',names=header, usecols=columns, dtype=dtype, converters = converter)

In [11]:
data4 = pd.read_csv('csv_files/data/UNSW-NB15_4.csv',names=header, usecols=columns, dtype=dtype, converters = converter)

In [12]:
data = pd.concat([data1,data2,data3,data4],ignore_index=True)
data.head()
data.shape

(2540047, 41)

In [13]:
# Verificando tipos das colunas e adicionando funções ao converter para consertar
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2540047 entries, 0 to 2540046
Data columns (total 41 columns):
 #   Column            Dtype  
---  ------            -----  
 0   proto             object 
 1   state             object 
 2   dur               float64
 3   sbytes            int64  
 4   dbytes            int64  
 5   sttl              int64  
 6   dttl              int64  
 7   sloss             int64  
 8   dloss             int64  
 9   service           object 
 10  Sload             float64
 11  Dload             float64
 12  Spkts             int64  
 13  Dpkts             int64  
 14  swin              int64  
 15  dwin              int64  
 16  stcpb             int64  
 17  dtcpb             int64  
 18  smeansz           int64  
 19  dmeansz           int64  
 20  trans_depth       int64  
 21  res_bdy_len       int64  
 22  Sjit              float64
 23  Djit              float64
 24  tcprtt            float64
 25  synack            float64
 26  ackdat        

In [14]:
# Verificando a COluna que possuia valores vazios
data2['ct_ftp_cmd'].unique()

array([0, 1, 4, 2])

In [15]:
# Verificar os valores nos atributos binários
print('Label Unique Values:',data['Label'].unique())
print('is_sm_ips_ports Unique Values:',data['is_sm_ips_ports'].unique())
print('is_ftp_login Unique Values:',data['is_ftp_login'].unique())

Label Unique Values: [0 1]
is_sm_ips_ports Unique Values: [0 1]
is_ftp_login Unique Values: [0 1]


In [16]:
descr_cols = data.describe().columns

In [17]:
# Verificar relação entre os atributos numéricos e label
data.groupby(['Label'])[descr_cols].mean().iloc[:,0:15]

Unnamed: 0_level_0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,Dpkts,swin,dwin,stcpb,dtcpb
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.648411,4121.088225,41058.519224,37.100686,28.566358,5.534693,18.393768,28278460.0,2804086.0,36.797789,48.087476,165.135099,164.742815,1388011000.0,1388135000.0
1,0.730478,5848.631674,4446.658613,240.135647,45.96302,2.60339,2.073325,96886220.0,11509.53,9.056246,5.704986,46.179427,46.17784,389410300.0,389068700.0


In [18]:
data.groupby(['Label'])[descr_cols].mean().iloc[:,16:30]

Unnamed: 0_level_0,dmeansz,trans_depth,res_bdy_len,Sjit,Djit,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,311.460281,0.087121,4736.271352,1454.16596,801.282377,0.003667,0.002081,0.001586,0.001891,0.034048,0.222542,0.018695,0.022638,7.627581
1,36.425198,0.056542,829.518695,2520.446978,238.324164,0.02354,0.01162,0.01192,0.0,1.829543,0.596912,0.005942,0.006191,20.114295


In [19]:
data.groupby(['Label'])[descr_cols].mean().iloc[:,30:-1]

Unnamed: 0_level_0,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7.397207,5.384763,5.829279,3.36224,2.608133,4.961239
1,19.981509,13.720321,14.302142,13.481059,10.392299,19.861163


In [20]:
data_mean_label = data.groupby(['Label'])[descr_cols].mean()

In [21]:
for i in range(len(data_mean_label.columns)):
    relation = (data_mean_label.iloc[:,i][1])/(data_mean_label.iloc[:,i][0])
    print(relation)

1.12656645299003
1.4191959391803253
0.10830051099180006
6.472539298692753
1.6089912714405996
0.47037663704751126
0.11271887178715687
3.4261494318049732
0.004104558253416788
0.2461084392630613
0.11863766618906847
0.2796463493413187
0.2803026045183392
0.2805527113907806
0.2802814871913142
0.7666745186850078
0.116949738427453
0.6490057931590423
0.1751417167028208
1.7332595093868435
0.29742843654821927
6.4196764957620704
5.583729473193418
7.5167125148305995
0.0
53.73456138352663
2.6822460767218343
0.31782682175336
0.2734661745844068
2.6370477125525724
2.701223480052429
2.54798967469068
2.453501051855317
4.009546874705317
3.984573753148697
4.003266881833508
inf


  relation = (data_mean_label.iloc[:,i][1])/(data_mean_label.iloc[:,i][0])


In [22]:
for i in range(len(data_mean_label.columns)):
    print(data_mean_label.iloc[:,i][1],data_mean_label.iloc[:,i][0])
    print(data_mean_label.iloc[:,i][1]/data_mean_label.iloc[:,i][0])

0.7304782731268072 0.6484111711191456
1.12656645299003
5848.631673633526 4121.088224795427
1.4191959391803253
4446.658612500506 41058.51922376602
0.10830051099180006
240.13564676624657 37.100685787222076
6.472539298692753
45.96302014112169 28.56635766579952
1.6089912714405996
2.603390157586925 5.534692738840183
0.47037663704751126
2.073324763526237 18.39376788157731
0.11271887178715687
96886215.2921104 28278455.806017935
3.4261494318049732
11509.533950680985 2804085.9064675276
0.004104558253416788
9.056246362241389 36.797788768882135
0.2461084392630613
5.704985946968872 48.08747618043199
0.11863766618906847
46.17942748293561 165.13509864050437
0.2796463493413187
46.17784009735965 164.7428149185763
0.2803026045183392
389410349.26732194 1388011355.6447296
0.2805527113907806
389068669.2377717 1388135453.1710534
0.2802814871913142
98.15900934689978 128.0321791772356
0.7666745186850078
36.425198345383976 311.46028058865204
0.116949738427453
0.05654205171141953 0.08712102774337424
0.64900579

  print(data_mean_label.iloc[:,i][1]/data_mean_label.iloc[:,i][0])


In [23]:
# Iterar sobre o agrupado e calcular a diferença entre as médias dos marcados como ataque e não marcados para encontrar os que têm diferença maior que 2x
big_mean_diff = list()
for i in range(len(data_mean_label.columns)):
    relation = (data_mean_label.iloc[:,i][1])/(data_mean_label.iloc[:,i][0])
    if relation >= 2 or relation <=0.5: 
        print(data_mean_label.columns[i],relation)
        big_mean_diff.append(data_mean_label.columns[i])
big_mean_diff.remove('Label')
big_mean_diff

dbytes 0.10830051099180006
sttl 6.472539298692753
sloss 0.47037663704751126
dloss 0.11271887178715687
Sload 3.4261494318049732
Dload 0.004104558253416788
Spkts 0.2461084392630613
Dpkts 0.11863766618906847
swin 0.2796463493413187
dwin 0.2803026045183392
stcpb 0.2805527113907806
dtcpb 0.2802814871913142
dmeansz 0.116949738427453
res_bdy_len 0.1751417167028208
Djit 0.29742843654821927
tcprtt 6.4196764957620704
synack 5.583729473193418
ackdat 7.5167125148305995
is_sm_ips_ports 0.0
ct_state_ttl 53.73456138352663
ct_flw_http_mthd 2.6822460767218343
is_ftp_login 0.31782682175336
ct_ftp_cmd 0.2734661745844068
ct_srv_src 2.6370477125525724
ct_srv_dst 2.701223480052429
ct_dst_ltm 2.54798967469068
ct_src_ ltm 2.453501051855317
ct_src_dport_ltm 4.009546874705317
ct_dst_sport_ltm 3.984573753148697
ct_dst_src_ltm 4.003266881833508
Label inf


  relation = (data_mean_label.iloc[:,i][1])/(data_mean_label.iloc[:,i][0])


['dbytes',
 'sttl',
 'sloss',
 'dloss',
 'Sload',
 'Dload',
 'Spkts',
 'Dpkts',
 'swin',
 'dwin',
 'stcpb',
 'dtcpb',
 'dmeansz',
 'res_bdy_len',
 'Djit',
 'tcprtt',
 'synack',
 'ackdat',
 'is_sm_ips_ports',
 'ct_state_ttl',
 'ct_flw_http_mthd',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_srv_src',
 'ct_srv_dst',
 'ct_dst_ltm',
 'ct_src_ ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm']

Produzindo uma matriz de correlação para avaliar os atributos com

In [24]:
# Pearson correlation
num_corr = data.corr(numeric_only=True)
num_corr.iloc[:,-1]

dur                 0.001959
sbytes              0.010180
dbytes             -0.075543
sttl                0.904393
dttl                0.134947
sloss              -0.043272
dloss              -0.095855
Sload               0.192278
Dload              -0.219710
Spkts              -0.120880
Dpkts              -0.115947
swin               -0.315108
dwin               -0.313920
stcpb              -0.233422
dtcpb              -0.233513
smeansz            -0.065363
dmeansz            -0.272397
trans_depth        -0.029041
res_bdy_len        -0.027339
Sjit                0.020959
Djit               -0.054420
tcprtt              0.143109
synack              0.122253
ackdat              0.143443
is_sm_ips_ports    -0.015477
ct_state_ttl        0.873694
ct_flw_http_mthd    0.083188
is_ftp_login       -0.032715
ct_ftp_cmd         -0.029654
ct_srv_src          0.383007
ct_srv_dst          0.386509
ct_dst_ltm          0.339464
ct_src_ ltm         0.343246
ct_src_dport_ltm    0.396749
ct_dst_sport_l

In [25]:
# Kendall correlation
num_corr = data.corr(numeric_only=True, method='kendall')
num_corr.iloc[:,-1]

dur                -0.258458
sbytes             -0.348955
dbytes             -0.363397
sttl                0.736718
dttl               -0.416054
sloss              -0.280588
dloss              -0.281369
Sload               0.238405
Dload              -0.407619
Spkts              -0.292236
Dpkts              -0.364095
swin               -0.315107
dwin               -0.313918
stcpb              -0.239990
dtcpb              -0.240002
smeansz            -0.236983
dmeansz            -0.381872
trans_depth        -0.037630
res_bdy_len        -0.064622
Sjit               -0.201351
Djit               -0.233105
tcprtt             -0.180882
synack             -0.180980
ackdat             -0.181152
is_sm_ips_ports    -0.015477
ct_state_ttl        0.919764
ct_flw_http_mthd    0.132703
is_ftp_login       -0.032715
ct_ftp_cmd         -0.032716
ct_srv_src          0.222227
ct_srv_dst          0.215734
ct_dst_ltm          0.177125
ct_src_ ltm         0.192955
ct_src_dport_ltm    0.378263
ct_dst_sport_l

In [26]:
# Spearman correlation
num_corr = data.corr(numeric_only=True, method='spearman')
num_corr.iloc[:,-1]

dur                -0.315860
sbytes             -0.420797
dbytes             -0.433614
sttl                0.760892
dttl               -0.422565
sloss              -0.312964
dloss              -0.322032
Sload               0.291694
Dload              -0.491303
Spkts              -0.340038
Dpkts              -0.429284
swin               -0.315109
dwin               -0.313920
stcpb              -0.277674
dtcpb              -0.277680
smeansz            -0.283401
dmeansz            -0.455330
trans_depth        -0.037670
res_bdy_len        -0.065203
Sjit               -0.234670
Djit               -0.273119
tcprtt             -0.209009
synack             -0.209081
ackdat             -0.208811
is_sm_ips_ports    -0.015477
ct_state_ttl        0.932251
ct_flw_http_mthd    0.133600
is_ftp_login       -0.032715
ct_ftp_cmd         -0.032738
ct_srv_src          0.263057
ct_srv_dst          0.254983
ct_dst_ltm          0.206098
ct_src_ ltm         0.225864
ct_src_dport_ltm    0.407662
ct_dst_sport_l

**Colunas com correlação forte à label em todos os métodos:**<br>
sttl <br>
ct_state_ttl <br>

Investigando a proporção de linha para ataque e para não ataque.

In [27]:
data['Label'].value_counts()

Label
0    2218764
1     321283
Name: count, dtype: int64

In [28]:
# Percentual de linhas de ataque
perc_ataque = data['Label'].value_counts()[1]/data['Label'].value_counts().sum()
perc_normal = data['Label'].value_counts()[0]/data['Label'].value_counts().sum()
print('Percentual de Atq:',perc_ataque)
print('Percentual de Normal:',perc_normal)

Percentual de Atq: 0.12648702957071267
Percentual de Normal: 0.8735129704292873


**Investigando os atributos categóricos:**

In [29]:
# Colunas categóricas
data.select_dtypes(include='object').columns

Index(['proto', 'state', 'service', 'attack_cat'], dtype='object')

In [30]:
# Pegar colunas categóricas
cat_columns = list(data.select_dtypes(include='object').columns)
for coluna in cat_columns:
    print(data[coluna].value_counts())

proto
tcp     1495074
udp      990435
unas      16202
arp       10064
ospf       7798
         ...   
3pc         137
igmp         64
udt           8
rtp           7
esp           2
Name: count, Length: 135, dtype: int64
state
FIN    1478689
CON     560588
INT     490471
REQ       9043
RST        528
ECO        337
CLO        161
URH        108
ACC         43
PAR         30
TST          9
ECR          9
URN          8
no           8
MAS          8
TXD          7
Name: count, dtype: int64
service
-           1246397
dns          781668
http         206273
ftp-data     125783
smtp          81645
ftp           49090
ssh           47160
pop3           1533
dhcp            172
ssl             142
snmp            113
radius           40
irc              31
Name: count, dtype: int64
attack_cat
Generic             215481
Exploits             44525
 Fuzzers             19195
DoS                  16353
 Reconnaissance      12228
 Fuzzers              5051
Analysis              2677
Backdoor     

**Usar o teste de independência de Chi-Square para investigar a relação entre os atributos categóricos**

In [31]:
cat_columns[:-1]

['proto', 'state', 'service']

In [32]:
# Criando pares de Labels com as categorias
labels_columns = list()
for coluna in cat_columns[:-1]:
    labels_columns.append(['Label',coluna])
labels_columns

[['Label', 'proto'], ['Label', 'state'], ['Label', 'service']]

In [33]:
data[[*cat_columns[:-1],'Label']]

Unnamed: 0,proto,state,service,Label
0,udp,CON,dns,0
1,udp,CON,-,0
2,udp,CON,dns,0
3,udp,CON,dns,0
4,udp,CON,dns,0
...,...,...,...,...
2540042,tcp,FIN,ftp-data,0
2540043,tcp,CON,ftp,0
2540044,tcp,CON,ftp,0
2540045,tcp,CON,http,0


In [34]:
# Fatorizando os dados para poder aplicar o método
data_fact = data[[*cat_columns[:-1],'Label']].apply(lambda x : pd.factorize(x)[0])

In [36]:
cols_results = list()
# Loop through each pair
for par in labels_columns:
    col1, col2 = par[0], par[1]
    
    # 1. Create the contingency table
    contingency_table = pd.crosstab(data_fact[col1], data_fact[col2])
    
    # 2. Perform the Chi-Square test using chi2_contingency
    chi, p, dof, expect = chi2_contingency(contingency_table)
    
    # Store the results
    cols_results.append({
        'Col_Label': col1,
        'Col Categorica': col2,
        'Chi-square': chi,
        'p': p
    })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(cols_results)

# Display the results
print(results_df)

  Col_Label Col Categorica     Chi-square    p
0     Label          proto  449122.969709  0.0
1     Label          state  907673.334961  0.0
2     Label        service  230615.728890  0.0


Forte correlação entre label e as variáveis categóricas

**Pre-processamento: OHE nos dados categóricos**

In [37]:
#One Hot encoding colunas categóricas
ohe_cat_data = pd.get_dummies(
    data[cat_columns[:-1]]
)

In [38]:
ohe_cat_data.shape

(2540047, 164)

In [39]:
ohe_cat_data.head()

Unnamed: 0,proto_3pc,proto_a/n,proto_aes-sp3-d,proto_any,proto_argus,proto_aris,proto_arp,proto_ax.25,proto_bbn-rcc,proto_bna,...,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
# concatenando os DFs em um só DF
preproc_data = pd.concat([data[descr_cols],ohe_cat_data],axis=1)
preproc_data

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl
0,0.001055,132,164,31,29,0,0,500473.937500,621800.937500,2,...,False,False,False,False,False,False,False,False,False,False
1,0.036133,528,304,31,29,0,0,87676.085940,50480.171880,4,...,False,False,False,False,False,False,False,False,False,False
2,0.001119,146,178,31,29,0,0,521894.531300,636282.375000,2,...,False,False,False,False,False,False,False,False,False,False
3,0.001209,132,164,31,29,0,0,436724.562500,542597.187500,2,...,False,False,False,False,False,False,False,False,False,False
4,0.001169,146,178,31,29,0,0,499572.250000,609067.562500,2,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540042,0.087306,320,1828,31,29,1,2,24465.671880,146610.765600,6,...,False,True,False,False,False,False,False,False,False,False
2540043,0.365058,456,346,31,29,2,2,8743.816406,6333.240234,8,...,True,False,False,False,False,False,False,False,False,False
2540044,6.335154,1802,2088,31,29,7,9,2204.839844,2549.582764,32,...,True,False,False,False,False,False,False,False,False,False
2540045,2.200934,3498,166054,31,29,2,57,12496.513670,598375.062500,58,...,False,False,True,False,False,False,False,False,False,False


**Dividindo os dados em samples de teste e treino**

In [41]:
#Usando o train test split do scikitlearn com stratify para garantir uma proporção similar de attack e normal
test_size = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    preproc_data,
    data['Label'],
    test_size = 0.25,
    stratify = data['Label'],
    random_state=19
)

In [42]:
print('X_train Shape:',X_train.shape)
print('X_test Shape:',X_test.shape)
print('y_train Shape:',y_train.shape)
print('y_test Shape:',y_test.shape)

X_train Shape: (1905035, 201)
X_test Shape: (635012, 201)
y_train Shape: (1905035,)
y_test Shape: (635012,)


In [43]:
print('X_train Head:',X_train.head())
print('X_test Head:',X_test.head())
print('y_train Head:',y_train.head())
print('y_test Head:',y_test.head())

X_train Head:               dur  sbytes   dbytes  sttl  dttl  sloss  dloss         Sload  \
1809470  0.001059     146      178    31    29      0      0  5.514636e+05   
328557   0.019120    2854    29168    31    29      7     17  1.168201e+06   
841376   2.641740   19618  1087890    31    29      2    370  5.924883e+04   
1958201  0.000005     264        0    60     0      0      0  2.112000e+08   
2259848  0.361363     320     1908    31    29      1      2  5.910954e+03   

                Dload  Spkts  ...  service_ftp  service_ftp-data  \
1809470  6.723324e+05      2  ...        False             False   
328557   1.195021e+07     46  ...        False             False   
841376   3.290050e+06    368  ...        False             False   
1958201  0.000000e+00      2  ...        False             False   
2259848  3.697114e+04      6  ...        False              True   

         service_http  service_irc  service_pop3  service_radius  \
1809470         False        False      

In [44]:
#Normalizando os dados com standard scaler
scaler = StandardScaler()

In [45]:
# Fazer o fit dos dados usando a base de treino e depois transformar a base de teste e treino com eles
std_fit_train = scaler.fit(
    X_train
)

In [46]:
X_test_std = std_fit_train.transform(X_test)

In [47]:
X_train_std = std_fit_train.transform(X_train)

In [48]:
df_xtest_std = pd.DataFrame(X_test_std,columns=X_test.columns)
df_xtest_std.head()

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl
0,-0.049711,-0.073193,-0.226294,-0.037321,-0.718477,-0.231972,-0.288696,2.649008,-0.579892,-0.411627,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
1,-0.037628,0.01488,0.339294,-0.425923,-0.040719,0.082225,0.400033,-0.309062,0.492747,0.745126,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
2,0.04452,0.157806,3.175028,-0.425923,-0.040719,0.71062,3.190267,-0.310515,0.254439,2.559126,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
3,-0.048998,-0.037928,-0.131764,-0.425923,-0.040719,0.082225,-0.094439,-0.295737,2.402591,0.009011,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
4,-0.049627,-0.07531,-0.225189,-0.425923,-0.040719,-0.231972,-0.288696,-0.306792,-0.427787,-0.411627,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245


In [49]:
df_xtrain_std = pd.DataFrame(X_train_std,columns=X_train.columns)
df_xtrain_std.head()

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl
0,-0.049631,-0.07531,-0.225189,-0.425923,-0.040719,-0.231972,-0.288696,-0.306587,-0.420749,-0.411627,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
1,-0.04826,-0.026735,-0.045325,-0.425923,-0.040719,0.082225,0.011519,-0.3014,2.248758,0.16675,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
2,0.150814,0.273969,6.523352,-0.425923,-0.040719,-0.142202,6.245396,-0.310726,0.198872,4.399415,...,-0.140349,-0.228765,3.365166,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
3,-0.049711,-0.073193,-0.226294,-0.037321,-0.718477,-0.231972,-0.288696,1.464915,-0.579892,-0.411627,...,-0.140349,-0.228765,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
4,-0.022282,-0.072189,-0.214456,-0.425923,-0.040719,-0.187087,-0.253377,-0.311175,-0.571141,-0.359047,...,-0.140349,4.371298,-0.297162,-0.002987,-0.024405,-0.004286,-0.182347,-0.006758,-0.137693,-0.007245
