# Preparación de datos

In [12]:
# importaciones
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## 1. Utils

In [13]:
# Random seed; no cell shown in this notebook reads it.
SEED = 9603

# Numeric feature columns, written with the normalised (lowercase, underscore)
# names. The order of this list is the column order of the exported dataset.
caracteristicas_numericas = [
  # flow duration and totals
  'flow_duration',
  'total_fwd_packet', 'total_bwd_packets',
  'total_length_of_fwd_packet', 'total_length_of_bwd_packet',
  # packet length per direction
  'fwd_packet_length_max', 'fwd_packet_length_min',
  'fwd_packet_length_mean', 'fwd_packet_length_std',
  'bwd_packet_length_max', 'bwd_packet_length_min',
  'bwd_packet_length_mean', 'bwd_packet_length_std',
  # flow rates
  'flow_bytes/s', 'flow_packets/s',
  # inter-arrival times (flow / fwd / bwd)
  'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
  'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
  'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min',
  # PSH flags and header lengths per direction
  'fwd_psh_flags', 'bwd_psh_flags',
  'fwd_header_length', 'bwd_header_length',
  # packet rates per direction
  'fwd_packets/s', 'bwd_packets/s',
  # overall packet length
  'packet_length_min', 'packet_length_max', 'packet_length_mean',
  'packet_length_std', 'packet_length_variance',
  # flag counters
  'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 'psh_flag_count',
  'ack_flag_count', 'cwr_flag_count', 'ece_flag_count',
  # ratios and averages
  'down/up_ratio', 'average_packet_size',
  'fwd_segment_size_avg', 'bwd_segment_size_avg',
  # bulk columns
  'fwd_bytes/bulk_avg', 'fwd_packet/bulk_avg', 'fwd_bulk_rate_avg',
  'bwd_bytes/bulk_avg', 'bwd_packet/bulk_avg', 'bwd_bulk_rate_avg',
  # subflow columns
  'subflow_fwd_packets', 'subflow_fwd_bytes',
  'subflow_bwd_packets', 'subflow_bwd_bytes',
  # window / segment columns
  'fwd_init_win_bytes', 'bwd_init_win_bytes',
  'fwd_act_data_pkts', 'fwd_seg_size_min',
  # active / idle columns
  'active_mean', 'active_std', 'active_max', 'active_min',
  'idle_mean', 'idle_std', 'idle_max', 'idle_min',
]

# Presumably the columns that may contain null values — TODO confirm; no cell
# shown in this notebook reads this list.
caracteristicas_var_null = [
  'flow_bytes/s',
  'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
]

# Presumably the columns that may contain infinite values — TODO confirm; no
# cell shown in this notebook reads this list.
caracteristicas_var_max_inf = ['flow_bytes/s', 'flow_packets/s']

# Columns removed right after loading. They use the raw CSV header names
# because the drop happens before the column names are normalised.
caracteristicas_no_utiles = [
  'Flow ID',
  'Src IP', 'Src Port',
  'Dst IP',
  'Timestamp',
  'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Count',
]

# Nominal columns (normalised names); they are encoded and then dropped.
caracteristicas_nominales = ['dst_port', 'protocol']

# Name of the target column.
caracteristica_objetivo = "label"

# Accumulates the names of the encoded columns created by later cells.
new_chars_encoded = []

In [14]:
# Load the dataset from the DB folder next to this notebook.
# (A plain string literal is enough: the path has no placeholders.)
df = pd.read_csv("DB/dataset.csv")
print("Dataset cargado")

Dataset cargado


## 2. Selección de datos relevantes

In [15]:
# Remove the columns listed in caracteristicas_no_utiles. They are addressed by
# their raw header names because the header has not been normalised yet.
df = df.drop(columns=caracteristicas_no_utiles)

## 3. Limpieza de datos

In [16]:
# Normalise the header: trim surrounding whitespace, lowercase, and replace
# inner spaces with underscores.
columnas_minusculas = df.columns.str.strip().str.lower()
df.columns = columnas_minusculas.str.replace(" ", "_")
print(f"Total de características: {df.shape[1]}")
print(df.columns)

Total de características: 76
Index(['dst_port', 'protocol', 'flow_duration', 'total_fwd_packet',
       'total_bwd_packets', 'total_length_of_fwd_packet',
       'total_length_of_bwd_packet', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes/s', 'flow_packets/s',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags',
       'fwd_header_length', 'bwd_header_length', 'fwd_packets/s',
       'bwd_packets/s', 'packet_length_min', 'packet_length_max',
       'packet_length_mean', 'packet_length_std', 'packet_length_variance',
       'fin_flag_count', 'syn_flag_count', 'rst_flag_

In [17]:
# Normalise the target labels: uppercase, remove the mis-encoded character
# sequence, then turn spaces and hyphens into underscores.
etiquetas_normalizadas = (
  df[caracteristica_objetivo]
  .str.upper()
  .str.replace("� ", "")
  .str.replace(" ", "_")
  .str.replace("-", "_")
)
df[caracteristica_objetivo] = etiquetas_normalizadas

# Compute the distinct labels once and report both their count and values.
etiquetas_unicas = df[caracteristica_objetivo].unique()
print(f"Total de etiquetas: {len(etiquetas_unicas)}")
print(etiquetas_unicas)

Total de etiquetas: 10
['BENIGN' 'DDOS' 'DOS_HULK' 'PORTSCAN' 'DOS_GOLDENEYE' 'FTP_PATATOR'
 'SSH_PATATOR' 'DOS_SLOWHTTPTEST' 'DOS_SLOWLORIS' 'BOT']


## 4. Creación de nuevos datos

In [18]:
# dst_port is categorical with high cardinality, so it is first reduced to the
# three IANA port ranges and then one-hot encoded:
#   0 - 1023      : well-known ports, used by the system (or root) or by
#                   programs with privileged access
#   1024 - 49151  : registered ports, used by ordinary user processes/programs
#   49152 - 65535 : dynamic and/or private ports

# drop="first" leaves out the first category seen by the encoder, so the
# remaining dummy columns are not redundant with each other.
chars_encoder = OneHotEncoder(drop="first", sparse_output=False)

# pd.cut bins are right-closed by default, so -1 as the lowest edge keeps
# port 0 inside the first range.
bins = [-1, 1023, 49151, 65535]
labels = ["well_known", "registered", "dynamic_private"]
df["port_type"] = pd.cut(df["dst_port"], bins=bins, labels=labels)

chars_encoded_array = chars_encoder.fit_transform(df[["port_type"]])
port_type_columns = chars_encoder.get_feature_names_out()
df_chars_encoded = pd.DataFrame(
  columns=port_type_columns,
  data=chars_encoded_array
)

# Both indexes are reset so the two frames are joined by row position.
# Selecting only nominal + numeric + target columns leaves the helper
# port_type column out of the result.
df = pd.concat(
  [
    df_chars_encoded.reset_index(drop=True),
    df[caracteristicas_nominales + caracteristicas_numericas + [caracteristica_objetivo]].reset_index(drop=True)
  ],
  axis=1
)

# Register each dummy column name only once. An unconditional append would
# duplicate the names when this cell is re-run, and the later column selection
# built from new_chars_encoded would then return duplicated columns.
for char in port_type_columns:
  if char not in new_chars_encoded:
    new_chars_encoded.append(char)

df[new_chars_encoded+caracteristicas_nominales]

Unnamed: 0,port_type_registered,port_type_well_known,dst_port,protocol
0,0.0,1.0,53,17
1,0.0,1.0,443,6
2,0.0,1.0,80,6
3,0.0,1.0,80,6
4,0.0,1.0,80,6
...,...,...,...,...
169923,0.0,1.0,80,6
169924,0.0,1.0,53,17
169925,0.0,1.0,80,6
169926,0.0,1.0,80,6


In [19]:
# Binary indicator columns for the two protocol numbers handled explicitly
# (in the IANA registry 6 is TCP and 17 is UDP). Rows with any other protocol
# value get 0 in both columns.
for numero_protocolo in (6, 17):
  columna = f"protocol_{numero_protocolo}"
  df[columna] = (df['protocol'] == numero_protocolo).astype(int)
  # Guarded append: re-running this cell must not register the name twice,
  # otherwise the later column selection would return duplicated columns.
  if columna not in new_chars_encoded:
    new_chars_encoded.append(columna)

df[new_chars_encoded+caracteristicas_nominales]

Unnamed: 0,port_type_registered,port_type_well_known,protocol_6,protocol_17,dst_port,protocol
0,0.0,1.0,0,1,53,17
1,0.0,1.0,1,0,443,6
2,0.0,1.0,1,0,80,6
3,0.0,1.0,1,0,80,6
4,0.0,1.0,1,0,80,6
...,...,...,...,...,...,...
169923,0.0,1.0,1,0,80,6
169924,0.0,1.0,0,1,53,17
169925,0.0,1.0,1,0,80,6
169926,0.0,1.0,1,0,80,6


## 5. Integración de datos

In [20]:
# Final column layout: encoded columns first, then the numeric features, and
# the target column last. The original nominal columns are dropped.
columnas_finales = new_chars_encoded + caracteristicas_numericas + [caracteristica_objetivo]
df = df.drop(columns=caracteristicas_nominales)[columnas_finales]
df

Unnamed: 0,port_type_registered,port_type_well_known,protocol_6,protocol_17,flow_duration,total_fwd_packet,total_bwd_packets,total_length_of_fwd_packet,total_length_of_bwd_packet,fwd_packet_length_max,...,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,0.0,1.0,0,1,31522,2,2,72.0,234.0,36.0,...,8,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,BENIGN
1,0.0,1.0,1,0,118265328,34,34,1013.0,34323.0,205.0,...,32,599222.5,742745.670065,1124423.0,74022.0,58518591.0,583972.620524,58931522.0,58105660.0,BENIGN
2,0.0,1.0,1,0,4014099,9,5,20.0,11595.0,20.0,...,20,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DDOS
3,0.0,1.0,1,0,280949,10,6,391.0,11595.0,391.0,...,32,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
4,0.0,1.0,1,0,72016,10,8,339.0,11606.0,339.0,...,20,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169923,0.0,1.0,1,0,3246,7,7,353.0,11595.0,353.0,...,32,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
169924,0.0,1.0,0,1,30774,4,2,136.0,250.0,34.0,...,8,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,BENIGN
169925,0.0,1.0,1,0,149954,7,7,344.0,11595.0,344.0,...,20,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,DOS_HULK
169926,0.0,1.0,1,0,5873867,8,6,20.0,11595.0,20.0,...,20,526011.0,0.000000,526011.0,526011.0,5347856.0,0.000000,5347856.0,5347856.0,DDOS


In [21]:
# Show the names of the encoded columns collected in new_chars_encoded.
print(new_chars_encoded)

['port_type_registered', 'port_type_well_known', 'protocol_6', 'protocol_17']


In [22]:
# Write the prepared dataset into the modelling stage's DB folder.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# extra unnamed first column — confirm the downstream reader expects it
# before switching to index=False.
ruta_salida = '../4_modelado/DB/dataset.csv'
df.to_csv(ruta_salida)
print('Guardado')

Guardado
