# Preparación del Conjunto de Datos

## Imports

In [1]:
# Importamos todas las librerias que usaremos 
# Leer el archivo
import arff
# Usar Dataframes
import pandas as pd
# Crear Arrays de N dimensiones
import numpy as np
# Usar la funcion que particionara el conjunto de datos
from sklearn.model_selection import train_test_split

## Funciones Auxiliares

### Función para leer BD

In [2]:
# Creamos la función y como parametro colocamos la BD
def load_kdd_dataset(data_path):
    # abrimos el archi en raw y lo nombramos train_set
    with open(data_path, 'r') as train_set:
    # abrimoos el archivo arff y la almacenamos en la var
        dataset = arff.load(train_set)
    # Tomamos la columna "attributes" de la columna en el archivo arff.
    # attr ira alamacenando en una lista cada valor en "attributes" 
    attributes = [attr[0] for attr in dataset["attributes"]]
    # Retornamos un Data Frame 
    #                  en horizontal iran los valores en columna data
    #                                   en vertical iran los valores en la lista que creamos "attributes" 
    return pd.DataFrame(dataset["data"], columns=attributes)

### Función para Particionar BD

In [3]:
# Creamos la funcion para segmentar en train, val, test la BD
#                        BD, Aleatoriedad, Mezcla, Muestra Estratificada
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    # 
    strat = df[stratify] if stratify else None
    # Hacemos primero la partición de train y test
    train_set, test_set = train_test_split(
        #BD      TAMAÑO          ALEATORIEDAD       MEZCLADO       MUESTREO ESTRATIFICADO
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    #--------------------------------------------------------------------------#
    # 
    strat = test_set[stratify] if stratify else None
    # Hacemos el particionado de test y val
    val_set, test_set = train_test_split(
        # #BD      TAMAÑO          ALEATORIEDAD       MEZCLADO       MUESTREO ESTRATIFICADO
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    # Retornamos todos las particiones
    return (train_set, test_set, test_set)

## Lectura del Conjunto Datos

In [4]:
df = load_kdd_dataset(r"C:\Users\Martin Farrera\OneDrive\Escritorio\Escritorio\Udemy\ML\3_Preparacion_de_Datos\NSL-KDD\KDDTrain+.arff")

## División del Conjunto de Datos

In [5]:
# Dividimos la bd en 3 sets con la funcion que creamos
train_set, val_set, test_set = train_val_test_split(df, stratify='protocol_type')

In [6]:
print("Longitud de Training Set: ", len(train_set))
print("Longtud de Validation Test: ", len(val_set))
print("Longitud de Test Set: ", len(test_set))

Longitud de Training Set:  75583
Longtud de Validation Test:  25195
Longitud de Test Set:  25195


## Limpiando los Datos 

- Recuperamos el conjunto de datos limpio y separamos sus etiquetas del resto de datos.
- No necesariamente queremos aplicar las mismas tranformacion a ambos conjuntos (ejemplos y etiquetas).

In [7]:
# Separamos etqiuetas y atributos en train_set
# Con drop elminamos la caracteristica "class" y la almacemanos en X_train
X_train = train_set.drop("class", axis = 1)
# Con el metodo copy() creamos una copia y lo hacemos especificamente en la columna "class" desde el train_set
y_train = train_set["class"].copy()

In [8]:
# EJEMPLO
# Solo como ilustracion
# Añadimos valores nulos en el conjunto que acabamos de crear X_train
# El metodo loc() señala el indice
# Se usa el metodo np.nan de numpy para poner en nulo el dato.
# Los valores que sean mayores a 400 y menores a 800 en "src_bytes" seran igual Nulo
X_train.loc[(X_train["src_bytes"]>400) & (X_train["src_bytes"]<800), "src_bytes"] = np.nan
X_train.loc[(X_train["dst_bytes"]>500) & (X_train["dst_bytes"]<2000), "dst_bytes"] = np.nan
X_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,304.0,,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,210.0,,0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,284.0,444.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


La mayoria de los algoritmos de ML no pueden trabajar con caracteristicas que contengan datos nulos.
¿Qué podemos hacer?
- Eliminar el Ejemplar.
- Eliminar el atributo.
- Remplazar con un valor determinado (zero, media, mediana, ...)

In [9]:
# Comprobamos que no exista algun valor nulo.
# isna() = "Es NA" pone un true si en algun valor exite un NA
# any() si en alguna columna hay NA coloca un TRUE
# Una se comprementa de la otra
X_train.isna().any()

duration                       False
protocol_type                  False
service                        False
flag                           False
src_bytes                       True
dst_bytes                       True
land                           False
wrong_fragment                 False
urgent                         False
hot                            False
num_failed_logins              False
logged_in                      False
num_compromised                False
root_shell                     False
su_attempted                   False
num_root                       False
num_file_creations             False
num_shells                     False
num_access_files               False
num_outbound_cmds              False
is_host_login                  False
is_guest_login                 False
count                          False
srv_count                      False
serror_rate                    False
srv_serror_rate                False
rerror_rate                    False
s

In [10]:
# Seleccionamos las filas que contienen los valores nulos
filas_conNulos = X_train[X_train.isnull().any(axis=1)]
filas_conNulos

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
108116,0.0,tcp,http,SF,304.0,,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
64957,1.0,tcp,smtp,SF,,329.0,0,0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,tcp,http,SF,206.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
99158,0.0,tcp,http,SF,291.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117260,0.0,tcp,http,SF,321.0,,0,0.0,0.0,0.0,...,2.0,255.0,1.00,0.00,0.50,0.02,0.00,0.00,0.0,0.0
110723,0.0,tcp,http,SF,361.0,,0,0.0,0.0,0.0,...,40.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
58053,0.0,tcp,http,SF,202.0,,0,0.0,0.0,0.0,...,83.0,255.0,1.00,0.00,0.01,0.01,0.00,0.00,0.0,0.0
70184,0.0,tcp,http,SF,315.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


### Opción 1: Eliminamos los ejemplos (filas) con valores nulos

In [11]:
# Copiamos el conjunto de datos para no altenar el original
X_train_copy = X_train.copy()

In [12]:
# Eliminamos los ejemplos (filas) con valores nulos
# Usamos .dropna() para eliminar los valores 
X_train_copy.dropna(subset=["src_bytes", "dst_bytes"], inplace=True)
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.0,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.0,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.0,0.0
98007,0.0,udp,domain_u,SF,46.0,139.0,0,0.0,0.0,0.0,...,255.0,254.0,1.00,0.01,0.00,0.00,0.00,0.0,0.0,0.0
16447,0.0,tcp,smtp,SF,1790.0,363.0,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90665,0.0,tcp,ftp_data,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,63.0,0.25,0.02,0.02,0.00,1.00,1.0,0.0,0.0
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.0,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.0,0.0,0.0
112657,0.0,tcp,http,SF,284.0,444.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0


In [13]:
# Contamos el numero de filas
print("El numero de filas eliminadas es:", len(X_train) - len(X_train_copy))

El numero de filas eliminadas es: 9886


### Opción 2: Eliminamos los atributos con valores nulos.

In [14]:
# Copiamos el número de filas eliminadas
X_train_copy = X_train.copy()

In [15]:
# Usamos el metodo drop() para eliminar la columna de atributo
X_train_copy.drop(["src_bytes", "dst_bytes"], axis = 1, inplace=True)
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0,0.0,0.0,0.0,0.0,0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0,0.0,0.0,0.0,0.0,0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,0,0.0,0.0,0.0,0.0,0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0,0.0,0.0,0.0,0.0,0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,0,0.0,0.0,0.0,0.0,1,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


In [16]:
print("El numero de atributos eliminados es: ", len(list(X_train)) - len(list(X_train_copy)))

El numero de atributos eliminados es:  2


### Opción 3: Rellenar los valores nulos con un valor determinado

In [17]:
# Copiamos el conjunto de datos para no alfectar el origial
X_train_copy= X_train.copy()

In [18]:
# Rellenamos los valores nulos con la media de los valores del atributos.
# Calculamos la media (Usamos el metodo ".mean()") de los atributos donde hay valores nulos.
media_srcbytes = X_train_copy["src_bytes"].mean()
media_dstbytes = X_train_copy["dst_bytes"].mean()

media_srcbytes
media_dstbytes

# Dentro deL DF los NA los llenamos con la media que obtuvimos
# Seleccionamos la columna con el atributos
# Depues... Usamos el metodo fillna() para llenar los valores nulos
X_train_copy["src_bytes"].fillna(media_srcbytes, inplace=True)
X_train_copy["dst_bytes"].fillna(media_dstbytes, inplace=True)

X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,66914.530762,53508.000000,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0.000000,0.000000,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,304.000000,9181.334754,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0.000000,0.000000,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.000000,0.000000,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0.000000,0.000000,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,210.000000,9181.334754,0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,889.000000,328.000000,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,284.000000,444.000000,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


#### Llenar los valores NA con la Mediana

Un valor muy alto en el atributo puede disparar la media. Por eso podemos usar la mediana que es el valor que esta exactamente por la mitad de todos los valores.

In [19]:
# Copiamos el conjunto de datos
X_train_copy = X_train.copy()

In [20]:
# Repetimos el proceso que usamos en la media
# Obtenemos la mediana con el metodo .median()
mediana_srcbytes = X_train_copy["src_bytes"].median()
mediana_dstbytes = X_train_copy["dst_bytes"].median()

X_train_copy["src_bytes"].fillna(mediana_srcbytes, inplace=True)
X_train_copy["dst_bytes"].fillna(mediana_srcbytes, inplace=True)

X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,43.0,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,304.0,43.0,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,210.0,43.0,0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,284.0,444.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


#### Alternativa: Usar la clase Imputer de Sklearn

Para rellenar los valores nulos con la mediana

In [21]:
X_train_copy = X_train.copy()

In [22]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [23]:
##############################################################################
#La clase imputer no admite valores categoricos
# Eliminamos los atributos categoricos

#X_train_copy_num_error = X_train_copy.select_dtypes(exclude=['object'])
#X_train_copy_num_error.info()

###########################################################

In [24]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75583 entries, 113467 to 99030
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     75583 non-null  float64
 1   protocol_type                75583 non-null  object 
 2   service                      75583 non-null  object 
 3   flag                         75583 non-null  object 
 4   src_bytes                    73696 non-null  float64
 5   dst_bytes                    67572 non-null  float64
 6   land                         75583 non-null  object 
 7   wrong_fragment               75583 non-null  float64
 8   urgent                       75583 non-null  float64
 9   hot                          75583 non-null  float64
 10  num_failed_logins            75583 non-null  float64
 11  logged_in                    75583 non-null  object 
 12  num_compromised              75583 non-null  float64
 13  root_shell 

In [26]:
# La clase imputer no admite valores categoricos
# Eliminamos los atributos categoricos

X_train_copy_num = X_train_copy.select_dtypes(exclude=['object'])
X_train_copy_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75583 entries, 113467 to 99030
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     75583 non-null  float64
 1   src_bytes                    73696 non-null  float64
 2   dst_bytes                    67572 non-null  float64
 3   wrong_fragment               75583 non-null  float64
 4   urgent                       75583 non-null  float64
 5   hot                          75583 non-null  float64
 6   num_failed_logins            75583 non-null  float64
 7   num_compromised              75583 non-null  float64
 8   root_shell                   75583 non-null  float64
 9   su_attempted                 75583 non-null  float64
 10  num_root                     75583 non-null  float64
 11  num_file_creations           75583 non-null  float64
 12  num_shells                   75583 non-null  float64
 13  num_access_