# Creación de Transformadores y Pipelines personalizados
En este notebook se muestra la creación de transformadores y Pipelines personalizados

# Imports
Importamos librerias que usaremos

In [1]:
import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Funciones Auxiliares

## Lector de DataSet

In [2]:
def load_kdd_dataset(data_path):
    with open(data_path, 'r') as train_set:
        dataset = arff.load(train_set)
    # Extraemos el nombre de los atributos para colocarlo como columnas
    attributes = [attr[0] for attr in dataset['attributes']]
    return pd.DataFrame(dataset['data'], columns=attributes)

## Divisor de DataSet

In [3]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    # Primero se agrega la parte del ds
    # Despues se ejecuta la condicion 
    # Se coloca si hay dato en Stratify y si no hay nada es None
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size =0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)  

# Ejecucion de Funciones Auxiliares

## Lectura del Conjunto de Datos

In [4]:
df = load_kdd_dataset("NSL-KDD/KDDTrain+.arff")

In [5]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,tcp,ftp_data,SF,491.0,0.0,0,0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0.0,udp,other,SF,146.0,0.0,0,0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0.0,tcp,http,SF,232.0,8153.0,0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0.0,tcp,http,SF,199.0,420.0,0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
125969,8.0,udp,private,SF,105.0,145.0,0,0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal
125970,0.0,tcp,smtp,SF,2231.0,384.0,0,0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal
125971,0.0,tcp,klogin,S0,0.0,0.0,0,0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


## División del Conjunto de Datos

In [6]:
train_set, val_set, test_set = train_val_test_split(df, stratify='protocol_type')

In [7]:
print('Longitud de Train Set: ', len(train_set))
print('Longitud de Val Set: ', len(val_set))
print('Longitud de Test Set: ', len(test_set))

Longitud de Train Set:  75583
Longitud de Val Set:  25195
Longitud de Test Set:  25195


# API's de Sklearn
Funcionamiento de las APIs de Sklearn:
- ESTIMADORES .:. fit()
1. Cualquier objeto que haga una estimación algun parámetro.
2. Usa un DS como _argumento_.

--------------------------

- TRANSFORMERS .:. transform()
1. Estimadores que transforman el DS.
2. Recibe un DS como _parametro_ de entrada.

-------------------------------


- PREDICTORS .:. predict()
1. Estimadores que realizan predicciones.
2. Reciben un DS como _entrada_.
3. Retornan un DS con predicciones.
4. Tinen un metodo score() para evaluar resultados de la predicción.

# Construyendo Transformadores Personalizados
- Estos transformadores permiten tener el codigo limpio y estructurado.

- Facilitan la reutilzación de codigo en para otros proyectos.

In [8]:
X_train = train_set.drop('class', axis=1)
y_train = train_set['class'].copy()

In [9]:
# Para ilustrar esta sección vamos a añadir algunos valores nulos 
# a algunas características del conjunto de datos
X_train.loc[(X_train["src_bytes"]>400) & (X_train["src_bytes"]<800), "src_bytes"] = np.nan
X_train.loc[(X_train["dst_bytes"]>500) & (X_train["dst_bytes"]<2000), "dst_bytes"] = np.nan

In [10]:
X_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,304.0,,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,210.0,,0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,284.0,444.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


## Transformador para atributos numéricos

In [11]:
# Desde el modulo base de la libreria sklearn
# Importamos las clases BaseEstimador y TransformerMixin
# * BaseEstimator .:. Es la clase base para crear Estimadores
# * TransformerMix .:. Es la clase base para crear Transformadores
from sklearn.base import BaseEstimator, TransformerMixin

### Tranformador creado para eliminar las filas con valores nulos

In [12]:
# Creamos la clase del tranformador
# Esta clase es Hija de BaseEstimator y TranformerMixin
class DeleteNanRows(BaseEstimator, TransformerMixin):
    # Creamos la función inicializadora
    def __init__(self):
        pass
    # Creamos la fución estimadora
    # Tienen 2 parametros "X" y "y"
    # "y" por defecto esta en None
    def fit(self, X, y=None):
        return self
    # Creamos la función transformadora
    def transform(self, X, y=None):
        return X.dropna()

In [13]:
# Instanciamos la clase creada para usarla
delete_nan = DeleteNanRows()
# Creamos un objeto para almacenar la tranformacion del DS 
# Para usar un transformador declaramos fit seguido de transform 
X_train_prep = delete_nan.fit_transform(X_train)

In [14]:
X_train_prep

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.0,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.0,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.0,0.0
98007,0.0,udp,domain_u,SF,46.0,139.0,0,0.0,0.0,0.0,...,255.0,254.0,1.00,0.01,0.00,0.00,0.00,0.0,0.0,0.0
16447,0.0,tcp,smtp,SF,1790.0,363.0,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90665,0.0,tcp,ftp_data,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,63.0,0.25,0.02,0.02,0.00,1.00,1.0,0.0,0.0
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.0,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.0,0.0,0.0
112657,0.0,tcp,http,SF,284.0,444.0,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0


### Transformador diseñado para escalar únicamente unas columnas seleccionadas
Un transformador que contiene otro transformador

In [15]:
# Creamos la clase del transformador
class CustomScaler(BaseEstimator, TransformerMixin):
    # Creamos la funcion inicializadora a la que le vamos a pasamos el parametro que necesita para funcionar
    # en este caso le pasamos los atributos que tranformaremos en forma de lista [src y dst].
    def __init__(self, attributes):
        self.attributes = attributes
    def fit(self, X, y=None):
        return self # No hay nada que hacer
    def transform(self, X, y=None):
        # Se crea el objeto con la copia de los datos del data set a transformar
        X_copy = X.copy()
        # Creamos un objeto con los datos de la lista attributes de las columnas que vamos a escalar
        scale_attrs = X_copy[self.attributes]
        # Instanciamos RobustScaler()
        robust_scaler = RobustScaler()
        # Se crea un objeto con la tranformacion de los datos de la lista attributes
        X_scaled = robust_scaler.fit_transform(scale_attrs)
        # Creamos DataFrame con: Ejemplares, Nombre de Atributos, Indices
        X_scaled = pd.DataFrame(X_scaled, columns=self.attributes, index=X_copy.index)
        # Se hace el cambio del valor normal al escalado
        # attr sirve para identificar uno contra el otro.
        for attr in self.attributes:
            X_copy[attr] = X_scaled[attr]
        return X_copy            

In [16]:
# Instanciamos la clase con los datos que pondremos en el parametro attributes
custom_scaler = CustomScaler(["src_bytes", "dst_bytes"])
# Creamos el objeto donde alamacenaremos la tranformación de los datos
# Aplicamos el tranformador y la función fit_tranform() a los datos (en este caso al DF sin NA)
#                                          X_train_prep = seria el parametro X en el transformador
X_train_prep = custom_scaler.fit_transform(X_train_prep)

In [17]:
X_train_prep[["src_bytes", "dst_bytes"]]

Unnamed: 0,src_bytes,dst_bytes
31899,-0.034632,0.000000
89913,-0.034632,0.000000
106319,0.000000,0.000000
98007,0.164502,0.448387
16447,7.714286,1.170968
...,...,...
90665,-0.034632,0.000000
64559,-0.034632,0.000000
32452,3.813853,1.058065
112657,1.194805,1.432258


In [18]:
X_train.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.0,0.0,0.11,0.03,0.0,0.0,0.0,0.0
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
108116,0.0,tcp,http,SF,304.0,,0,0.0,0.0,0.0,...,39.0,255.0,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,udp,domain_u,SF,46.0,139.0,0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,tcp,smtp,SF,1790.0,363.0,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
64957,1.0,tcp,smtp,SF,,329.0,0,0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,tcp,http,SF,206.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,tcp,ftp_data,SF,334.0,0.0,0,0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0


In [19]:
X_train_prep.head(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,tcp,private,S0,-0.034632,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
89913,0.0,tcp,private,S0,-0.034632,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,icmp,eco_i,SF,0.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,udp,domain_u,SF,0.164502,0.448387,0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,tcp,smtp,SF,7.714286,1.170968,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
28800,0.0,tcp,ftp_data,SF,1.411255,0.0,0,0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0
78082,0.0,tcp,ftp_data,SF,44.939394,0.0,0,0.0,0.0,0.0,...,93.0,51.0,0.23,0.03,0.23,0.04,0.0,0.0,0.0,0.0
69315,0.0,tcp,systat,S0,-0.034632,0.0,0,0.0,0.0,0.0,...,255.0,5.0,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0
100360,0.0,tcp,private,S0,-0.034632,0.0,0,0.0,0.0,0.0,...,255.0,13.0,0.05,0.07,0.0,0.0,1.0,1.0,0.0,0.0
29208,0.0,tcp,http,SF,1.419913,13.816129,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Transformador para codificar únicamente las columnas categoricas y devolver un Data Frame

In [57]:
# Tranformador para calificar únicamente las columnas categoricas y devuelve un DataFrame
# Agregamos las clases padres de la clase que crearemos
class CustomOneHotEncoding1(BaseEstimator, TransformerMixin):
    def __int__(self):
        # Intanciamos la clase OneHotEncoder para usar sus funciones
        # sparce = True; Devolverá una matriz dispersa(sparce)
        # sparce = False; devolverá una matriz.
        self._oh = OneHotENCODER(sparce=False)
        self._columns = None
    def fit(self, X, y=None):
        # Creamos un objeto donde almacenaremos el DF con los datos del tipo object
        X_cat = X.select_dtypes(include=['object'])
        # Le asignamos los columnas de la codificamos los datos con "get dummies"
        # CREA UN DATAFRAME CON LAS VARIABLES CATEGORICAS CODIFICADAS
        self._columns = pd.get_dummies(X_cat).columns
        # Aplicamos la funcion fit dentro de la clase OneHotEncoder
        # Lo aplicamos a todo nuestros datos en nuesto DS
        # NO confundir con la función fit de tranformador que creamos
        self._oh.fit(X_cat)
        return self
    def transform(self, X, y=None):
        X_copy = X.copy()
        # Separamos datos con categoricos
        X_cat = X_copy.select_dtypes(include=['object'])
        # Separamos datos no categoricos
        X_num = X_copy.select_dtypes(exclude=['object'])
        # Aplicamos metodos transform desde One Hot Encoder
        X_cat_oh = self._oh.transform(X_cat)
        # Creamos DF con categoricos codificados, columnas, indice
        X_cat_oh = pd.DataFrame(X_cat_oh, colums=self._columns, index=X_copy.index)
        # Eliminamos de X_copy los datos en X_cat
        x_copy.drop(list(X_cat), axis=1, inplace =True)
        # Retornamos la union de X_cat_oh a X_copy
        return X_copy.join(X_cat_oh)

In [58]:
class CustomOneHotEncoding(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._oh = OneHotEncoder(sparse=False)
        self._columns = None
    def fit(self, X, y=None):
        X_cat = X.select_dtypes(include=['object'])
        self._columns = pd.get_dummies(X_cat).columns
        self._oh.fit(X_cat)
        return self
    def transform(self, X, y=None):
        X_copy = X.copy()
        X_cat = X_copy.select_dtypes(include=['object'])
        X_num = X_copy.select_dtypes(exclude=['object'])
        X_cat_oh = self._oh.transform(X_cat)
        X_cat_oh = pd.DataFrame(X_cat_oh, 
                                columns=self._columns, 
                                index=X_copy.index)
        X_copy.drop(list(X_cat), axis=1, inplace=True)
        return X_copy.join(X_cat_oh)

In [54]:
custom_oh = CustomOneHotEncoding()
X_train_prep = custom_oh.fit_transform(X_train_prep)

ValueError: No objects to concatenate

# Construyendo Pipelines Personalizados
Los pipelines nos permiten agrupar en un flujo de ejecución todas las operaciones de transformación que necesitemos realizar sobre un DS. 

Esto facilita las transformaciones de diferentes conjuntos de datos.

------------------------

Debemos tener en cuenta en las estructuras es lo siguiente:
- Recibe un conjunto de pares (nombre, estimador)
- TODOS menos El Ultimo DEBEN ser transformadoers.
* El pipeline expone los mismos métodos que **el último estimador**:
    - predictor: fit() y predict()
    - transformador: fit() y transform()
- Cuando se llama al metodo fit() **del pipeline**, se llama secuencialmente al metodo fit_transform() **de los estimadores** y se les pasa de manera secuencial el output del anterior como input del siguiente. El ultimo invoca el método fit()

## Contrucción de pipelines para atributos númericos

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

# Instanciar Clase Pipeline
num_pipeline = Pipeline([
    #  Nombre,  Estimador
    ('imputer', SimpleImputer(strategy='median')),
    ('rbst_scaler', RobustScaler())
])

In [22]:
# La clase imputer no admite valores categoricos, eliminamos losa tributes categoricos
X_train_num = X_train.select_dtypes(exclude=['object'])

# USO DEL PIPELINE
#                          USO DE LOS METODOS
X_train_prep = num_pipeline.fit_transform(X_train_num)
# Creamos un data frame
X_train_prep = pd.DataFrame(X_train_prep, columns=X_train_num.columns, index=X_train_num.index)

In [23]:
X_train_num

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,,53508.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,304.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,210.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,889.0,328.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,284.0,444.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


In [24]:
X_train_prep

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,0.000000,235.718062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.421965,0.787755,0.515789,-0.428571,1.833333,1.5,0.00,0.00,0.0,0.0
31899,0.0,-0.174089,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,-0.236735,-0.515789,0.285714,0.000000,0.0,1.00,1.00,0.0,0.0
108116,0.0,1.056680,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.248555,0.787755,0.515789,-0.428571,0.500000,3.0,0.00,0.00,0.0,0.0
89913,0.0,-0.174089,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,-0.191837,-0.473684,0.571429,0.000000,0.0,1.00,1.00,0.0,0.0
106319,0.0,-0.141700,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.462428,-0.224490,0.515789,-0.428571,16.666667,28.5,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,-0.174089,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,-0.171429,-0.452632,0.428571,0.000000,0.0,1.00,1.00,0.0,0.0
67272,0.0,0.676113,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.786127,0.787755,0.515789,-0.428571,0.166667,1.0,0.02,0.01,0.0,0.0
32452,3.0,3.425101,1.444934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.832370,0.379592,0.136842,0.142857,0.166667,0.5,0.01,0.00,0.0,0.0
112657,0.0,0.975709,1.955947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.787755,0.515789,-0.428571,0.000000,0.0,0.00,0.00,0.0,0.0


## Metodo Column Transform
Ejecuta todos los pipelines en paralelo y concatena el resultado.
- Para ello el resultado debe estar en valor numérico.

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Creamos objetos con y sin categorias
num_attribs = list(X_train.select_dtypes(exclude=['object']))
cat_attribs = list(X_train.select_dtypes(include=['object']))

# Instanciamos el Ejecutador de los pipelines
full_pipeline = ColumnTransformer([
    # nombre,    Clase,    datos 
    ("num", num_pipeline, num_attribs),
    # nombre,    Clase,      datos
    ("cat", OneHotEncoder(), cat_attribs)
])

In [26]:
# Uso del Pipeline
X_train_prep = full_pipeline.fit_transform(X_train)

In [27]:
# Crear DataFrame
X_train_prep = pd.DataFrame(X_train_prep, columns=list(pd.get_dummies(X_train)), index=X_train.index)

In [28]:
X_train_prep.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_guest_login_0,is_guest_login_1
113467,0.0,0.0,235.718062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
31899,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
108116,0.0,1.05668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
89913,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
106319,0.0,-0.1417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
98007,0.0,0.012146,0.612335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
16447,0.0,7.072874,1.599119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
64957,1.0,0.0,1.449339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
100052,0.0,0.659919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
28800,0.0,1.178138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


In [29]:
X_train.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.0,0.0,0.11,0.03,0.0,0.0,0.0,0.0
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
108116,0.0,tcp,http,SF,304.0,,0,0.0,0.0,0.0,...,39.0,255.0,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,udp,domain_u,SF,46.0,139.0,0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,tcp,smtp,SF,1790.0,363.0,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
64957,1.0,tcp,smtp,SF,,329.0,0,0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,tcp,http,SF,206.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,tcp,ftp_data,SF,334.0,0.0,0,0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0
