# A. Librerías de trabajo

In [0]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder, Normalizer

import warnings
warnings.filterwarnings('ignore')



# 01. Carga del dataset raw

In [0]:
# Name of the CSV to load and the GitHub folder URL it lives under.
# Both values are passed to import_raw_csv_github, which concatenates them as-is,
# so url_input must keep its trailing '/'.
file_name = 'train.csv'
url_input = 'https://github.com/alexandergribenchenko/DS_Datasets/blob/main/Kaggle_Titanic/'

In [0]:
def import_raw_csv_github(file_name,url_input):
    """Load a CSV file hosted on GitHub into a DataFrame without type inference.

    Parameters
    ----------
    file_name : str
        Name of the CSV file; appended directly to ``url_input``.
    url_input : str
        URL of the folder containing the file. No separator is inserted
        between it and ``file_name``, so it must already end with one.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with every column read as ``dtype=object``.
    """
    # The '?raw=true' query string is presumably what makes a GitHub "blob" URL
    # serve the file contents instead of the HTML page — confirm if the host changes.
    raw_url = ''.join((url_input, file_name, '?raw=true'))
    return pd.read_csv(raw_url, dtype=object)

In [0]:
# Only the first 10 rows are kept (presumably so every intermediate frame can be shown whole).
# All columns arrive as `object` dtype because the loader reads the CSV with dtype=object.
df_raw = import_raw_csv_github(file_name,url_input)[:10]
df_raw

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [0]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  10 non-null     object
 1   Survived     10 non-null     object
 2   Pclass       10 non-null     object
 3   Name         10 non-null     object
 4   Sex          10 non-null     object
 5   Age          9 non-null      object
 6   SibSp        10 non-null     object
 7   Parch        10 non-null     object
 8   Ticket       10 non-null     object
 9   Fare         10 non-null     object
 10  Cabin        3 non-null      object
 11  Embarked     10 non-null     object
dtypes: object(12)
memory usage: 1.1+ KB


# 02. Transformadores

## 02.01. FeatureSelector

In [0]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns named at construction time.

    Parameters
    ----------
    feature_names : list
        Column labels to extract from the input frame. The output columns
        follow the order of this list.
    """

    def __init__(self, feature_names):
        # Store the argument under its own name: scikit-learn's get_params(),
        # set_params(), clone() and the estimator repr all read hyper-parameters
        # with getattr(self, "<__init__ argument name>"). Keeping it under a
        # different attribute name breaks those utilities.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to ``feature_names``.

        Raises
        ------
        KeyError
            If any requested label is not a column of ``X`` (raised by pandas).
        """
        return X[self.feature_names]

In [0]:
columns_selected = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare','Survived']

In [0]:
Transformer_FeatureSelector = FeatureSelector(feature_names=columns_selected)

In [0]:
df_transformed_01 = Transformer_FeatureSelector.transform(df_raw)
df_transformed_01

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,22.0,7.25,0
1,2,1,female,38.0,71.2833,1
2,3,3,female,26.0,7.925,1
3,4,1,female,35.0,53.1,1
4,5,3,male,35.0,8.05,0
5,6,3,male,,8.4583,0
6,7,1,male,54.0,51.8625,0
7,8,3,male,2.0,21.075,0
8,9,3,female,27.0,11.1333,1
9,10,2,female,14.0,30.0708,1


In [0]:
df_raw

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## 02.02. TypeAssignatorFloat

In [0]:
class TypeAssignatorFloat(BaseEstimator, TransformerMixin):
    """Transformer that casts the selected columns of a DataFrame to float.

    Parameters
    ----------
    add_log_scaler : bool, default=True
        Not used by this transformer; kept only so existing constructor
        calls keep working.
    feature_names : list or None, default=None
        Column labels to cast. When None the input is returned untouched.
    """

    def __init__(self, add_log_scaler = True, feature_names=None):
        self.add_log_scaler = add_log_scaler
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column in ``feature_names`` cast to float.

        When ``feature_names`` is None, ``X`` itself (not a copy) is returned.
        The input frame is never modified.
        """
        if self.feature_names is None:
            return X

        salida = X.copy()
        for column in self.feature_names:
            # Cast the values of the frame being transformed. The previous
            # version read them from the notebook global `df_transformed_01`,
            # which ignored X and failed on a fresh kernel or inside a Pipeline
            # fed with different data.
            salida[column] = X[column].astype(float)
        return salida

In [0]:
df_transformed_01

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,22.0,7.25,0
1,2,1,female,38.0,71.2833,1
2,3,3,female,26.0,7.925,1
3,4,1,female,35.0,53.1,1
4,5,3,male,35.0,8.05,0
5,6,3,male,,8.4583,0
6,7,1,male,54.0,51.8625,0
7,8,3,male,2.0,21.075,0
8,9,3,female,27.0,11.1333,1
9,10,2,female,14.0,30.0708,1


In [0]:
df_transformed_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  10 non-null     object
 1   Pclass       10 non-null     object
 2   Sex          10 non-null     object
 3   Age          9 non-null      object
 4   Fare         10 non-null     object
 5   Survived     10 non-null     object
dtypes: object(6)
memory usage: 608.0+ bytes


In [0]:
Transformer_TypeAssignatorFloat = TypeAssignatorFloat(feature_names=['Age','Fare'])

In [0]:
df_transformed_02 = Transformer_TypeAssignatorFloat.transform(df_transformed_01)
df_transformed_02

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,22.0,7.25,0
1,2,1,female,38.0,71.2833,1
2,3,3,female,26.0,7.925,1
3,4,1,female,35.0,53.1,1
4,5,3,male,35.0,8.05,0
5,6,3,male,,8.4583,0
6,7,1,male,54.0,51.8625,0
7,8,3,male,2.0,21.075,0
8,9,3,female,27.0,11.1333,1
9,10,2,female,14.0,30.0708,1


In [0]:
df_transformed_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  10 non-null     object 
 1   Pclass       10 non-null     object 
 2   Sex          10 non-null     object 
 3   Age          9 non-null      float64
 4   Fare         10 non-null     float64
 5   Survived     10 non-null     object 
dtypes: float64(2), object(4)
memory usage: 608.0+ bytes


In [0]:
df_transformed_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  10 non-null     object
 1   Pclass       10 non-null     object
 2   Sex          10 non-null     object
 3   Age          9 non-null      object
 4   Fare         10 non-null     object
 5   Survived     10 non-null     object
dtypes: object(6)
memory usage: 608.0+ bytes


## 02.03. LogScaler

In [0]:
class LogScaler(BaseEstimator, TransformerMixin):
    """Transformer that replaces the selected columns by their natural logarithm.

    Parameters
    ----------
    add_log_scaler : bool, default=True
        Switch for the transformation. When False the input is returned
        untouched, which lets the step be toggled as a hyper-parameter.
    feature_names : list or None, default=None
        Column labels to transform. When None the input is returned untouched.
    """

    def __init__(self, add_log_scaler = True, feature_names=None):
        self.add_log_scaler = add_log_scaler
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``np.log`` applied to each selected column.

        ``X`` itself (not a copy) is returned when the step is disabled or no
        columns were requested. The input frame is never modified.
        """
        # Honour the on/off flag; it used to be stored but never consulted.
        if not self.add_log_scaler or self.feature_names is None:
            return X

        salida = X.copy()
        for column in self.feature_names:
            # np.log yields -inf for 0 and NaN for negative values; the columns
            # are assumed to be strictly positive numerics — TODO confirm upstream.
            salida[column] = np.log(salida[column])
        return salida

In [0]:
df_transformed_02

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,22.0,7.25,0
1,2,1,female,38.0,71.2833,1
2,3,3,female,26.0,7.925,1
3,4,1,female,35.0,53.1,1
4,5,3,male,35.0,8.05,0
5,6,3,male,,8.4583,0
6,7,1,male,54.0,51.8625,0
7,8,3,male,2.0,21.075,0
8,9,3,female,27.0,11.1333,1
9,10,2,female,14.0,30.0708,1


In [0]:
df_transformed_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  10 non-null     object 
 1   Pclass       10 non-null     object 
 2   Sex          10 non-null     object 
 3   Age          9 non-null      float64
 4   Fare         10 non-null     float64
 5   Survived     10 non-null     object 
dtypes: float64(2), object(4)
memory usage: 608.0+ bytes


In [0]:
Transformer_LogScaler = LogScaler(feature_names=['Age','Fare'])

In [0]:
df_transformed_03 = Transformer_LogScaler.transform(df_transformed_02)
df_transformed_03

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,3.091042,1.981001,0
1,2,1,female,3.637586,4.266662,1
2,3,3,female,3.258097,2.070022,1
3,4,1,female,3.555348,3.972177,1
4,5,3,male,3.555348,2.085672,0
5,6,3,male,,2.135148,0
6,7,1,male,3.988984,3.948596,0
7,8,3,male,0.693147,3.048088,0
8,9,3,female,3.295837,2.409941,1
9,10,2,female,2.639057,3.403555,1


# 03. Componer

## 03.01. Pipeline

In [0]:
# Chains the three custom transformers in order:
# column selection -> cast 'Age'/'Fare' to float -> natural log of 'Age'/'Fare'.
my_pipeline = Pipeline(steps=[
    # NOTE(review): this column list repeats the literal assigned to `columns_selected` earlier in the notebook.
    ('NameFeatureSelector', FeatureSelector(feature_names = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare','Survived'])),
    ('NameTypeAssignatorFloat',  TypeAssignatorFloat(feature_names=['Age','Fare'])),
    ('NameLogScaler',  LogScaler(feature_names=['Age','Fare']))
])

In [0]:
df_raw

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [0]:
# NOTE(review): transform() is called here before any fit() on the pipeline. It works because
# every step's fit() just returns self, but scikit-learn may warn or raise for an unfitted
# Pipeline depending on the installed version — confirm, or use fit_transform() instead.
my_pipeline.transform(df_raw)

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,3.091042,1.981001,0
1,2,1,female,3.637586,4.266662,1
2,3,3,female,3.258097,2.070022,1
3,4,1,female,3.555348,3.972177,1
4,5,3,male,3.555348,2.085672,0
5,6,3,male,,2.135148,0
6,7,1,male,3.988984,3.948596,0
7,8,3,male,0.693147,3.048088,0
8,9,3,female,3.295837,2.409941,1
9,10,2,female,2.639057,3.403555,1


In [0]:
# fit() learns nothing in any of the three steps, so this produces the same frame as the
# plain transform() call above.
# NOTE(review): `perro` does not describe its content (the pipeline output); renaming it
# requires updating the later cells that read it.
perro = my_pipeline.fit_transform(df_raw)
perro

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,3.091042,1.981001,0
1,2,1,female,3.637586,4.266662,1
2,3,3,female,3.258097,2.070022,1
3,4,1,female,3.555348,3.972177,1
4,5,3,male,3.555348,2.085672,0
5,6,3,male,,2.135148,0
6,7,1,male,3.988984,3.948596,0
7,8,3,male,0.693147,3.048088,0
8,9,3,female,3.295837,2.409941,1
9,10,2,female,2.639057,3.403555,1


## 03.02. ColumnTransformer

In [0]:
# Columns are addressed by position. When applied to `perro`, index 4 is 'Fare' and
# indices 1, 2 are 'Pclass' and 'Sex'. Columns not listed here are absent from the result.
ct = ColumnTransformer(
    # NOTE(review): Normalizer rescales each row by its own norm, so with a single column every
    # value becomes value/|value| — the first output column below is a constant 1.0.
    # If the goal was to scale 'Fare' across rows, a column-wise scaler is needed — confirm intent.
    [("norm1", Normalizer(norm='l1'), [4]),
     # NOTE(review): the `sparse` argument was renamed `sparse_output` in newer scikit-learn
     # releases — verify against the installed version before upgrading.
     ("norm2", OneHotEncoder(sparse=False,), [1,2])])

In [0]:
ct.fit_transform(perro)

Out[28]: array([[1., 0., 0., 1., 0., 1.],
       [1., 1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1., 0.],
       [1., 1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 1., 0.],
       [1., 0., 1., 0., 1., 0.]])

In [0]:
perro

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Survived
0,1,3,male,3.091042,1.981001,0
1,2,1,female,3.637586,4.266662,1
2,3,3,female,3.258097,2.070022,1
3,4,1,female,3.555348,3.972177,1
4,5,3,male,3.555348,2.085672,0
5,6,3,male,,2.135148,0
6,7,1,male,3.988984,3.948596,0
7,8,3,male,0.693147,3.048088,0
8,9,3,female,3.295837,2.409941,1
9,10,2,female,2.639057,3.403555,1
