## Data transformation

Data transformation is one of the crucial aspects of many machine learning algorithms.

Here we will explore different preprocessing algorithms.

The result will be available in the `titanic.data.preprocessing` module.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib widget

import titanic.data.load
import titanic.data.wrangling as wrng

# Load the raw train and test frames from CSV via the project loader.
train_df_orig, test_df_orig = titanic.data.load.from_csv()
# Print column names, non-null counts and dtypes of the raw training frame.
train_df_orig.info()

# Run the project's wrangling step on the raw training frame; the result is
# displayed in the next cell.
train_df = wrng.wrangling(train_df_orig)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Display the training frame returned by wrng.wrangling().
train_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,CabLet,Alone,Familiars
0,3,male,22.0,1,0,7.2500,S,Mr,T,0,1
1,1,female,38.0,1,0,71.2833,C,Mrs,C,0,1
2,3,female,26.0,0,0,7.9250,S,Miss,T,1,0
3,1,female,35.0,1,0,53.1000,S,Mrs,C,0,1
4,3,male,35.0,0,0,8.0500,S,Mr,T,1,0
...,...,...,...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S,Rev,T,1,0
887,1,female,19.0,0,0,30.0000,S,Miss,B,1,0
888,3,female,22.0,1,2,23.4500,S,Miss,T,0,3
889,1,male,26.0,0,0,30.0000,C,Mr,C,1,0


In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Build the preprocessing transformer:
#   * 'Categ'  - one-hot encode the Sex, Embarked and CabLet columns;
#   * 'Title'  - bag-of-words encode the Title column (selected by a plain
#                string, not a list, so the vectorizer receives 1-D input);
#   * every remaining column is standardised with StandardScaler.
column_trans = ColumnTransformer(
    transformers=[
        ('Categ', OneHotEncoder(), ['Sex', 'Embarked', 'CabLet']),
        ('Title', CountVectorizer(), 'Title'),
    ],
    remainder=StandardScaler(),
)
# Fit on the wrangled training frame; the fitted transformer is the cell output.
column_trans.fit(train_df)

ColumnTransformer(remainder=StandardScaler(),
                  transformers=[('Categ', OneHotEncoder(),
                                 ['Sex', 'Embarked', 'CabLet']),
                                ('Title', CountVectorizer(), 'Title')])

In [34]:
# Show the transformed training matrix, labelled with the feature names
# generated by the fitted transformer.
pd.DataFrame(
    data=column_trans.transform(train_df),
    columns=column_trans.get_feature_names_out(),
)

Unnamed: 0,Categ__Sex_female,Categ__Sex_male,Categ__Embarked_C,Categ__Embarked_Q,Categ__Embarked_S,Categ__CabLet_A,Categ__CabLet_B,Categ__CabLet_C,Categ__CabLet_D,Categ__CabLet_E,...,Title__mrs,Title__rev,Title__sir,remainder__Pclass,remainder__Age,remainder__SibSp,remainder__Parch,remainder__Fare,remainder__Alone,remainder__Familiars
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.827377,-0.582415,0.432793,-0.476267,-0.516026,-1.225891,0.057788
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,-1.566107,0.624899,0.432793,-0.476267,0.773351,-1.225891,0.057788
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.827377,-0.280586,-0.474545,-0.476267,-0.502434,0.815733,-0.562564
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,-1.566107,0.398528,0.432793,-0.476267,0.407211,-1.225891,0.057788
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.827377,0.398528,-0.474545,-0.476267,-0.499917,0.815733,-0.562564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-0.369365,-0.205129,-0.474545,-0.476267,-0.400244,0.815733,-0.562564
887,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.566107,-0.808786,-0.474545,-0.476267,-0.057931,0.815733,-0.562564
888,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.827377,-0.582415,0.432793,2.005334,-0.189822,-1.225891,1.298492
889,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,-1.566107,-0.280586,-0.474545,-0.476267,-0.057931,0.815733,-0.562564


Here we repeat the operations above, but for the test sample.

In [22]:
# Prepare the raw test sample: impute Fare and Age, derive Title, CabLet, Alone
# and Familiars, integer-encode the categorical columns, drop the identifier /
# free-text columns and min-max scale every remaining column.
from sklearn.preprocessing import MinMaxScaler

# Start from a copy of the raw test frame so the cell is self-contained and can
# be re-run without tripping over columns it has already dropped.
test_df = test_df_orig.copy()

# Fare: a zero or missing fare is replaced by the mean fare of the passenger's class.
fare_class = test_df.groupby('Pclass').Fare.mean()
fare_missing = test_df.Fare.isna() | test_df.Fare.eq(0)
test_df['Fare'] = test_df.Fare.mask(fare_missing, test_df.Pclass.map(fare_class))

# Title: normalise the 'Mlle' / 'Mme' spellings, then take the first word after
# the comma in the name and strip its last character (the trailing dot).
test_df['Name'] = (
    test_df.Name
    .str.replace('Mlle', 'Miss', regex=False)
    .str.replace('Mme', 'Mrs', regex=False)
)
test_df['Title'] = test_df.Name.apply(
    lambda n: str(n)[str(n).find(',') + 1:].strip().split(' ')[0][:-1]
)
# 'th' is what the extraction above yields when the word after the comma is
# 'the' (e.g. '..., the Countess. ...'); 'Ms' is folded into 'Miss'.
test_df['Title'] = test_df.Title.replace({'th': 'Countess', 'Ms': 'Miss'})

# Age: a missing age is replaced by the rounded mean age of the passenger's title.
title_age = test_df.groupby('Title').Age.mean().round()
test_df['Age'] = test_df.Age.fillna(test_df.Title.map(title_age))

# CabLet: first character of the cabin string; a missing cabin becomes 'nan'
# after astype(str), so its first character 'n' is rewritten to 'X'.
test_df['CabLet'] = test_df.Cabin.astype(str).str[0].replace('n', 'X')
# NOTE(review): assingCabinBasedOnFare is not defined or imported anywhere in
# this notebook - presumably it lives in the project's wrangling code; confirm
# and import it so the cell survives Restart & Run All.
test_df['CabLet'] = test_df[['CabLet', 'Fare']].apply(assingCabinBasedOnFare, axis=1)

# Family features: Alone is 1 when the passenger has no siblings/spouse/parents/
# children aboard, Familiars is the number of such relatives.
family_size = test_df.SibSp + test_df.Parch
test_df['Alone'] = family_size.eq(0).astype(int)
test_df['Familiars'] = family_size

# Integer-encode the categorical columns.
sex_codes = {"female": 1, "male": 0}
test_df['Sex'] = test_df['Sex'].map(sex_codes)

embarked_codes = {"S": 1, "C": 2, "Q": 3}
test_df['Embarked'] = test_df['Embarked'].map(embarked_codes)

# NOTE(review): these codes are derived from the categories present in the test
# frame only, so they need not match an encoding built from the training frame.
test_df['CabLet'] = test_df.CabLet.astype("category").cat.codes
test_df['Title'] = test_df.Title.astype("category").cat.codes

# Drop the identifier / free-text columns so only numeric features are scaled.
test_df = test_df.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'])

# NOTE(review): the scaler is fitted on the test frame itself, i.e. min/max come
# from the test sample rather than from the training data - confirm this is intended.
scaler = MinMaxScaler()
scaled_test = pd.DataFrame(
    scaler.fit_transform(test_df),
    columns=test_df.columns,
    index=test_df.index,
)

scaled_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,CabLet,Alone,Familiars
0,1.0,0.0,0.452723,0.0,0.0,0.009149,1.0,0.714286,1.0,1.0,0.0
1,1.0,1.0,0.617566,0.125,0.0,0.007521,0.0,0.857143,1.0,0.0,0.1
2,0.5,0.0,0.815377,0.0,0.0,0.012799,1.0,0.714286,1.0,1.0,0.0
3,1.0,0.0,0.353818,0.0,0.0,0.010786,0.0,0.714286,1.0,1.0,0.0
4,1.0,1.0,0.287881,0.125,0.111111,0.017905,0.0,0.857143,1.0,0.0,0.2
