## Phase 3 (Data Preparation): v10

* Autorin: Anna (i3-Versicherung)
* Webseite: [Data Science Training - Kapitel 21](https://data-science.training/kapitel-21/)
* Datum: 23.03.2023

Ausgangspunkt sind die Datenaufbereitungen der Versionen 7 und 8.

In [4]:
# Pandas Paket (Package) importieren
#  Datenstrukturen und Datenanalyse, I/O
#  https://pandas.pydata.org/pandas-docs/stable/
import pandas as pd
# NumPy Paket (Package) importieren
#  Mehrdimensionale Datenstrukturen (Vektoren, Matrizen, Tensoren, Arrays), Lineare Algebra
#  https://numpy.org/doc/
import numpy as np

In [5]:
# Load the training, test and application data from CSV files into
# pandas DataFrames (KNIME: "CSV Reader").
df_train, df_test, df_app = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/titanic/original/train.csv',
        '../../data/titanic/original/test.csv',
        '../../data/titanic/original/application.csv',
    )
)

In [6]:
# Load the previously computed model results from an Excel file into a
# pandas DataFrame (KNIME: "Excel Reader"). They are merged into df on
# PassengerId further down to supply the Child feature.
df_res = pd.read_excel(io='../../data/titanic/age/results_v3.xlsx')

In [7]:
# Stack training and test data into one frame with a fresh 0..n-1 index
# so that feature engineering is applied to both at once
#  (KNIME: "Concatenate")
df = pd.concat(objs=(df_train, df_test), axis=0, ignore_index=True)

In [8]:
# Let pandas infer the best-fitting (nullable) dtypes for both frames
df, df_app = df.convert_dtypes(), df_app.convert_dtypes()

In [9]:
# Count the missing values per column
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [10]:
# New feature: Child - part 1
#  (KNIME: "Rule Engine")
# Child is 1 when Age is below 12 and 0 otherwise; rows with a missing Age
# are set to 0 at this stage.
for frame in (df, df_app):
    is_child = frame['Age'].lt(12)
    frame['Child'] = is_child.fillna(False).astype('int')

In [11]:
# New feature: Child - part 2
# Attach the predicted results of the binary classification via a left join
# on PassengerId. Both frames carry a 'Child' column, so the merge yields
# 'Child_x' (given values) and 'Child_y' (predicted values).
df = pd.merge(df, df_res, how='left', on='PassengerId')

In [12]:
# New feature: Child - part 3
# Combine the given and the predicted values: use the prediction where one
# exists, otherwise fall back to the value derived from Age.
predicted_child = df['Child_y']
given_child = df['Child_x']
df['Child'] = predicted_child.fillna(given_child).astype('int')
# The two helper columns produced by the merge are no longer needed
df = df.drop(columns=['Child_x', 'Child_y'])
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S,0
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,0
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,0
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,0


In [13]:
# Handle (i.e. estimate) missing values - part 2
#  (KNIME: "Missing Values")
# Fare (cardinal scale): fill every missing value with the constant 7.896.
# The affected rows are selected with a boolean mask instead of a hard-coded
# row label / PassengerId, so the cell keeps working if the row order or the
# index of df changes.
# NOTE(review): df_app is not imputed here - confirm it has no missing Fare.
fare_missing = df['Fare'].isna()
# Rows before the imputation
display(df[fare_missing])
df.loc[fare_missing, 'Fare'] = 7.896
# The same rows after the imputation, to verify the result
display(df[fare_missing])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,0


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Child
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,7.896,,S,0


In [14]:
# New feature Title
#  (KNIME: "Cell Splitter", "Column Rename", "Table Creator", "Cell Replacer")
def _extract_title(names):
    """Return the normalised title for each entry of a Name column.

    The title is the text between the first ', ' and the following '.'.
    Spelling variants are mapped to 'Miss' or 'Mrs', and the listed
    uncommon titles are collapsed into 'Rare'; every other title is kept.
    """
    after_surname = names.str.split(', ', expand=True)[1]
    titles = after_surname.str.split('.', expand=True)[0]
    # Applied in this order, one replacement group after the other
    for canonical, variants in (
        ('Miss', ['Ms', 'Mlle']),
        ('Mrs', ['Mme', 'Lady', 'the Countess', 'Dona']),
        ('Rare', ['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev']),
    ):
        titles = titles.replace(variants, canonical)
    return titles


df['Title'] = _extract_title(df['Name'])
df_app['Title'] = _extract_title(df_app['Name'])

In [15]:
# New feature "FamilySize": siblings/spouses + parents/children + the
# passenger themselves
#  (KNIME: "Math Formula")
for frame in (df, df_app):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [16]:
# Auxiliary quantity TicketCount: number of passengers sharing a ticket
#  (KNIME: "GroupBy", "Joiner", "Column Rename")
# Training + test data: count within df and join the count back on Ticket
ticketCount = (
    df.groupby('Ticket', as_index=False)['PassengerId']
    .count()
    .rename(columns={'PassengerId': 'TicketCount'})
)
df = pd.merge(df, ticketCount, how='left', on='Ticket')
# Application data: counted separately, within df_app only
ticketCount = (
    df_app.groupby('Ticket', as_index=False)['PassengerId']
    .count()
    .rename(columns={'PassengerId': 'TicketCount'})
)
df_app = pd.merge(df_app, ticketCount, how='left', on='Ticket')

In [17]:
# New feature "LogFare": log(1 + Fare / TicketCount), i.e. the log of the
# fare split across all passengers on the same ticket
#  (KNIME: "Math Formula")
fare_per_ticket_holder = df['Fare'] / df['TicketCount']
df['LogFare'] = np.log(1 + fare_per_ticket_holder)

app_fare_per_ticket_holder = df_app['Fare'] / df_app['TicketCount']
df_app['LogFare'] = np.log(1 + app_fare_per_ticket_holder)

In [18]:
# Show the result of the feature engineering (columns, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   Int64  
 1   Survived     891 non-null    Int64  
 2   Pclass       1309 non-null   Int64  
 3   Name         1309 non-null   string 
 4   Sex          1309 non-null   string 
 5   Age          1046 non-null   Float64
 6   SibSp        1309 non-null   Int64  
 7   Parch        1309 non-null   Int64  
 8   Ticket       1309 non-null   string 
 9   Fare         1309 non-null   Float64
 10  Cabin        295 non-null    string 
 11  Embarked     1307 non-null   string 
 12  Child        1309 non-null   int32  
 13  Title        1309 non-null   string 
 14  FamilySize   1309 non-null   Int64  
 15  TicketCount  1309 non-null   Int64  
 16  LogFare      1309 non-null   Float64
dtypes: Float64(3), Int64(7), int32(1), string(6)
memory usage: 181.7 KB


In [19]:
# One-hot encoding => dummy variables
#  for Pclass, Title
cols = ['Pclass', 'Title']
df_encoded = pd.get_dummies(df, columns=cols, dtype=int)
# The dummy columns are exactly the columns get_dummies added to df
dummy_cols = [col for col in df_encoded.columns if col not in df.columns]
df = df_encoded
df_app = pd.get_dummies(df_app, columns=cols, dtype=int)
# df_app is encoded on its own, so a category that never occurs in it would
# leave the corresponding dummy column missing (and make the later column
# filter fail). Add such columns as all-zero so that both frames expose the
# same dummy columns. Nothing changes when df_app already has all of them.
for dummy in dummy_cols:
    if dummy not in df_app.columns:
        df_app[dummy] = pd.Series(0, index=df_app.index, dtype=int)
# Show the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   1309 non-null   Int64  
 1   Survived      891 non-null    Int64  
 2   Name          1309 non-null   string 
 3   Sex           1309 non-null   string 
 4   Age           1046 non-null   Float64
 5   SibSp         1309 non-null   Int64  
 6   Parch         1309 non-null   Int64  
 7   Ticket        1309 non-null   string 
 8   Fare          1309 non-null   Float64
 9   Cabin         295 non-null    string 
 10  Embarked      1307 non-null   string 
 11  Child         1309 non-null   int32  
 12  FamilySize    1309 non-null   Int64  
 13  TicketCount   1309 non-null   Int64  
 14  LogFare       1309 non-null   Float64
 15  Pclass_1      1309 non-null   int32  
 16  Pclass_2      1309 non-null   int32  
 17  Pclass_3      1309 non-null   int32  
 18  Title_Master  1309 non-null 

In [20]:
# Clean-up: (manually) filter out attributes
#  (KNIME "Column Filter")
# Raw attributes that are no longer needed, plus one dummy column per encoded
# attribute (Pclass_1, Title_Rare).
columns_to_remove = [
    'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Ticket',
    'Embarked', 'Pclass_1', 'Title_Rare',
]
df = df.drop(columns=columns_to_remove)
df_app = df_app.drop(columns=columns_to_remove)
# Show the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   1309 non-null   Int64  
 1   Survived      891 non-null    Int64  
 2   Child         1309 non-null   int32  
 3   FamilySize    1309 non-null   Int64  
 4   TicketCount   1309 non-null   Int64  
 5   LogFare       1309 non-null   Float64
 6   Pclass_2      1309 non-null   int32  
 7   Pclass_3      1309 non-null   int32  
 8   Title_Master  1309 non-null   int32  
 9   Title_Miss    1309 non-null   int32  
 10  Title_Mr      1309 non-null   int32  
 11  Title_Mrs     1309 non-null   int32  
dtypes: Float64(1), Int64(4), int32(7)
memory usage: 93.4 KB


In [21]:
# Split the data again
#  (KNIME: "Row Splitter")
# Rows with a Survived value are training data, rows without are test data.
has_label = df['Survived'].notna()
df_train = df.loc[has_label]
df_test = df.loc[~has_label]

In [22]:
# Filter out irrelevant attributes
#  (KNIME: "Column Filter")
# Training data: remove PassengerId
df_train = df_train.drop(columns='PassengerId')
# Test data: remove Survived
df_test = df_test.drop(columns='Survived')

In [23]:
# Preview the first five rows of the prepared training data
display(df_train.head(n=5))

Unnamed: 0,Survived,Child,FamilySize,TicketCount,LogFare,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,0,0,2,1,2.110213,0,1,0,0,1,0
1,1,0,2,2,3.601186,0,0,0,0,0,1
2,1,0,1,1,2.188856,0,1,0,1,0,0
3,1,0,2,2,3.316003,0,0,0,0,0,1
4,0,0,1,1,2.202765,0,1,0,0,1,0


In [24]:
# Preview the first five rows of the prepared test data
display(df_test.head(n=5))

Unnamed: 0,PassengerId,Child,FamilySize,TicketCount,LogFare,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs
891,892,0,1,1,2.178064,0,1,0,0,1,0
892,893,0,2,1,2.079442,0,1,0,0,0,1
893,894,0,1,1,2.369075,1,0,0,0,1,0
894,895,0,1,1,2.268252,0,1,0,0,1,0
895,896,0,3,2,1.966238,0,1,0,0,0,1


In [25]:
# Preview the first five rows of the prepared application data
display(df_app.head(n=5))

Unnamed: 0,PassengerId,Child,FamilySize,TicketCount,LogFare,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,1310,0,1,1,0.0,0,0,0,0,1,0
1,1311,0,1,1,3.411148,0,0,0,0,1,0
2,1312,0,1,1,2.917771,1,0,0,0,1,0
3,1313,0,1,1,0.0,0,0,0,0,0,0
4,1314,0,2,2,0.916291,0,0,0,0,1,0


In [26]:
# Save the prepared data as Excel files, without the index column
#  (KNIME: "Excel Writer")
for frame, excel_path in (
    # Training data
    (df_train, '../../data/titanic/new/training_v10.xlsx'),
    # Test data
    (df_test, '../../data/titanic/new/test_v10.xlsx'),
    # Application data
    (df_app, '../../data/titanic/new/application_v10.xlsx'),
):
    frame.to_excel(excel_path, index=False)