## Phase 3 (Data Preparation): v6 (string)

* Autorin: Anna (i3-Versicherung)
* Webseite: [Data Science Training - Kapitel 17](https://data-science.training/kapitel-17/)
* Datum: 23.03.2023

Ausgangspunkt ist die Datenaufbereitung v3.

Wir fügen nun die Ergebnisse der Mehrklassen-Klassifikation für das Attribut AgeBinned hinzu, um die fehlenden Werte dieses Attributs durch die prognostizierten Altersklassen zu ersetzen.

In [4]:
# Pandas Paket (Package) importieren
#  Datenstrukturen und Datenanalyse, I/O
#  https://pandas.pydata.org/pandas-docs/stable/
import pandas as pd
# NumPy Paket (Package) importieren
#  Mehrdimensionale Datenstrukturen (Vektoren, Matrizen, Tensoren, Arrays), Lineare Algebra
#  https://numpy.org/doc/
import numpy as np
# Eigene Module importieren
#  zur Berechnung der Korrelationskoeffizienten
import sys
sys.path.append('../00_DST_Module/')
import dst_correlation_functions as cf

In [5]:
# Read the training, test and application data from their CSV files,
# one pandas DataFrame per file.
#  (KNIME: "CSV Reader")
df_train, df_test, df_app = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/titanic/original/train.csv',
        '../../data/titanic/original/test.csv',
        '../../data/titanic/original/application.csv',
    )
)

In [6]:
# Load the results of the multi-class classification from an Excel file
# into a pandas DataFrame (df_res).
#  (KNIME: "Excel Reader")
df_res   = pd.read_excel('../../data/titanic/age/results_v2.xlsx')

In [7]:
# Concatenate training and test data into one frame so that all preparation
# steps are applied to both; ignore_index=True renumbers the rows 0..n-1.
#  (KNIME "Concatenate")
df = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
# Convert the data types automatically (pandas picks the dtype per column)
df = df.convert_dtypes()
df_app = df_app.convert_dtypes()

In [9]:
# Check for missing values: number of missing entries per column
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [10]:
# Age binning with manually chosen, left-closed intervals (right=False):
#  [0, 10) Child, [10, 20) Teenage, [20, 30) Young,
#  [30, 40) Adult, [40, 60) Middle, [60, 120) Senior
# Rows without a known Age get no bin (missing value).
bins = [0, 10, 20, 30, 40, 60, 120]
labels = ['Child', 'Teenage', 'Young', 'Adult', 'Middle', 'Senior']
df['AgeBinned'] = pd.cut(x=df['Age'], bins=bins, labels=labels, right=False)
df_app['AgeBinned'] = pd.cut(x=df_app['Age'], bins=bins, labels=labels, right=False)

In [11]:
# Check for missing values again, now including the new AgeBinned column
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
AgeBinned       263
dtype: int64

In [12]:
# Attach the predictions of the multi-class classification to the passengers
# (left join on PassengerId, so every passenger row is kept).
# validate='one_to_one' makes pandas raise a MergeError if a PassengerId
# occurs more than once on either side; without this check duplicate keys
# would silently multiply passenger rows and shift the row index.
df = df.merge(df_res, how='left', on='PassengerId', validate='one_to_one')

In [13]:
# Fill the gaps in AgeBinned_x with the values from AgeBinned_y (the two
# suffixed columns exist after the merge), store the result as AgeBinned
# and remove the two suffixed columns again.
df['AgeBinned'] = df['AgeBinned_x'].fillna(value=df['AgeBinned_y'])
df = df.drop(columns=['AgeBinned_x', 'AgeBinned_y'])
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBinned
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Young
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Young
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S,Child
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,Adult
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,Adult
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,Adult


In [14]:
# Check for missing values once more after filling AgeBinned
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
AgeBinned         0
dtype: int64

In [15]:
# Delete a wrong cabin number
#  (KNIME: "Rule Engine")
# Show every passenger recorded with cabin 'B51 B53 B55', then passenger 873
# before and after the correction.
display(df[df['Cabin'] == 'B51 B53 B55'])
display(df[df['PassengerId'] == 873])
# Select the row by its key (PassengerId) instead of a hard-coded row label,
# so the correct passenger is changed even if the row index is ever different.
df.loc[df['PassengerId'] == 873, 'Cabin'] = np.nan
display(df[df['PassengerId'] == 873])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBinned
679,680,1.0,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,Adult
872,873,0.0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,B51 B53 B55,S,Adult
1234,1235,,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C,Middle


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBinned
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,B51 B53 B55,S,Adult


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBinned
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,,S,Adult


In [16]:
# New feature KnownCabin: 1 if a cabin is recorded for the passenger, else 0
#  (KNIME: "Rule Engine")
df['KnownCabin'] = df['Cabin'].notna().astype('int')
#
# Not created for df_app because this feature is filtered out again later.

In [17]:
# Handle (i.e. estimate) missing values - part 1
#  (KNIME: "Missing Values")
# Embarked (nominal scale): 2 missing values => use the mode (most frequent
# value); mode() ignores missing entries (dropna=True).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode(dropna=True).iloc[0])

In [18]:
# Handle (i.e. estimate) missing values - part 2
#  (KNIME: "Missing Values")
# Fare (cardinal scale): 1 missing value => use the constant value 7.896
display(df[df['Fare'].isna()])
# Address the affected passenger by its key (PassengerId 1044) instead of a
# hard-coded row label, so the assignment does not depend on the row index.
df.loc[df['PassengerId'] == 1044, 'Fare'] = 7.896
display(df[df['PassengerId'] == 1044])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBinned,KnownCabin
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,Senior,0


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBinned,KnownCabin
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,7.896,,S,Senior,0


In [19]:
# Note: the features Age and Cabin are removed later on.
# Therefore no missing values are replaced for these two features.

In [20]:
# New feature Title
#  (KNIME: "Cell Splitter", "Column Rename", "Table Creator", "Cell Replacer")
# The title is the text between the first ', ' and the following '.' of Name.
# Spelling variants are then mapped to 'Miss' / 'Mrs' and the listed
# remaining titles are collected in the group 'Rare'.
df['Title'] = (
    df['Name']
    .str.split(', ', expand=True)[1]
    .str.split('.', expand=True)[0]
    .replace({
        'Ms': 'Miss', 'Mlle': 'Miss',
        'Mme': 'Mrs', 'Lady': 'Mrs', 'the Countess': 'Mrs', 'Dona': 'Mrs',
        'Dr': 'Rare', 'Col': 'Rare', 'Major': 'Rare', 'Jonkheer': 'Rare',
        'Capt': 'Rare', 'Sir': 'Rare', 'Don': 'Rare', 'Rev': 'Rare',
    })
)
#
# Not created for df_app because this feature is filtered out again later.

In [21]:
# New feature FamilySizeBinned
#  (KNIME: "Math Formula", "Table Creator", "Binner (Directory)")
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# itself), binned into left-closed intervals:
#  [0, 2) No, [2, 5) Small, [5, 99) Large
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
df_app['FamilySize'] = df_app['SibSp'].add(df_app['Parch']).add(1)
bins   = [0, 2, 5, 99]
labels = ['No', 'Small', 'Large']
df['FamilySizeBinned'] = pd.cut(x=df['FamilySize'], bins=bins, labels=labels, right=False)
df_app['FamilySizeBinned'] = pd.cut(x=df_app['FamilySize'], bins=bins, labels=labels, right=False)

In [22]:
# New feature FareBinned
#  (KNIME: "Table Creator", "Binner (Directory)")
# Left-closed intervals:
#  [-1, 8) Low, [8, 16) Medium, [16, 32) High, [32, 1024) VeryHigh
bins   = [-1, 8, 16, 32, 1024]
labels = ['Low', 'Medium', 'High', 'VeryHigh']
df['FareBinned'] = pd.cut(x=df['Fare'], bins=bins, labels=labels, right=False)
df_app['FareBinned'] = pd.cut(x=df_app['Fare'], bins=bins, labels=labels, right=False)

In [23]:
# Show the result of the feature engineering (columns, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   PassengerId       1309 non-null   Int64   
 1   Survived          891 non-null    Int64   
 2   Pclass            1309 non-null   Int64   
 3   Name              1309 non-null   string  
 4   Sex               1309 non-null   string  
 5   Age               1046 non-null   Float64 
 6   SibSp             1309 non-null   Int64   
 7   Parch             1309 non-null   Int64   
 8   Ticket            1309 non-null   string  
 9   Fare              1309 non-null   Float64 
 10  Cabin             294 non-null    string  
 11  Embarked          1309 non-null   string  
 12  AgeBinned         1309 non-null   category
 13  KnownCabin        1309 non-null   int32   
 14  Title             1309 non-null   string  
 15  FamilySize        1309 non-null   Int64   
 16  FamilySizeBinned  1309 n

In [24]:
# Clean-up: remove attributes (manually) that are no longer needed
#  (KNIME "Column Filter")
df = df.drop(columns=['Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'FamilySize', 'Ticket'])
df_app = df_app.drop(columns=['Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'FamilySize', 'Ticket'])
# Show the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   PassengerId       1309 non-null   Int64   
 1   Survived          891 non-null    Int64   
 2   Pclass            1309 non-null   Int64   
 3   Sex               1309 non-null   string  
 4   Embarked          1309 non-null   string  
 5   AgeBinned         1309 non-null   category
 6   KnownCabin        1309 non-null   int32   
 7   Title             1309 non-null   string  
 8   FamilySizeBinned  1309 non-null   category
 9   FareBinned        1309 non-null   category
dtypes: Int64(3), category(3), int32(1), string(3)
memory usage: 74.8 KB


In [25]:
# String variant (labelled "Version 3" in the original comment): cast every
# attribute to the pandas string dtype.
# Exception: the primary-key attribute PassengerId is cast back to int.
df = df.astype('string').astype({'PassengerId': 'int'})
df_app = df_app.astype('string').astype({'PassengerId': 'int'})
# Show the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PassengerId       1309 non-null   int32 
 1   Survived          891 non-null    string
 2   Pclass            1309 non-null   string
 3   Sex               1309 non-null   string
 4   Embarked          1309 non-null   string
 5   AgeBinned         1309 non-null   string
 6   KnownCabin        1309 non-null   string
 7   Title             1309 non-null   string
 8   FamilySizeBinned  1309 non-null   string
 9   FareBinned        1309 non-null   string
dtypes: int32(1), string(9)
memory usage: 97.3 KB


In [26]:
# Correlations: categorical attributes
# Both helpers come from the project module dst_correlation_functions (cf);
# their implementation is not visible in this notebook.
# NOTE(review): presumably returns a square matrix of pairwise association
# measures between the categorical columns - confirm in the module.
corr_matrix = cf.dst_categorical_correlation_matrix(df)
display(corr_matrix)
#
# NOTE(review): appears to keep only the strongest pairs of the matrix; the
# filter criterion is defined inside the helper - confirm.
corr_measures = cf.dst_correlation_measures_filtered(corr_matrix)
display(corr_measures)

Unnamed: 0,Survived,Pclass,Sex,Embarked,AgeBinned,KnownCabin,Title,FamilySizeBinned,FareBinned
Survived,1.0,0.195107,0.445849,0.105109,0.035078,0.262437,0.330078,0.167747,0.167388
Pclass,0.195107,1.0,0.118532,0.276939,0.279329,0.776096,0.181315,0.178457,0.578515
Sex,0.445849,0.118532,0.998333,0.114465,0.067982,0.134244,0.997245,0.282279,0.220821
Embarked,0.105109,0.276939,0.114465,1.0,0.074287,0.275706,0.158759,0.139615,0.274292
AgeBinned,0.035078,0.279329,0.067982,0.074287,1.0,0.257864,0.286122,0.186531,0.163351
KnownCabin,0.262437,0.776096,0.134244,0.275706,0.257864,0.997805,0.192596,0.210096,0.598643
Title,0.330078,0.181315,0.997245,0.158759,0.286122,0.192596,1.0,0.387857,0.199751
FamilySizeBinned,0.167747,0.178457,0.282279,0.139615,0.186531,0.210096,0.387857,1.0,0.388928
FareBinned,0.167388,0.578515,0.220821,0.274292,0.163351,0.598643,0.199751,0.388928,1.0


Survived    Survived      1.000000
Pclass      Pclass        1.000000
Sex         Sex           0.998333
KnownCabin  KnownCabin    0.997805
Sex         Title         0.997245
Pclass      KnownCabin    0.776096
dtype: float64

### Schlussfolgerungen

Zwischen den Attributen (bzw. Features) Sex und Title (0,997) sowie zwischen Pclass und KnownCabin (0,776) gibt es einen starken Zusammenhang. Wir können also aus jedem dieser beiden Paare jeweils ein Feature eliminieren. Wir entscheiden uns dafür, die ursprünglichen Attribute (Sex und Pclass) zu behalten und die neuen Features (Title und KnownCabin) herauszufiltern.

In [28]:
# Clean-up: remove the two strongly correlated new features (manually)
#  (KNIME "Column Filter")
df = df.drop(columns=['Title', 'KnownCabin'])
# Show the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PassengerId       1309 non-null   int32 
 1   Survived          891 non-null    string
 2   Pclass            1309 non-null   string
 3   Sex               1309 non-null   string
 4   Embarked          1309 non-null   string
 5   AgeBinned         1309 non-null   string
 6   FamilySizeBinned  1309 non-null   string
 7   FareBinned        1309 non-null   string
dtypes: int32(1), string(7)
memory usage: 76.8 KB


In [29]:
# Split the data again: rows with a known Survived value form the training
# data, rows without one form the test data.
#  (KNIME: "Row Splitter")
df_train = df.loc[df['Survived'].notna()]
df_test  = df.loc[df['Survived'].isna()]

In [30]:
# Filter irrelevant attributes
#  (KNIME: "Column Filter")
# Training data: remove PassengerId
df_train = df_train.drop(columns=['PassengerId'])
# Test data: remove Survived
df_test = df_test.drop(columns=['Survived'])

In [31]:
# Preview the first rows of the prepared training data
display(df_train.head())

Unnamed: 0,Survived,Pclass,Sex,Embarked,AgeBinned,FamilySizeBinned,FareBinned
0,0,3,male,S,Young,Small,Low
1,1,1,female,C,Adult,Small,VeryHigh
2,1,3,female,S,Young,No,Low
3,1,1,female,S,Adult,Small,VeryHigh
4,0,3,male,S,Adult,No,Medium


In [32]:
# Preview the first rows of the prepared test data
display(df_test.head())

Unnamed: 0,PassengerId,Pclass,Sex,Embarked,AgeBinned,FamilySizeBinned,FareBinned
891,892,3,male,Q,Adult,No,Low
892,893,3,female,S,Middle,Small,Low
893,894,2,male,Q,Senior,No,Medium
894,895,3,male,S,Young,No,Medium
895,896,3,female,S,Young,Small,Medium


In [33]:
# Preview the first rows of the prepared application data
display(df_app.head())

Unnamed: 0,PassengerId,Pclass,Sex,Embarked,AgeBinned,FamilySizeBinned,FareBinned
0,1310,1,male,C,Senior,No,Low
1,1311,1,male,C,Adult,No,High
2,1312,2,male,S,Middle,No,High
3,1313,1,male,S,Adult,No,Low
4,1314,1,male,S,Senior,Small,Low


In [34]:
# Save the data as Excel files (without writing the row index)
#  (KNIME: "Excel Writer")
# Training data
df_train.to_excel('../../data/titanic/new/training_v6.xlsx', index=False)
# Test data
df_test.to_excel('../../data/titanic/new/test_v6.xlsx', index=False)
# Application data
df_app.to_excel('../../data/titanic/new/application_v6.xlsx', index=False)