## Phase 3 (Data Preparation): v2

* Autorin: Anna (i3-Versicherung)
* Webseite: [Data Science Training - Kapitel 4](https://data-science.training/kapitel-4/)
* Datum: 23.03.2023

Wir führen eine verbesserte Datenvorbereitung durch. Die fehlenden Werte werden folgendermaßen geschätzt:

* Embarked: Modus (=> 'S')
* Fare: fester Wert 7,896
* Age: Interpolation mit Methode "linear" (default)
* Cabin: 'Unknown' (als fester Wert)

In [4]:
# Pandas Paket (Package) importieren
#  Datenstrukturen und Datenanalyse, I/O
#  https://pandas.pydata.org/pandas-docs/stable/
import pandas as pd
# NumPy Paket (Package) importieren
#  Mehrdimensionale Datenstrukturen (Vektoren, Matrizen, Tensoren, Arrays), Lineare Algebra
#  https://numpy.org/doc/
import numpy as np
# MinMaxScaler aus scikit-learn Paket
#  https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Read the training, test and application data from CSV into pandas DataFrames
#  (KNIME: "CSV Reader")
csv_paths = (
    '../../data/titanic/original/train.csv',
    '../../data/titanic/original/test.csv',
    '../../data/titanic/original/application.csv',
)
# One DataFrame per file, unpacked in the same order as the paths above
df_train, df_test, df_app = [pd.read_csv(csv_path) for csv_path in csv_paths]

In [6]:
# Stack the training rows and the test rows into one frame with a fresh 0..n-1 index
#  (KNIME: "Concatenate")
df = pd.concat(
    objs=[df_train, df_test],
    axis=0,
    ignore_index=True,
)

In [7]:
# Let pandas convert every column of the combined frame to its best-fitting dtype
df = df.convert_dtypes()

In [8]:
# Count the missing values per column (before imputation)
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [9]:
# Handle missing values
#  (KNIME: "Missing Values")
#
# Embarked (nominal scale): 2 missing values => use the mode (most frequent value)
embarked_mode  = df['Embarked'].dropna().mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)
#
# Fare (cardinal scale): 1 missing value => use the constant value 7.896
df['Fare'] = df['Fare'].fillna(7.896) # Quick variant: fills EVERY missing Fare (not generally valid!)
# Alternative variant that sets exactly this one missing value
#display(df[df['PassengerId'] == 1044])
#df.loc[1043, 'Fare'] = 7.896
#display(df[df['PassengerId'] == 1044])
#
# Age (cardinal scale): 263 missing values => linear interpolation along the row order.
# The interpolation runs on df_train and df_test separately so that no value is
# interpolated across the train/test boundary. The pieces are then re-joined with a
# fresh index, which lines up with df because df is the concatenation of the same two
# frames (also with ignore_index=True).
# limit_direction='both' additionally fills a gap at the very start of a file; the
# default (forward only) would leave such leading values missing.
age_1 = df_train['Age'].interpolate(limit_direction='both')
age_2 = df_test['Age'].interpolate(limit_direction='both')
age   = pd.concat([age_1, age_2], ignore_index=True)
df['Age'] = age
#
# Cabin (nominal scale): 1014 missing values => use the fixed value 'Unknown'
df['Cabin'] = df['Cabin'].fillna('Unknown')

In [10]:
# Count the missing values per column again (after imputation)
df.isna().sum()

PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [11]:
# Filter out strings, i.e. keep only the attributes that hold numeric values.
#
# The same filter is applied to the combined training/test frame and to the
# application data, each frame being replaced by its numeric-only version.
df, df_app = (
    frame.select_dtypes(include=[np.number])
    for frame in (df, df_app)
)

In [12]:
# Extract the descriptive attributes
# (without PassengerId and Survived; i.e. Pclass, Age, SibSp, Parch and Fare)
#
# The columns are selected by name rather than by position, so both arrays are
# guaranteed to hold the same five features in the same order, whatever other
# numeric columns each frame carries and however they are ordered.
feature_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = df[feature_cols].values
X_app = df_app[feature_cols].values

In [13]:
# Normalise the data to the interval [0, 1]
#
# The scaler learns its per-column parameters from X only ...
scaler = MinMaxScaler().fit(X)
# ... and the same fitted scaler is then applied to X and to the application data
Xt = scaler.transform(X)
Xt_app = scaler.transform(X_app)

In [14]:
# Copy the transformed values back into the DataFrames
scaled_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
df[scaled_cols] = Xt
df_app[scaled_cols] = Xt_app

In [15]:
# Split the data again
#  (KNIME: "Row Splitter")
# Rows with a Survived value become the training data, rows without one the test data
has_label = df['Survived'].notna()
df_train = df[has_label]
df_test  = df[~has_label]

In [16]:
# Remove irrelevant attributes
#  (KNIME: "Column Filter")
#
# Training data: drop PassengerId
df_train = df_train.drop(columns=['PassengerId'])
#
# Test data: drop Survived
df_test = df_test.drop(columns=['Survived'])

In [17]:
# Show the prepared training data
display(df_train)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,1.0,0.273456,0.125,0.000000,0.014151
1,1,0.0,0.473882,0.125,0.000000,0.139136
2,1,1.0,0.323563,0.000,0.000000,0.015469
3,1,0.0,0.436302,0.125,0.000000,0.103644
4,0,1.0,0.436302,0.000,0.000000,0.015713
...,...,...,...,...,...,...
886,0,0.5,0.336089,0.000,0.000000,0.025374
887,1,0.0,0.235876,0.000,0.000000,0.058556
888,0,1.0,0.279719,0.125,0.222222,0.045771
889,1,0.0,0.323563,0.000,0.000000,0.058556


In [18]:
# Show the prepared test data
display(df_test)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
891,892,1.0,0.430039,0.000,0.000000,0.015282
892,893,1.0,0.586622,0.125,0.000000,0.013663
893,894,0.5,0.774521,0.000,0.000000,0.018909
894,895,1.0,0.336089,0.000,0.000000,0.016908
895,896,1.0,0.273456,0.125,0.111111,0.023984
...,...,...,...,...,...,...
1304,1305,1.0,0.417512,0.000,0.000000,0.015713
1305,1306,0.0,0.486409,0.000,0.000000,0.212559
1306,1307,1.0,0.480145,0.000,0.000000,0.014151
1307,1308,1.0,0.480145,0.000,0.000000,0.015713


In [19]:
# Save the data as Excel files
#  (KNIME: "Excel Writer")
#
# Each frame is paired with its target file: training, test and application data
excel_targets = (
    (df_train, '../../data/titanic/new/training_v2.xlsx'),
    (df_test, '../../data/titanic/new/test_v2.xlsx'),
    (df_app, '../../data/titanic/new/application_v2.xlsx'),
)
# The row index is not written to the files
for frame, xlsx_path in excel_targets:
    frame.to_excel(xlsx_path, index=False)