## Laden der Module

In [None]:
import pandas as pd                                                
import matplotlib.pyplot as plt                                    
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from graphviz import Source
from sklearn.tree import export_graphviz

## Laden der Daten

In [None]:
# Read the Titanic passenger table; the file is semicolon-separated and
# uses a comma as the decimal mark.
titanic_data = pd.read_csv(
    '/content/sample_data/titanic_data.csv',
    sep=';',
    decimal=',',
)

## Data Understanding

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes).
titanic_data.info()

Pclass: Passagier-Klasse (P1, P2, P3)



Age: Alter

Fare: Ticketpreis

Embarked: Hafen der Einschiffung (C = Cherbourg; Q = Queenstown; S = Southampton) 

FamilySize: Anzahl der Familienmitglieder  

GenderClass: Geschlecht oder Kind (male, female, child)

Survived: Label (0: nicht überlebt, 1: überlebt)

In [None]:
# Show the descriptive statistics that DataFrame.describe() computes for the frame.
titanic_data.describe()

In [None]:
# Count plot of the Survived label: one bar per label value.
sns.catplot(
    kind='count',
    x='Survived',
    data=titanic_data,
)

In [None]:
# Count plot per GenderClass category, with bars split by the Survived label.
sns.catplot(
    kind='count',
    x='GenderClass',
    hue='Survived',
    data=titanic_data,
)

In [None]:
# Count plot per passenger class, with bars split by the Survived label.
sns.catplot(
    kind='count',
    x='Pclass',
    hue='Survived',
    data=titanic_data,
)

## Data Preparation

In [None]:
# One-hot encode the categorical columns, then separate the feature matrix
# (every column except the label) from the Survived label.
titanic = pd.get_dummies(
    titanic_data,
    columns=['Pclass', 'GenderClass', 'Embarked'],
)
X = titanic.drop(columns='Survived')
y = titanic['Survived']


In [None]:
# Hold out 20 % of the rows as the test split; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [None]:
# Show the dimensions of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

## Modeling

In [None]:
# Baseline model: a decision tree with default hyperparameters (only the seed
# is fixed), fitted on the training split.
model_1 = DecisionTreeClassifier(random_state=0)
model_1.fit(X=X_train, y=y_train)

In [None]:
# Export the fitted baseline tree to Graphviz DOT source and wrap it in a
# graphviz.Source object so it can be rendered.
graph_1 = Source(
    export_graphviz(
        model_1,
        out_file=None,  # return the DOT source as a string instead of writing a file
        filled=True,
        rounded=True,
        special_characters=True,
        # Pass a plain list of names rather than a pandas Index.
        feature_names=list(X.columns),
    )
)
graph_1.format = 'png'
# Write model_1.png without launching an external viewer: view=True needs a
# desktop viewer application, which is typically not available from a
# hosted or headless notebook kernel.
graph_1.render('model_1', view=False)
# Last expression: display the tree inline as the cell output.
graph_1

## Evaluation

In [None]:
# Predict on both splits with the baseline tree and report the accuracy on each,
# so train and test performance can be compared side by side.
y_pred_train_1 = model_1.predict(X_train)
y_pred_test_1 = model_1.predict(X_test)
print(
    'Accuracy Score Train Data:',
    accuracy_score(y_true=y_train, y_pred=y_pred_train_1),
)
print(
    'Accuracy Score Test Data:',
    accuracy_score(y_true=y_test, y_pred=y_pred_test_1),
)


In [None]:
# Confusion matrix of the baseline tree on the test split, labelled so that
# rows are the actual outcome and columns the predicted outcome.
confusion_matrix_1 = pd.DataFrame(
    confusion_matrix(y_test, y_pred_test_1),
    index=['nicht überlebt', 'überlebt'],
    columns=['Vorhersage nicht überlebt', 'Vorhersage überlebt'],
)
print(confusion_matrix_1)

## Optimierung: Hyperparameteroptimierung

In [None]:
# Hyperparameter search space shared by the grid searches below.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 11)),    # 2 .. 10
    'max_features': list(range(2, 7)),  # 2 .. 6
}

In [None]:
# 5-fold cross-validated grid search over `params`, using the baseline tree
# as the estimator template; verbose=2 logs progress for each fit.
grid_1 = GridSearchCV(
    estimator=model_1,
    param_grid=params,
    cv=5,
    verbose=2,
)

grid_1.fit(X_train, y_train)

In [None]:
# Display the estimator (and thereby its hyperparameters) selected by the first grid search.
grid_1.best_estimator_ 

In [None]:
# Build the tuned tree directly from the winning grid-search parameters
# instead of hand-copying them from a previous run's output. This keeps
# model_2 in sync with grid_1 on every re-run and ensures that no tuned
# parameter (e.g. 'criterion') is silently left at its default.
model_2 = DecisionTreeClassifier(random_state=0, **grid_1.best_params_)
model_2.fit(X_train, y_train)

In [None]:
# Export the tuned tree (model_2) to Graphviz DOT source and wrap it in a
# graphviz.Source object so it can be rendered.
graph_2 = Source(
    export_graphviz(
        model_2,
        out_file=None,  # return the DOT source as a string instead of writing a file
        filled=True,
        rounded=True,
        special_characters=True,
        # Pass a plain list of names rather than a pandas Index.
        feature_names=list(X.columns),
    )
)
graph_2.format = 'png'
# Write model_2.png without launching an external viewer: view=True needs a
# desktop viewer application, which is typically not available from a
# hosted or headless notebook kernel.
graph_2.render('model_2', view=False)
# Last expression: display the tree inline as the cell output.
graph_2

In [None]:
# Predict on both splits with the tuned tree and report the accuracy on each.
y_pred_train_2 = model_2.predict(X_train)
y_pred_test_2 = model_2.predict(X_test)
print(
    'Accuracy Score Train Data:',
    accuracy_score(y_true=y_train, y_pred=y_pred_train_2),
)
print(
    'Accuracy Score Test Data:',
    accuracy_score(y_true=y_test, y_pred=y_pred_test_2),
)

In [None]:
# Confusion matrix of the tuned tree on the test split, labelled so that
# rows are the actual outcome and columns the predicted outcome.
confusion_matrix_2 = pd.DataFrame(
    confusion_matrix(y_test, y_pred_test_2),
    index=['nicht überlebt', 'überlebt'],
    columns=['Vorhersage nicht überlebt', 'Vorhersage überlebt'],
)
print(confusion_matrix_2)

## Optimierung 2: Feature Selection & Hyperparameteroptimierung

In [None]:
# Feature selection: remove two of the passenger-class dummy columns from
# both the training and the test features.
X_train_2 = X_train.drop(columns=['Pclass_P1', 'Pclass_P2'])
X_test_2 = X_test.drop(columns=['Pclass_P1', 'Pclass_P2'])

# Repeat the 5-fold grid search over `params` on the reduced feature set.
# NOTE(review): candidates are ranked by ROC AUC here, whereas grid_1 relies
# on the default scoring - confirm that the different metric is intended.
grid_2 = GridSearchCV(
    estimator=model_1,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

grid_2.fit(X_train_2, y_train)

In [None]:
# Display the estimator (and thereby its hyperparameters) selected by the second grid search.
grid_2.best_estimator_ 

In [None]:
# Build the tree for the reduced feature set directly from the winning
# parameters of the second grid search instead of hand-copying them from a
# previous run's output. This keeps model_3 in sync with grid_2 on every
# re-run and ensures that no tuned parameter (e.g. 'criterion') is silently
# left at its default.
model_3 = DecisionTreeClassifier(random_state=0, **grid_2.best_params_)
model_3.fit(X_train_2, y_train)

In [None]:
# Export the tree trained on the reduced feature set (model_3) to Graphviz
# DOT source and wrap it in a graphviz.Source object so it can be rendered.
graph_3 = Source(
    export_graphviz(
        model_3,
        out_file=None,  # return the DOT source as a string instead of writing a file
        filled=True,
        rounded=True,
        special_characters=True,
        # Take the names from the frame model_3 was fitted on (as a plain
        # list) instead of re-deriving them from X with a second drop().
        feature_names=list(X_train_2.columns),
    )
)
graph_3.format = 'png'
# Write model_3.png without launching an external viewer: view=True needs a
# desktop viewer application, which is typically not available from a
# hosted or headless notebook kernel.
graph_3.render('model_3', view=False)
# Last expression: display the tree inline as the cell output.
graph_3

In [None]:
# Predict on both reduced-feature splits with model_3 and report the accuracy on each.
y_pred_train_3 = model_3.predict(X_train_2)
y_pred_test_3 = model_3.predict(X_test_2)
print(
    'Accuracy Score Train Data:',
    accuracy_score(y_true=y_train, y_pred=y_pred_train_3),
)
print(
    'Accuracy Score Test Data:',
    accuracy_score(y_true=y_test, y_pred=y_pred_test_3),
)

In [None]:
# Confusion matrix of model_3 on the test split, labelled so that rows are
# the actual outcome and columns the predicted outcome.
confusion_matrix_3 = pd.DataFrame(
    confusion_matrix(y_test, y_pred_test_3),
    index=['nicht überlebt', 'überlebt'],
    columns=['Vorhersage nicht überlebt', 'Vorhersage überlebt'],
)
print(confusion_matrix_3)