In [1]:
import kaggle
import pandas as pd
import numpy as np
import zipfile
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import metrics 
from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV




In [2]:
# !kaggle competitions download -c titanic
# with zipfile.ZipFile('titanic.zip', 'r') as zip_ref:
#     zip_ref.extractall()

In [4]:
# Load the training data and derive the model features column by column.
df = pd.read_csv('train.csv')

# Add the isFemale flag and drop the raw text column it replaces.
df['isFemale'] = df['Sex'] == 'female'
df = df.drop(columns='Sex')

# Fill missing Embarked values with the mode.  The result is assigned back:
# fillna(inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify df.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# One-hot encode the port (Embarked_C / Embarked_Q / Embarked_S).  The raw
# 'Embarked' column stays in the frame.
df = df.join(pd.get_dummies(df['Embarked'], prefix='Embarked'))

# Dummies for the passenger class (Class_1 / Class_2 / Class_3).  The raw
# 'Pclass' column stays in the frame.
df = df.join(pd.get_dummies(df['Pclass'], prefix='Class'))

# Dummy variables for Mr, Miss, Mrs and Master.  The title is taken as the
# text between the first comma and the next period of the name.
title = df['Name'].map(lambda name: name.split(',')[1].split('.')[0].lstrip())

df['isMr'] = title == 'Mr'
df['isMiss'] = title == 'Miss'
df['isMrs'] = title == 'Mrs'
df['isMaster'] = title == 'Master'

df['logFare'] = np.log(df['Fare'] + 1)

# Reduce the cabin to its first character; missing cabins become 'X'.
# fillna returns a new Series, so its result has to be used -- otherwise the
# missing entries are stringified as 'nan' and end up in a 'Cabin_n' dummy.
cabin_deck = df['Cabin'].fillna('X').map(lambda cabin: str(cabin)[0])
df = df.drop(columns='Cabin').join(pd.get_dummies(cabin_deck, prefix='Cabin'))

# Length of the ticket string.  len(str(...)) always yields an integer, so no
# missing-value fill is needed afterwards.
df['lenTicket'] = df['Ticket'].map(lambda ticket: len(str(ticket)))

# True for passengers with no siblings/spouse, no parents/children and
# isFemale == False.
df['aloneM'] = (df['SibSp'] + df['Parch'] + df['isFemale']) == 0

In [5]:
# Rows whose Age is missing and has to be imputed.
mask = df['Age'].isna()

# Features used to find the most similar passenger with a known age.
exog_impute = ['SibSp', 'Fare', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'isFemale',
    'Class_1', 'Class_2', 'Class_3', 'isMiss', 'isMr', 'isMrs', 'isMaster']

# Standardise the imputation features so every column contributes on the same
# scale to the distance.  df_std gets a fresh 0..n-1 index; the boolean mask
# below is aligned on index labels, so this relies on df having the same index.
impute_features = df[exog_impute].astype(float)
col_names = impute_features.columns
scaler = StandardScaler().fit(impute_features.values)
df_std = pd.DataFrame(scaler.transform(impute_features.values), columns=col_names)

# Split into rows that need an age and rows that can donate one.
df_ageMissing = df_std[mask]
df_age = df_std[~mask].copy()
df_age.insert(0, 'Age', df.loc[~mask, 'Age'])
# Positional index so that KDTree row numbers map directly onto df_age rows.
df_age = df_age.reset_index(drop=True)

tree = KDTree(df_age[exog_impute].values, leaf_size=2)

# 1-nearest-neighbour imputation: every passenger without an age receives the
# age of the closest passenger with a known age.  All missing rows are queried
# in one call; the guard skips the query when nothing is missing.
imputed_values_knn_1 = []
if mask.any():
    _, nearest = tree.query(df_ageMissing.values, k=1)
    imputed_values_knn_1 = df_age['Age'].to_numpy()[nearest[:, 0]].tolist()
    df.loc[mask, 'Age'] = imputed_values_knn_1

# Age-derived features, computed on the completed Age column.
df['AgeSq'] = df['Age'] ** 2
df['logAge'] = np.log(df['Age'] + 1)
df['isUnderage'] = df['Age'] < 18

In [6]:
# Regressors of the logistic model.  Embarked_S and Class_1 are not listed;
# an explicit intercept column is added below.
exog = [
    'SibSp', 'Age', 'logFare',
    'Embarked_C', 'Embarked_Q',
    'Class_2', 'Class_3',
    'isMiss', 'isMr', 'isMrs', 'isMaster',
    'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
]

# Design matrix with intercept, and the target, both cast to float.
X_train = sm.add_constant(df[exog]).astype(float)
Y_train = df['Survived'].astype(float)

# Fit the logit model and show the coefficient table.
logit_spec = sm.Logit(Y_train, X_train)
model = logit_spec.fit()
print(model.summary())

# Share of training rows classified correctly at a 0.5 probability cut-off.
predicted_survival = model.predict(X_train) > 0.5
np.round(np.mean(predicted_survival == df['Survived']), 4)

Optimization terminated successfully.
         Current function value: 0.407777
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               Survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      874
Method:                           MLE   Df Model:                           16
Date:                Tue, 04 Jun 2024   Pseudo R-squ.:                  0.3876
Time:                        08:28:12   Log-Likelihood:                -363.33
converged:                       True   LL-Null:                       -593.33
Covariance Type:            nonrobust   LLR p-value:                 9.040e-88
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0115      0.893     -0.013      0.990      -1.762       1.739
SibSp         -0.6810      0.

0.8316

##### Random Forest

In [19]:
# Feature list for the tree-based model: every dummy of each group is included.
exog = [
    'SibSp', 'Parch', 'Age', 'logFare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'Class_1', 'Class_2', 'Class_3',
    'isMiss', 'isMr', 'isMrs', 'isMaster',
    'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
]

# Feature matrix and target as floats.
X_train = df[exog].astype(float)
Y_train = df['Survived'].astype(float)

In [35]:
# Base estimator for the search.  max_features='sqrt' is what the former
# 'auto' setting meant for classifiers; 'auto' is no longer accepted by
# current scikit-learn releases.
rf = RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=0, n_jobs=-1)

# Only the number of trees is varied; the leaf and split sizes are fixed.
param_grid = {"min_samples_leaf" : [1], "min_samples_split" : [7], "n_estimators": [200,300,400,500]}

# 3-fold cross-validated accuracy for every grid point.
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

gs = gs.fit(X_train, Y_train)

# Best mean cross-validated accuracy and the parameters that achieved it.
print(gs.best_score_)
print(gs.best_params_)

  warn(


0.8294051627384961
{'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 500}


In [37]:
# Random forest with 500 trees, min_samples_split=7 and min_samples_leaf=1,
# fitted on the full training set.
clf = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=7,
    min_samples_leaf=1,
    random_state=0,
)
clf.fit(X_train, Y_train)

# Accuracy on the very rows the model was fitted on (in-sample).
y_pred = clf.predict(X_train)
train_accuracy = metrics.accuracy_score(Y_train, y_pred)

print("ACCURACY OF THE MODEL:", train_accuracy)

ACCURACY OF THE MODEL: 0.9225589225589226


In [38]:
# Random forest with default hyper-parameters, fitted on the full training set.
# NOTE(review): this rebinds `clf`, so a later `clf.predict(...)` uses this
# default forest -- confirm that is the model meant for the submission.
default_forest = RandomForestClassifier(random_state=0)
clf = default_forest
clf.fit(X_train, Y_train)

# Accuracy on the very rows the model was fitted on (in-sample).
y_pred = clf.predict(X_train)
train_accuracy = metrics.accuracy_score(Y_train, y_pred)

print("ACCURACY OF THE MODEL:", train_accuracy)

ACCURACY OF THE MODEL: 0.9797979797979798


#### Modell fertig erstellt

In [39]:
# Load the test data and derive the same features as for the training frame.
df_test = pd.read_csv('test.csv')

# Add the isFemale flag and drop the raw text column it replaces.
df_test['isFemale'] = df_test['Sex'] == 'female'
df_test = df_test.drop(columns='Sex')

# Fill missing Embarked values with the mode and missing Fare values with the
# median (both taken from the test file itself).  Results are assigned back:
# fillna(inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify df_test.
df_test['Embarked'] = df_test['Embarked'].fillna(df_test['Embarked'].mode()[0])
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

# One-hot encode the port (Embarked_C / Embarked_Q / Embarked_S) and drop the
# raw column.
embarked_dummies = pd.get_dummies(df_test['Embarked'], prefix='Embarked')
df_test = df_test.drop(columns='Embarked').join(embarked_dummies)

# Dummies for the passenger class (Class_1 / Class_2 / Class_3); the raw
# column is dropped.
class_dummies = pd.get_dummies(df_test['Pclass'], prefix='Class')
df_test = df_test.drop(columns='Pclass').join(class_dummies)

# Dummy variables for Mr, Miss, Mrs and Master.  The title is taken as the
# text between the first comma and the next period of the name.
title = df_test['Name'].map(lambda name: name.split(',')[1].split('.')[0].lstrip())

df_test['isMr'] = title == 'Mr'
df_test['isMiss'] = title == 'Miss'
df_test['isMrs'] = title == 'Mrs'
df_test['isMaster'] = title == 'Master'

df_test['logFare'] = np.log(df_test['Fare'] + 1)

# Reduce the cabin to its first character; missing cabins become 'X'.
# fillna returns a new Series, so its result has to be used -- otherwise the
# missing entries are stringified as 'nan' and end up in a 'Cabin_n' dummy.
cabin_deck = df_test['Cabin'].fillna('X').map(lambda cabin: str(cabin)[0])
df_test = df_test.drop(columns='Cabin').join(pd.get_dummies(cabin_deck, prefix='Cabin'))


In [40]:
# Test rows whose Age is missing.
mask = df_test['Age'].isna()
age_known = ~mask

# Standardise the imputation features of the test frame; the scaler is fitted
# on the test rows themselves.
impute_features = df_test[exog_impute]
col_names = impute_features.columns
raw_matrix = impute_features.values

scaler = StandardScaler()
scaler.fit(raw_matrix)
df_std = pd.DataFrame(scaler.transform(raw_matrix), columns=col_names)

# Rows needing an age vs. rows that can donate one (with their Age in front).
df_ageMissing = df_std[mask]
df_age = df_std[age_known]
df_age.insert(0, 'Age', df_test[age_known]['Age'])

# Positional index so that neighbour row numbers map directly onto df_age.
df_age.reset_index(drop=True, inplace=True)
# df_ageMissing.reset_index(drop=True, inplace=True)


In [41]:
# KD-tree over the standardised features of the test passengers with a known age.
tree = KDTree(df_age[exog_impute].values, leaf_size=2)

# 1-nearest-neighbour imputation: every passenger without an age receives the
# age of the closest passenger with a known age.  All missing rows are queried
# in one call; the guard skips the query when nothing is missing.
imputed_values_knn_1 = []
if mask.any():
    _, nearest = tree.query(df_ageMissing.values, k=1)
    imputed_values_knn_1 = df_age['Age'].to_numpy()[nearest[:, 0]].tolist()
    df_test.loc[mask, 'Age'] = imputed_values_knn_1

In [42]:
# Same regressors, in the same order, as the fitted logistic model.
exog = ['SibSp', 'Age', 'logFare', 'Embarked_C', 'Embarked_Q',
    'Class_2', 'Class_3', 'isMiss', 'isMr', 'isMrs', 'isMaster',
    'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E']

X_test = df_test[exog]
# has_constant='add' always prepends the intercept column.  The default
# ('skip') leaves the frame unchanged if any column happens to be constant in
# the test data, and the matrix would then not line up with the fitted
# coefficients.
X_test = sm.add_constant(X_test, has_constant='add')
X_test = X_test.astype(float)

# Predicted survival probabilities from the fitted logit model.
vec = model.predict(X_test)

In [43]:
# Turn the predicted probabilities into 0/1 labels at a 0.5 cut-off and pair
# them with the passenger ids.
survived_flags = (vec > 0.5).astype(int)
data = dict(PassengerId=df_test['PassengerId'], Survived=survived_flags)
result = pd.DataFrame(data)
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [44]:
# Write the logistic-regression predictions held in `result` to submission.csv,
# without the index column.
result.to_csv('submission.csv', index=False)

##### Random Forest

In [45]:
# Reload the test data and rebuild the features for the tree-based model.
df_test = pd.read_csv('test.csv')

# Add the isFemale flag and drop the raw text column it replaces.
df_test['isFemale'] = df_test['Sex'] == 'female'
df_test = df_test.drop(columns='Sex')

# Fill missing Embarked values with the mode.  The result is assigned back:
# fillna(inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify df_test.
df_test['Embarked'] = df_test['Embarked'].fillna(df_test['Embarked'].mode()[0])
# One-hot encode the port (Embarked_C / Embarked_Q / Embarked_S) and drop the
# raw column.
embarked_dummies = pd.get_dummies(df_test['Embarked'], prefix='Embarked')
df_test = df_test.drop(columns='Embarked').join(embarked_dummies)

# Dummies for the passenger class (Class_1 / Class_2 / Class_3); the raw
# 'Pclass' column stays in the frame.
df_test = df_test.join(pd.get_dummies(df_test['Pclass'], prefix='Class'))

# Dummy variables for Mr, Miss, Mrs and Master.  The title is taken as the
# text between the first comma and the next period of the name.
title = df_test['Name'].map(lambda name: name.split(',')[1].split('.')[0].lstrip())

df_test['isMr'] = title == 'Mr'
df_test['isMiss'] = title == 'Miss'
df_test['isMrs'] = title == 'Mrs'
df_test['isMaster'] = title == 'Master'

# Missing fares are replaced by the median fare of the test file.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())
df_test['logFare'] = np.log(df_test['Fare'] + 1)

# Reduce the cabin to its first character; missing cabins become 'X'.
# fillna returns a new Series, so its result has to be used -- otherwise the
# missing entries are stringified as 'nan' and end up in a 'Cabin_n' dummy.
cabin_deck = df_test['Cabin'].fillna('X').map(lambda cabin: str(cabin)[0])
df_test = df_test.drop(columns='Cabin').join(pd.get_dummies(cabin_deck, prefix='Cabin'))

# True for passengers with no siblings/spouse, no parents/children and
# isFemale == False.
df_test['aloneM'] = (df_test['SibSp'] + df_test['Parch'] + df_test['isFemale']) == 0


# Test rows whose Age is missing and has to be imputed.
mask = df_test['Age'].isna()

# Standardise the imputation features so every column contributes on the same
# scale to the distance.  df_test_std gets a fresh 0..n-1 index; the boolean
# mask below is aligned on index labels, so this relies on df_test having the
# same index.
impute_features = df_test[exog_impute].astype(float)
col_names = impute_features.columns
scaler = StandardScaler().fit(impute_features.values)
df_test_std = pd.DataFrame(scaler.transform(impute_features.values), columns=col_names)

# Split into rows that need an age and rows that can donate one.
df_test_ageMissing = df_test_std[mask]
df_test_age = df_test_std[~mask].copy()
df_test_age.insert(0, 'Age', df_test.loc[~mask, 'Age'])
# Positional index so that KDTree row numbers map directly onto df_test_age rows.
df_test_age = df_test_age.reset_index(drop=True)

tree = KDTree(df_test_age[exog_impute].values, leaf_size=2)

# 1-nearest-neighbour imputation: every passenger without an age receives the
# age of the closest passenger with a known age.  All missing rows are queried
# in one call; the guard skips the query when nothing is missing.
imputed_values_knn_1 = []
if mask.any():
    _, nearest = tree.query(df_test_ageMissing.values, k=1)
    imputed_values_knn_1 = df_test_age['Age'].to_numpy()[nearest[:, 0]].tolist()
    df_test.loc[mask, 'Age'] = imputed_values_knn_1

# Age-derived features, computed on the completed Age column.
df_test['AgeSq'] = df_test['Age'] ** 2
df_test['logAge'] = np.log(df_test['Age'] + 1)
df_test['isUnderage'] = df_test['Age'] < 18


In [46]:
# Feature list for the classifier: every dummy of each group is included.
exog = [
    'SibSp', 'Parch', 'Age', 'logFare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'Class_1', 'Class_2', 'Class_3',
    'isMiss', 'isMr', 'isMrs', 'isMaster',
    'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
]

# Predict with whichever estimator `clf` is currently bound to.
X_test = df_test.loc[:, exog]
y_pred = clf.predict(X_test)

In [47]:
# Pair the integer class predictions with the passenger ids.
predicted_labels = y_pred.astype(int)
data = dict(PassengerId=df_test['PassengerId'], Survived=predicted_labels)
result_2 = pd.DataFrame(data)

In [48]:
# Write the predictions held in `result_2` to submission.csv, without the index column.
# NOTE(review): this is the same path the earlier export of `result` writes to,
# so that file is overwritten here -- confirm only the last submission is needed.
result_2.to_csv('submission.csv', index=False)

In [50]:
# Small neural network: a single hidden layer with 4 units, up to 1000
# iterations.  Note that this rebinds `clf`.
clf = MLPClassifier(hidden_layer_sizes=[4], max_iter=1000, random_state=1)
clf.fit(X_train, Y_train)

# Mean accuracy on the rows the network was fitted on (in-sample).
clf.score(X_train, Y_train)

0.8406285072951739