In [9]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

In [10]:
# Load the prepared dataset and replace every missing value with the mean of
# its column, without mutating a frame in place.
# NOTE(review): the means are taken over all rows, before the train/test split
# performed further down — confirm that this is intended.
# NOTE(review): the 'Unnamed: 0.1' / 'Unnamed: 0' columns shown by head() look
# like row indices saved into the CSV; verify they should be used as features.
data = (
    pd.read_csv('titanic_prepared.csv')
    .pipe(lambda frame: frame.fillna(frame.mean()))
)

data.head()

Unnamed: 0.1,Unnamed: 0,sex,row_number,liters_drunk,drink,check_number,label,age_child,age_adult,age_old,morning,day,evening
0,0,0,90.0,1.0,1,8092,1,False,False,True,False,False,True
1,1,0,81.0,4.0,1,4252,1,False,True,False,False,True,False
2,2,1,14.0,3.0,0,6913,1,False,False,True,False,False,True
3,3,1,0.0,1.0,0,8479,1,False,False,True,False,True,False
4,4,1,84.0,1.0,0,6279,0,False,False,True,True,False,False


In [11]:
# Hold out a 0.1 fraction of the rows for evaluation; the fixed random_state
# keeps the split reproducible between runs.
train, test = train_test_split(data, random_state=0, test_size=0.1)

# Separate the 'label' target from the remaining feature columns in each split.
X_train, y_train = train.drop(columns='label'), train['label']
X_test, y_test = test.drop(columns='label'), test['label']

In [12]:
# Candidate classifiers, keyed by the display name used when printing scores.
# Insertion order is the order in which they are fitted and reported.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=200, C=0.1)),
    ("Decision Tree", DecisionTreeClassifier(random_state=0, criterion='entropy')),
    ("XGBoost", XGBClassifier(n_estimators=40, max_depth=2)),
])

In [14]:
# Fit every candidate on the training split, then report its accuracy on the
# held-out split. The estimators stored in `models` are fitted in place.
for name, model in models.items():
    model.fit(X_train, y_train)

    y_test_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)

    print(f"{name}. Accuracy: {accuracy}")

Logistic Regression. Accuracy: 0.8780120481927711
Decision Tree. Accuracy: 0.8930722891566265
XGBoost. Accuracy: 0.9051204819277109


In [15]:
# Rank the features by the importance the fitted decision tree assigned them.
# `imp_f` ends up as a list of (feature name, importance) pairs, most
# important first.
importances = models["Decision Tree"].feature_importances_
features = X_test.columns
indices = np.argsort(importances)

imp_f = sorted(
    zip(features[indices], importances[indices]),
    key=lambda pair: pair[1],
    reverse=True,  # stable, so tied importances keep their argsort order
)

In [16]:
# Retrain every candidate on only the two features the decision tree ranked
# highest, and report held-out accuracy again.
#
# Nothing defined by earlier cells is overwritten here:
#   * `imp_f` stays the ranked list of (feature, importance) pairs. Rebinding
#     it to a list of names made a second run of this cell index into the
#     name strings themselves and fail with a KeyError.
#   * `X_train` / `X_test` keep all their columns; the narrowed frames get
#     their own names.
#   * `models` keeps the estimators fitted on all features; unfitted copies
#     with the same hyper-parameters are trained instead.
top_features = [feature for feature, _ in imp_f[:2]]

X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# clone() returns a new unfitted estimator carrying the same parameters.
models_top = {name: clone(model) for name, model in models.items()}

for name, model in models_top.items():
    model.fit(X_train_top, y_train)

    y_test_pred = model.predict(X_test_top)
    accuracy = accuracy_score(y_test, y_test_pred)

    print(f"{name}. Accuracy: {accuracy}")

Logistic Regression. Accuracy: 0.8810240963855421
Decision Tree. Accuracy: 0.8810240963855421
XGBoost. Accuracy: 0.8810240963855421
