In [7]:
import pandas as pd
from paths_titanic import get_project_path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import LabelEncoder

from train import fit_model
from metric import metric_model

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
base_path = get_project_path()
df = pd.read_csv('titanic.csv')

In [3]:
df["fam_size"] = df["SibSp"] + df["Parch"]
df["Sex"] = LabelEncoder().fit_transform(df['Sex'])
df["Embarked"] = LabelEncoder().fit_transform(df['Embarked'])

titles = ("Capt.","Col.","Major.","Sir.","Lady.","Rev.","Dr.","Don.","Jonkheer.","Countess.","Mrs.","Ms.","Mr.","Mme.","Mlle.","Miss.","Master.")

titled_names = []
for name in df.Name:
    for title in titles:
        if title in name.split(' '):
            titled_names.append(title)
            break


df.insert(12, 'Title', titled_names)

df = df.drop(['Cabin', 'Name', 'Ticket'], axis=1 )
df["Title"] = LabelEncoder().fit_transform(df['Title'])
df = df.fillna(df.mean())

In [4]:
X = df.drop('Survived', axis = 1)
survived = df['Survived']
X_train, X_test, survived_train, survived_test = train_test_split(X, survived, test_size=0.2, random_state=500, stratify=df["Survived"])

scaler = MinMaxScaler()
X_train_normal = scaler.fit_transform(X_train)
X_test_normal = scaler.transform(X_test)

In [5]:
models ={
    'LR': LogisticRegression(solver='lbfgs', max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'RF': RandomForestClassifier(),
    'SVC': SVC(),
    'GB': GradientBoostingClassifier(),
    'XGB': XGBClassifier(subsample=1, n_estimators=100, colsample_bytree=1, min_child_weight=1, max_depth=6, learning_rate=0.3),
    'CB': CatBoostClassifier(iterations=1000, learning_rate=0.3, depth=6, verbose=0)
}

In [8]:
for name, model in models.items():
    trained_model = fit_model(model, X_train_normal, survived_train)
    f1 = metric_model(trained_model,X_test_normal, survived_test)
    print(f'F1 for {name}: {f1 * 100:.2f}%')

F1 for LR: 83.43%
F1 for KNN: 82.21%
F1 for RF: 82.75%
F1 for SVC: 85.47%
F1 for GB: 83.86%
F1 for XGB: 83.24%
F1 for CB: 81.67%
