# Titanic survival prediction (・ω・)

In [73]:
# code completion
%config Completer.use_jedi = False

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


In [74]:
# load data

import os

# Show every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# Read the training and test tables into DataFrames.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")


In [75]:
# data formatting

# Impute numeric gaps with statistics taken from the training set only, so the
# same feature receives the same fill value in both frames.
age_fill = train_data["Age"].mean()
fare_fill = train_data["Fare"].mean()
train_data["Age"] = train_data["Age"].fillna(age_fill)
test_data["Age"] = test_data["Age"].fillna(age_fill)
test_data["Fare"] = test_data["Fare"].fillna(fare_fill)

# Fill missing cabins in BOTH frames before building the mapping; otherwise a
# NaN from the test frame becomes a mapping key and cannot be sorted.
train_data["Cabin"] = train_data["Cabin"].fillna("z")
test_data["Cabin"] = test_data["Cabin"].fillna("z")

# Sort the union so each cabin gets the same integer code on every run
# (iteration order of a set of strings can change between kernel sessions).
cabin_mapping = {
    cabin: idx
    for idx, cabin in enumerate(
        sorted(set(train_data["Cabin"]) | set(test_data["Cabin"]))
    )
}
train_data["Cabin"] = train_data["Cabin"].map(cabin_mapping)
test_data["Cabin"] = test_data["Cabin"].map(cabin_mapping)

# Family size = siblings/spouses + parents/children + the passenger.
train_data["FamilySize"] = train_data["SibSp"] + train_data["Parch"] + 1
test_data["FamilySize"] = test_data["SibSp"] + test_data["Parch"] + 1
display(train_data.head())

Y_train = train_data["Survived"]
features = ["Pclass", "Age", "Sex", "Fare", "Cabin", "FamilySize"]
X_train = pd.get_dummies(train_data[features])
# Align the test columns to the training columns so both matrices share the
# same layout even if a category is absent from one of the frames.
X_test = pd.get_dummies(test_data[features]).reindex(
    columns=X_train.columns, fill_value=0
)

# Fail here, with a clear message, rather than inside an estimator later on.
assert not X_train.isnull().any().any(), "X_train still contains missing values"
assert not X_test.isnull().any().any(), "X_test still contains missing values"


In [82]:
# create the candidate models

# One fixed seed for every estimator that accepts `random_state`, so the
# scores and the selected model are the same on every run.
RANDOM_STATE = 42

models = {
    "logistic regression": LogisticRegression(max_iter=100000),
    "support vector machines": SVC(),
    "k neighbors classifier": KNeighborsClassifier(n_neighbors=3),
    "gaussian naive bayes": GaussianNB(),
    "perceptron": Perceptron(random_state=RANDOM_STATE),
    "linear svc": LinearSVC(max_iter=100000, random_state=RANDOM_STATE),
    "stochastic gradient descent classifier": SGDClassifier(random_state=RANDOM_STATE),
    "decision tree classifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "random forest classifier": RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ),
}


In [83]:
# fit each model, estimate its accuracy by cross-validation, and pick the best

# Accuracy measured on the same rows a model was fitted on rewards
# memorisation, so candidates are ranked by 5-fold cross-validated accuracy
# (in percent) instead of by their training-set score.
max_acc = 0
best_model_name = None  # stays None only if no model scores above 0
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, Y_train, cv=5)
    acc = round(cv_scores.mean() * 100, 3)
    print(f"[{name}] / acc:{acc}")
    # cross_val_score works on clones; fit the original too so every entry of
    # `models` is trained on the full training set after this cell.
    model.fit(X_train, Y_train)
    if max_acc < acc:
        max_acc = acc
        best_model_name = name


In [84]:
# build and save the submission file

print("\n> best model name:", best_model_name)
print("> max acc:", max_acc)

# Refit the selected estimator on the training data, then predict the test set.
best_model = models[best_model_name]
best_model.fit(X_train, Y_train)
predictions = best_model.predict(X_test)

# One row per test passenger: its id and the predicted label.
submission_columns = {
    "PassengerId": test_data["PassengerId"],
    "Survived": predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv("submission.csv", index=False)
print("> submission was successfully saved.")
