In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. 데이터 읽기

In [2]:
df_train = pd.read_csv("/kaggle/input/titanic/train.csv")

In [3]:
df_train

In [4]:
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_test

In [5]:
df_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
df_submission

In [6]:
df_submission["Survived"] = 1

In [7]:
df_submission

In [8]:
df_submission.to_csv("submission1.csv", index = False)

## 2.1 EDA

In [9]:
# Row counts per Survived label (0 first, then 1), shown as a 2-tuple.
tuple(len(df_train[df_train["Survived"] == label]) for label in (0, 1))

In [10]:
df_train["Survived"].value_counts()

In [11]:
import seaborn as sns
sns.countplot(x = "Survived", data = df_train)

## 2.2 EDA 2

In [12]:
sns.barplot(x = "Sex", y = "Survived", data = df_train)

In [13]:
sns.barplot(x = "Pclass", y = "Survived", data = df_train)

## 2.3 features

In [14]:
df_train.isnull().sum()

In [15]:
df_train.info()

In [16]:
# Print every column label of the training frame, one per line.
for column_label in df_train.columns:
    print(column_label)

In [17]:
df_train

In [18]:
# Target column, and the columns excluded from the model inputs
# (the exclusion list deliberately contains the target itself).
output_features = ["Survived"]
useless_features = ["PassengerId", "Survived", "Name", "Ticket", "Cabin"]

# Keep df_train's column order while dropping the excluded columns.
input_features = [col for col in df_train if col not in useless_features]

# .copy() makes X an independent frame, so the later column assignments
# (X["Sex"] = ...) do not raise pandas' SettingWithCopyWarning.
X, y = df_train[input_features].copy(), df_train[output_features]

In [19]:
X

In [20]:
# Encode the two string columns as integer category codes.
# .assign() returns a new frame instead of writing into a slice of df_train,
# which avoids pandas' SettingWithCopyWarning. Column order is unchanged.
# Note: .cat.codes encodes missing values as -1.
X = X.assign(
    Sex=X["Sex"].astype("category").cat.codes,
    Embarked=X["Embarked"].astype("category").cat.codes,
)

In [21]:
X

In [22]:
X.info()

## 3. 결측치

In [23]:
X = X.fillna(-1)

In [24]:
X.info()

In [25]:
from sklearn.model_selection import train_test_split

# Fixed seed: the same split (and therefore comparable scores) on every
# Restart & Run All. stratify=y keeps the class ratio of the target equal in
# the training and hold-out partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [26]:
len(X_train), len(X_test)

## 4.1 Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
tree = DecisionTreeClassifier()

In [29]:
tree

In [30]:
tree.fit(X_train, y_train) # train

In [31]:
def summarize_classification_result(model, X, y):
    """Print and plot evaluation metrics for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature rows to predict on.
    y : true labels for ``X``.

    Prints the accuracy, the classification report and the confusion matrix,
    then draws the confusion matrix as an annotated heatmap. Returns None.
    """
    y_pred = model.predict(X)
    # Computed once and reused for both the printout and the heatmap.
    cm = confusion_matrix(y, y_pred)

    print(accuracy_score(y, y_pred))
    print(classification_report(y, y_pred))
    print(cm)
    # fmt="d" shows the integer counts as-is; seaborn's default ".2g" format
    # would render a count such as 105 as "1e+02".
    sns.heatmap(cm, annot=True, fmt="d")


summarize_classification_result(tree, X_test, y_test)

In [32]:
X.columns

In [33]:
import matplotlib.pyplot as plt
# Column labels of X; reused by the later importance plots in this notebook.
feature_names = X.columns

# Bar chart of the fitted decision tree's per-feature importance.
plt.bar(x = feature_names, height = tree.feature_importances_)
plt.ylabel("importance")
plt.title("DecisionTreeClassifier feature importances");

In [34]:
from sklearn.tree import plot_tree

# plot_tree(tree, feature_names = feature_names)
# plt.show()

## 5.1 hyperparameter

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
# Search space for the decision tree: split criterion x leaf-count cap.
criterion = ["gini", "entropy"]
max_leaf_nodes = range(4, 12)
params = dict(max_leaf_nodes=max_leaf_nodes, criterion=criterion)

# 5-fold cross-validated grid search scored on accuracy, using all cores.
tree_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
tree_grid.fit(X_train, y_train)

In [37]:
tree_grid.best_estimator_

In [38]:
tree_grid.best_params_

In [39]:
summarize_classification_result(tree_grid.best_estimator_, X_test, y_test)

## 6. Ensemble

### random forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default random forest, report its hold-out metrics, then chart its
# feature importances.
forest = RandomForestClassifier().fit(X_train, y_train)
summarize_classification_result(forest, X_test, y_test)
plt.bar(height = forest.feature_importances_, x = feature_names)

In [41]:
# Search space for the random forest. (The former `n_jobs = [-1]` list was
# never added to the grid or used anywhere, so it has been dropped;
# parallelism is set on GridSearchCV itself.)
max_leaf_nodes = range(10, 20)
criterion = ["gini", "entropy"]
n_estimators = range(100, 200, 20)

params = {
    "max_leaf_nodes": max_leaf_nodes,
    "criterion": criterion,
    "n_estimators": n_estimators,
}

# 5-fold cross-validated grid search scored on accuracy, using all cores.
forest_grid = GridSearchCV(
    RandomForestClassifier(),
    params,
    cv=5,
    n_jobs=-1,
    verbose=0,
    scoring="accuracy",
)
forest_grid.fit(X_train, y_train)
print(f"best parameters: {forest_grid.best_params_}")
summarize_classification_result(forest_grid.best_estimator_, X_test, y_test)

### Boosting

In [42]:
import xgboost
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier

In [43]:
# Default XGBoost classifier: fit, report hold-out metrics, chart importances.
xgb_model = XGBClassifier().fit(X_train, y_train)
summarize_classification_result(xgb_model, X_test, y_test)
plt.bar(height = xgb_model.feature_importances_, x = feature_names)

In [44]:
# Default LightGBM classifier: fit, then report hold-out metrics.
lgbm_model = LGBMClassifier().fit(X_train, y_train)
summarize_classification_result(lgbm_model, X_test, y_test)

In [45]:
plt.bar(x = feature_names, height = lgbm_model.feature_importances_)

In [46]:
learning_rates = [0.01, 0.1]
n_estimators = [5, 10]
gamma = [0, 0.1]
max_leaves = [10]

# Keys must be XGBClassifier's own parameter names. The previous keys
# "learning_rates", "max_leaf_nodes" and "criterion" are not XGBoost
# parameters, so the search never actually varied the learning rate or
# limited the number of leaves. "max_leaves" is XGBoost's leaf-count limit;
# "criterion" has no XGBoost counterpart and is dropped (it had a single
# value, so the number of candidates is unchanged).
params = {
    "learning_rate": learning_rates,
    "n_estimators": n_estimators,
    "gamma": gamma,
    "max_leaves": max_leaves,
}

# 5-fold cross-validated grid search scored on accuracy, using all cores.
xgb_grid = GridSearchCV(XGBClassifier(), params, cv=5, n_jobs = -1, verbose = 1, scoring = "accuracy")
xgb_grid.fit(X_train, y_train)
print("best parameters: " + str(xgb_grid.best_params_))

In [47]:
summarize_classification_result(xgb_grid.best_estimator_, X_test, y_test)

In [48]:
learning_rates = [0.01, 0.1]
n_estimators = [50, 100]
lambda_l1 = [0, 0.1]
num_leaves = [10]

# Keys use LGBMClassifier's own constructor names:
#   - "learning_rate" (the previous "learning_rates" is not a LightGBM
#     parameter, so the learning rate was never actually tuned),
#   - "reg_alpha" is the scikit-learn-API name of LightGBM's lambda_l1,
#   - "num_leaves" is LightGBM's leaf-count limit.
# "criterion" is not a LightGBM parameter; it only doubled the number of
# candidates without changing the model, so it is dropped.
params = {
    "learning_rate": learning_rates,
    "n_estimators": n_estimators,
    "reg_alpha": lambda_l1,
    "num_leaves": num_leaves,
}

# 5-fold cross-validated grid search scored on accuracy, using all cores.
lgbm_grid = GridSearchCV(LGBMClassifier(), params, cv=5, n_jobs = -1, verbose = 0, scoring = "accuracy")
lgbm_grid.fit(X_train, y_train)
print("best parameters: " + str(lgbm_grid.best_params_))

In [49]:
summarize_classification_result(lgbm_grid.best_estimator_, X_test, y_test)

### voting classifier

In [50]:
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [58]:
# Base estimators for the voting ensembles below (all with default settings
# except where noted).
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier()
clf3 = XGBClassifier()
clf4 = LGBMClassifier()
# The features are not scaled anywhere in this notebook, so the solver may
# need more than the default 100 iterations to converge; a higher cap avoids
# a ConvergenceWarning and a half-fitted member in the ensemble.
clf5 = LogisticRegression(max_iter=1000)

In [59]:
# The same five (name, estimator) pairs feed both ensembles; each ensemble
# receives its own list object.
voting_members = [
    ("tree", clf1),
    ("forest", clf2),
    ("xgb", clf3),
    ("lgbm", clf4),
    ("lr", clf5),
]
hardvoting = VotingClassifier(voting="hard", estimators=list(voting_members))
softvoting = VotingClassifier(voting="soft", estimators=list(voting_members))

In [60]:
hardvoting.fit(X_train, y_train)

In [61]:
summarize_classification_result(hardvoting, X_test, y_test)

In [63]:
softvoting.fit(X_train, y_train)
summarize_classification_result(softvoting, X_test, y_test)

In [64]:
my_model = lgbm_grid.best_estimator_

In [65]:
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

# .copy(): work on an independent frame so the column writes below do not
# target a slice of df_test (avoids pandas' SettingWithCopyWarning).
submission_X = df_test[input_features].copy()

# Encode with the category -> code mapping of the TRAINING data. Encoding the
# test columns on their own would silently renumber the codes whenever the set
# of values present in test differs from train. Values not seen in training
# (and missing values) get code -1.
for column in ["Sex", "Embarked"]:
    train_categories = df_train[column].astype("category").cat.categories
    submission_X[column] = pd.Categorical(
        submission_X[column], categories=train_categories
    ).codes

# Same missing-value sentinel as used for the training features.
submission_X = submission_X.fillna(-1)
submission_y = my_model.predict(submission_X)

In [66]:
submission_X

In [67]:
submission_y

In [68]:
df_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [69]:
df_submission["Survived"] = submission_y

In [70]:
df_submission

In [71]:
df_submission.to_csv("submission2.csv", index = False)