In [36]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [37]:
def set_seed(seed_value):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed_value : int
        Value passed to ``random.seed`` and ``np.random.seed``; its string
        form is stored in ``os.environ["PYTHONHASHSEED"]``.
    """
    # The running interpreter reads PYTHONHASHSEED only at start-up, so this
    # assignment is inherited by child processes but does not change hashing
    # in the current kernel.
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed_value)


# Seed reused as ``random_state`` by the model cells further down.
SEED = 42
set_seed(SEED)

In [38]:
# Load the Titanic table; the path is relative to the notebook's working
# directory.
df = pd.read_csv('titanic_numerical.csv')

# Feature matrix: everything except the target and the columns this
# experiment leaves out.
X = df.drop(
    columns=[
        'Survived', 'PassengerId', 'Age', 'Name', 'SibSp', 'Parch',
        'Ticket', 'Cabin', 'Embarked', 'Fare',
    ],
)
y = df['Survived']

# Hold out 40% of the rows for the final evaluation. `train_test_split` is
# already imported in the first cell, and the split reuses the notebook-wide
# SEED instead of repeating the literal 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=SEED
)

In [39]:
# Best mean cross-validated accuracy per model, keyed by model label; the
# model cells fill this in.
cross_valid_scores = dict()
# [label, best mean cross-validated accuracy] pairs appended by the model cells.
scores_tuple = list()

<a id="2"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Decision Tree</center></h2>

In [40]:
%%time
# Tree depths compared by the grid search.
parameters = {"max_depth": [3, 5, 7, 9, 11, 13]}

# 5-fold grid search scored by accuracy over a class-weight-balanced tree
# seeded with the notebook-wide SEED.
model_desicion_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(class_weight='balanced', random_state=SEED),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
model_desicion_tree.fit(X_train, y_train)

# Record the winning score under the same label in both collections.
scores_tuple.append(['desicion_tree', model_desicion_tree.best_score_])
cross_valid_scores['desicion_tree'] = model_desicion_tree.best_score_

print('-----')
print(f'Best parameters {model_desicion_tree.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_desicion_tree.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'max_depth': 3}
Mean cross-validated accuracy score of the best_estimator: 0.805
-----
CPU times: user 100 ms, sys: 14 ms, total: 114 ms
Wall time: 129 ms


<a id="3"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Random Forest</center></h2>

In [41]:
%%time
# Forest sizes and tree depths compared by the grid search.
parameters = {
    "n_estimators": [5, 10, 15, 20, 25],
    "max_depth": [3, 5, 7, 9, 11, 13],
}

# 5-fold grid search scored by accuracy over a class-weight-balanced forest
# seeded with the notebook-wide SEED.
model_random_forest = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=SEED),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
model_random_forest.fit(X_train, y_train)

# Record the winning score under the same label in both collections.
scores_tuple.append(['random_forest', model_random_forest.best_score_])
cross_valid_scores['random_forest'] = model_random_forest.best_score_

print('-----')
print(f'Best parameters {model_random_forest.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_random_forest.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'max_depth': 7, 'n_estimators': 10}
Mean cross-validated accuracy score of the best_estimator: 0.807
-----
CPU times: user 3.23 s, sys: 36.3 ms, total: 3.27 s
Wall time: 3.31 s


<a id="4"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>AdaBoost</center></h2>

In [42]:
%%time
# Ensemble sizes and learning rates compared by the grid search.
parameters = {
    "n_estimators": [5, 10, 15, 20, 25, 50, 75, 100],
    "learning_rate": [0.001, 0.01, 0.1, 1.],
}

# 5-fold grid search scored by accuracy over an AdaBoost classifier seeded
# with the notebook-wide SEED.
model_adaboost = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=SEED),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
model_adaboost.fit(X_train, y_train)

# Record the winning score under the same label in both collections.
scores_tuple.append(['ada_boost', model_adaboost.best_score_])
cross_valid_scores['ada_boost'] = model_adaboost.best_score_

print('-----')
print(f'Best parameters {model_adaboost.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_adaboost.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'learning_rate': 0.1, 'n_estimators': 75}
Mean cross-validated accuracy score of the best_estimator: 0.805
-----
CPU times: user 7.85 s, sys: 53.6 ms, total: 7.91 s
Wall time: 7.94 s


<a id="5"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>XGBoost</center></h2>

In [44]:
%%time
# Tree depths, boosting rounds and learning rates compared by the grid search.
parameters = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
}

# 5-fold grid search scored by accuracy over an XGBoost classifier seeded
# with the notebook-wide SEED.
model_xgb = GridSearchCV(
    estimator=xgb.XGBClassifier(use_label_encoder=False, random_state=SEED),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
model_xgb.fit(X_train, y_train)

# Record the winning score under the same label in both collections.
scores_tuple.append(['xgboost', model_xgb.best_score_])
cross_valid_scores['xgboost'] = model_xgb.best_score_

print('-----')
print(f'Best parameters {model_xgb.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_xgb.best_score_:.3f}'
)
print('-----')

























-----
Best parameters {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
Mean cross-validated accuracy score of the best_estimator: 0.820
-----
CPU times: user 1min 45s, sys: 3.85 s, total: 1min 49s
Wall time: 9.85 s


<a id="6"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>LightGBM</center></h2>

In [9]:
%%time
# Boosting rounds, learning rates and leaf counts compared by the grid search.
parameters = {
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [7, 15, 31],
}

# 5-fold grid search scored by accuracy over a class-weight-balanced
# LightGBM classifier seeded with the notebook-wide SEED.
model_lgbm = GridSearchCV(
    estimator=lgbm.LGBMClassifier(class_weight='balanced', random_state=SEED),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)

model_lgbm.fit(
    X_train,
    y_train,
    #categorical_feature=categorical_columns
)

print('-----')
print(f'Best parameters {model_lgbm.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_lgbm.best_score_:.3f}'
)
cross_valid_scores['lightgbm'] = model_lgbm.best_score_
# Fix: this cell used to append the XGBoost label and score here, so
# scores_tuple held a duplicate XGBoost row and no LightGBM row.
scores_tuple.append(['lightgbm', model_lgbm.best_score_])

print('-----')

-----
Best parameters {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 7}
Mean cross-validated accuracy score of the best_estimator: 0.798
-----
CPU times: user 43.5 s, sys: 915 ms, total: 44.4 s
Wall time: 3.88 s


<a id="7"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>CatBoost</center></h2>

In [10]:
%%time
# Iteration counts, learning rates and tree depths compared by the grid search.
parameters = {
    'iterations': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7, 9, 11, 13],
}

# 5-fold grid search scored by accuracy over a CatBoost classifier with
# training logs silenced.
# NOTE(review): unlike the other estimators in this notebook, no seed is
# passed here - confirm CatBoost's default seeding is acceptable.
model_catboost = GridSearchCV(
    estimator=cb.CatBoostClassifier(verbose=False),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
model_catboost.fit(X_train, y_train)

# Record the winning score under the same label in both collections.
scores_tuple.append(['catboost', model_catboost.best_score_])
cross_valid_scores['catboost'] = model_catboost.best_score_

print('-----')
print(f'Best parameters {model_catboost.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_catboost.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'depth': 7, 'iterations': 100, 'learning_rate': 0.05}
Mean cross-validated accuracy score of the best_estimator: 0.820
-----
CPU times: user 1min 2s, sys: 5.39 s, total: 1min 7s
Wall time: 12.8 s


<a id="8"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Logistic Regression</center></h2>

In [11]:
%%time
# Inverse regularisation strengths and penalty types compared by the grid search.
parameters = {
    "C": [0.001, 0.01, 0.1, 1.],
    "penalty": ["l1", "l2"],
}

# 5-fold grid search scored by accuracy over a class-weight-balanced logistic
# regression using the liblinear solver and the notebook-wide SEED.
model_logistic_regression = GridSearchCV(
    estimator=LogisticRegression(
        solver="liblinear",
        class_weight="balanced",
        random_state=SEED,
    ),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
model_logistic_regression.fit(X_train, y_train)

# Record the winning score under the same label in both collections.
scores_tuple.append(['logistic_regression', model_logistic_regression.best_score_])
cross_valid_scores['logistic_regression'] = model_logistic_regression.best_score_

print('-----')
print(f'Best parameters {model_logistic_regression.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_logistic_regression.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'C': 0.1, 'penalty': 'l2'}
Mean cross-validated accuracy score of the best_estimator: 0.798
-----
CPU times: user 133 ms, sys: 2.5 ms, total: 136 ms
Wall time: 135 ms


<a id="9"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>SVC</center></h2>

In [12]:
%%time
# Regularisation strengths, kernels and gamma settings compared by the grid search.
parameters = {
    "C": [0.001, 0.01, 0.1, 1.],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],
}

# 5-fold grid search scored by accuracy over a class-weight-balanced SVC
# seeded with the notebook-wide SEED. probability=True is kept so that
# predict_proba stays available on the fitted model.
model_svc = GridSearchCV(
    estimator=SVC(
        random_state=SEED,
        class_weight="balanced",
        probability=True,
    ),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)

model_svc.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_svc.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_svc.best_score_:.3f}'
)
cross_valid_scores['svc'] = model_svc.best_score_
# Fix: the other model cells record their score in scores_tuple as well;
# this cell previously skipped that step.
scores_tuple.append(['svc', model_svc.best_score_])
print('-----')

-----
Best parameters {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
Mean cross-validated accuracy score of the best_estimator: 0.805
-----
CPU times: user 3.3 s, sys: 9.19 ms, total: 3.31 s
Wall time: 3.32 s


<a id="10"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>K-Nearest Neighbors</center></h2>

In [13]:
%%time
# Neighbour weighting schemes compared by the grid search.
parameters = {
    "weights": ["uniform", "distance"],
}

# 5-fold grid search scored by accuracy over a k-nearest-neighbours
# classifier with otherwise default settings.
model_k_neighbors = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)

model_k_neighbors.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_k_neighbors.best_params_}')
print(
    'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_k_neighbors.best_score_:.3f}'
)
cross_valid_scores['k_neighbors'] = model_k_neighbors.best_score_
# Fix: the other model cells record their score in scores_tuple as well;
# this cell previously skipped that step.
scores_tuple.append(['k_neighbors', model_k_neighbors.best_score_])
print('-----')

-----
Best parameters {'weights': 'distance'}
Mean cross-validated accuracy score of the best_estimator: 0.813
-----
CPU times: user 57 ms, sys: 3.21 ms, total: 60.2 ms
Wall time: 57.8 ms


<a id="100"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Melhor Modelo</center></h2>

In [27]:
# Ranking table: one row per model label with its best mean cross-validated
# accuracy, highest score first.
df_model = (
    pd.DataFrame(cross_valid_scores, index=['cross_valid_score'])
    .T
    .reset_index()
    .sort_values(by='cross_valid_score', ascending=False)
)
df_model

Unnamed: 0,index,cross_valid_score
5,catboost,0.820349
3,xgboost,0.820314
8,k_neighbors,0.812784
1,random_forest,0.807194
0,desicion_tree,0.805343
2,ada_boost,0.805325
7,svc,0.805325
6,logistic_regression,0.797849
4,lightgbm,0.797831


# Escolhendo o melhor modelo

In [34]:
# Score the three models that ranked highest in cross-validation on the
# held-out split. `accuracy_score` is already imported in the first cell, so
# the duplicate import is gone; each output line now names its model, because
# the three lines were otherwise indistinguishable.
for model_name, fitted_search in (
    ('catboost', model_catboost),
    ('xgboost', model_xgb),
    ('k_neighbors', model_k_neighbors),
):
    y_pred_test = fitted_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_test) * 100
    print("{}: A acurácia foi de {:.2f}%.".format(model_name, accuracy))

A acurácia foi de 80.11%.
A acurácia foi de 80.11%.
A acurácia foi de 81.23%.


In [15]:
# def create_submission(model, X_test, test_passenger_id, model_name):
#     y_pred_test = model.predict_proba(X_test)[:, 1]
#     submission = pd.DataFrame(
#         {
#             'PassengerId': test_passenger_id, 
#             'Survived': (y_pred_test >= 0.5).astype(int),
#         }
#     )
#     submission.to_csv(f"submission_{model_name}.csv", index=False)
    
#     return y_pred_test

In [16]:
# X_test =  X_test

In [17]:
# test_pred_decision_tree = create_submission(
#     model_desicion_tree, X_test, y_test, "decision_tree"
# )
# test_pred_random_forest = create_submission(
#     model_random_forest, X_test, y_test, "random_forest"
# )
# test_pred_adaboost = create_submission(
#     model_adaboost, X_test, y_test, "adaboost"
# )
# test_pred_xgboost = create_submission(
#     model_xgb, X_test, y_test, "xgboost"
# )
# test_pred_lightgbm = create_submission(
#     model_lgbm, X_test, y_test, "lightgbm"
# )
# test_pred_catboost = create_submission(
#     model_catboost, X_test, y_test, "catboost"
# )
# test_pred_logistic_regression = create_submission(
#     model_logistic_regression, X_test, y_test, "logistic_regression"
# )
# test_pred_svc = create_submission(
#     model_svc, X_test, y_test, "svc"
# )
# test_pred_k_neighbors = create_submission(
#     model_k_neighbors, X_test, y_test, "k_neighbors"
# )

In [18]:
# test_pred_merged = (
#     test_pred_decision_tree + 
#     test_pred_random_forest + 
#     test_pred_adaboost +
#     test_pred_xgboost + 
#     test_pred_lightgbm + 
#     test_pred_catboost +
#     test_pred_logistic_regression + 
#     test_pred_svc +
#     test_pred_k_neighbors
# )
# test_pred_merged = np.round(test_pred_merged / 9)

In [19]:
# submission = pd.DataFrame(
#     {
#         'PassengerId': test_df["PassengerId"], 
#         'Survived': test_pred_merged.astype(int),
#     }
# )
# submission.to_csv(f"submission_merged.csv", index=False)