In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [113]:
#carregar base de dados
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [114]:
#salvar os índices dos datasets
train_idx = train.shape[0]
test_idx = test.shape[0]

#salvar PassengerId para submissao
passengerId = test['PassengerId']

#extrair coluna 'Survived' e exclui-la do dataset treino
target = train.Survived.copy()
train.drop(['Survived'], axis=1, inplace=True)

# concatenar treino e teste em um único DataFrame
df_merged = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

print("df_merged.shape: ({} x {})".format(df_merged.shape[0], df_merged.shape[1]))

df_merged.shape: (1309 x 11)


In [115]:
#removendo informações que não serão utilizadas
df_merged.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df_merged.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1046.0,1309.0,1309.0,1308.0
mean,2.294882,29.881138,0.498854,0.385027,33.295479
std,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.17,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,39.0,1.0,0.0,31.275
max,3.0,80.0,8.0,9.0,512.3292


In [116]:
#verificar valores faltantes
df_merged.isnull().sum()

Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
dtype: int64

In [0]:
#completar idades faltantes com o valor da mediana
age_median = df_merged['Age'].median()
df_merged['Age'].fillna(age_median, inplace=True)

#completar tarifas faltantes com o valor da mediana
fare_median = df_merged['Fare'].median()
df_merged['Fare'].fillna(fare_median, inplace=True)

#completar embarque com valor de maior frequência
embarked_top = df_merged['Embarked'].value_counts()[0]
df_merged['Embarked'].fillna(embarked_top, inplace=True)

In [24]:
#converter sexo em 0 e 1
df_merged['Sex'] = df_merged['Sex'].map({'male': 0, 'female': 1})

#variáveis dummie
embarked_dummies = pd.get_dummies(df_merged['Embarked'], prefix='Embarked')
df_merged = pd.concat([df_merged, embarked_dummies], axis=1)
df_merged.drop('Embarked', axis=1, inplace=True)

display(df_merged.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_914,Embarked_C,Embarked_Q,Embarked_S
0,3,0,22.0,1,0,7.25,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,0,0
2,3,1,26.0,0,0,7.925,0,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,3,0,35.0,0,0,8.05,0,0,0,1


In [0]:
#recuperar datasets de treino e teste
train = df_merged.iloc[:train_idx]
test = df_merged.iloc[train_idx:]

In [72]:
#LINEAR
from sklearn import linear_model
linear_model = linear_model.Ridge(alpha=.5)
linear_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(linear_model.score(train, target) * 100, 2)
print("Accuracy Linear Model: {}".format(acc_logReg))

y_pred_linear = linear_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_linear
})

submission.to_csv('submission_linear.csv', index=False)

Accuracy Linear Model: 39.79


In [71]:
#SVM

from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(svm_model.score(train, target) * 100, 2)
print("Accuracy SVM: {}".format(acc_logReg))

y_pred_svm = svm_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_svm
})

submission.to_csv('submission_svm.csv', index=False)

Accuracy SVM: 68.46


In [85]:
#SGD
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
sgd_model.fit(train, target)
SGDClassifier(max_iter=5)

#acurácia do modelo
acc_logReg = round(sgd_model.score(train, target) * 100, 2)
print("Accuracy SGD: {}".format(acc_logReg))

y_pred_sgd = sgd_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_sgd
})

submission.to_csv('./submission_sgd.csv', index=False)

Acurácia do modelo de Regressão Logística: 64.87




In [93]:
#NCA
from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)
from sklearn.pipeline import Pipeline

nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(train, target)

#acurácia do modelo
acc_logReg = round(nca_pipe.score(train, target) * 100, 2)
print("Accuracy NCA: {}".format(acc_logReg))

y_pred_nca = nca_pipe.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_nca
})

submission.to_csv('./submission_nca.csv', index=False)

Accuracy SGD: 87.77


In [84]:
#NEAREST CENTROID
from sklearn.neighbors import NearestCentroid

nc_model = NearestCentroid()
nc_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(nc_model.score(train, target) * 100, 2)
print("Accuracy Nearest Centroid: {}".format(acc_logReg))

y_pred_nc = nc_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_nc
})

submission.to_csv('./submission_nc.csv', index=False)

Acurácia do modelo de Regressão Logística: 66.89


In [92]:
#GAUSSIANNB
from sklearn.naive_bayes import GaussianNB
gnb_model = GaussianNB()
gnb_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(gnb_model.score(train, target) * 100, 2)
print("Accuracy GaussianNB: {}".format(acc_logReg))

y_pred_gnb = gnb_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_gnb
})

submission.to_csv('./submission_gnb.csv', index=False)

Accuracy GaussianNB: 68.46


In [110]:
#DECISION TREE CLASSIFIER
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier(max_depth=3)
tree_model.fit(train, target)

#acurácia do modelo
acc_tree = round(tree_model.score(train, target) * 100, 2)
print("Accuracy Decision Tree Classifier: {}".format(acc_tree))

y_pred_tree = tree_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_tree
})

submission.to_csv('./submission_tree.csv', index=False)

Accuracy Decision Tree Classifier: 82.72


In [95]:
#DECISION TREE REGRESSOR
from sklearn.tree import DecisionTreeRegressor
tree_regressor_model = DecisionTreeRegressor(max_depth=3)
tree_regressor_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(tree_regressor_model.score(train, target) * 100, 2)
print("Accuracy Decision Tree Regressor: {}".format(acc_logReg))

y_pred_tree = tree_regressor_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_tree
})

submission.to_csv('./submission_tree.csv', index=False)

Accuracy Decision Tree Regressor: 46.95


In [109]:
#RANFOM FOREST
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=10)
rf_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(rf_model.score(train, target) * 100, 2)
print("Accuracy Random Forest: {}".format(acc_logReg))

y_pred_rf = rf_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_rf
})

submission.to_csv('./submission_rf.csv', index=False)

Accuracy Random Forest: 96.41


In [99]:
#Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls')
gbr_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(gbr_model.score(train, target) * 100, 2)
print("Accuracy Gradient Boosting Regressor: {}".format(acc_logReg))

y_pred_gbr = gbr_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_gbr
})

submission.to_csv('./submission_gbr.csv', index=False)

Accuracy Gradient Boosting Regressor: 42.43


In [101]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbrc_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gbrc_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(gbrc_model.score(train, target) * 100, 2)
print("Accuracy Gradient Boosting Classifier: {}".format(acc_logReg))

y_pred_gbrc = gbrc_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_gbrc
})

submission.to_csv('./submission_gbrc.csv', index=False)

Accuracy Gradient Boosting Classifier: 85.52


In [111]:
#Hist Gradient Boosting Classifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
hgb_model = HistGradientBoostingClassifier(max_iter=100)
hgb_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(hgb_model.score(train, target) * 100, 2)
print("Accuracy Hist Gradient Boosting: {}".format(acc_logReg))

y_pred_hgb = hgb_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_hgb
})

submission.to_csv('./submission_hgb.csv', index=False)

Accuracy Hist Gradient Boosting: 93.94


In [108]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(train, target)

#acurácia do modelo
acc_logReg = round(lr_model.score(train, target) * 100, 2)
print("Accuracy Logistic Regression: {}".format(acc_logReg))

y_pred_lr = lr_model.predict(test)

submission = pd.DataFrame({
    "PassengerId": passengerId,
    "Survived": y_pred_lr
})

submission.to_csv('submission_lr.csv', index=False)

Accuracy Logistic Regression: 80.13
