In [1]:
import numpy as np
import pandas as pd 



In [2]:
# Leitura dos dados
dados_treino = pd.read_csv(r'dados\train.csv')
dados_teste = pd.read_csv(r'dados\test.csv')

In [3]:
# Verificando ausência de dados
dados_treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
dados_teste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# É possível notar que tanto na base de dados de treino quanto de teste existem dados faltantes para as colunas Age, e especificamente na base de treino existem dados faltantes para a coluna Embarked, e Fare para os dados de treino
# Tratamento de dados faltantes para a coluna Age
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
# Tratamento de dados categóricos realizados pelo mais frequente
imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')


In [6]:
dados_treino['Age']=imp.fit_transform(dados_treino['Age'].values.reshape(-1,1))
dados_treino['Embarked'] = imp_cat.fit_transform(dados_treino['Embarked'].values.reshape(-1,1))
dados_teste['Age']=imp.fit_transform(dados_teste['Age'].values.reshape(-1,1))
dados_teste['Fare']=imp.fit_transform(dados_teste['Fare'].values.reshape(-1,1))

In [7]:
# Utilizando variáveis dummie nas colunas Embarked e Sex
dados_treino = pd.get_dummies(dados_treino, columns=['Embarked', 'Sex'], drop_first=True)
dados_teste = pd.get_dummies(dados_teste, columns=['Embarked', 'Sex'], drop_first=True)

In [8]:
dados_treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,1,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,1


In [30]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dados_treino['Age'] = scaler.fit_transform(dados_treino['Age'].values.reshape(-1,1))
dados_treino['Fare'] = scaler.fit_transform(dados_treino['Fare'].values.reshape(-1,1))
dados_teste['Age'] = scaler.fit_transform(dados_teste['Age'].values.reshape(-1,1))
dados_teste['Fare'] = scaler.fit_transform(dados_teste['Fare'].values.reshape(-1,1))

In [33]:
# Dados a serem utilizados para predição se um passageiro sobreviveu ou não
# Pclass, Sex, Age, Sibsp, Parch, Q, S, Fare
# Modelos para classificação: Regressão Logística, KNN, Árvores de Descisão, Gradient Boosting, SVM
# Separação dos features do target
y = dados_treino['Survived']
X = dados_treino.drop(['PassengerId', 'Name', 'Survived', 'Ticket', 'Cabin'], axis=1)
base_teste_final = dados_teste.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

In [34]:
# Primeiro modelo: Árvore de Decisão
# Separação em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, test_size=0.3)

In [21]:
grid_params = {'criterion': ['gini', 'entropy'], 
'max_depth': np.arange(1,10)}
tree_model = DecisionTreeClassifier()
gs_tree_model = GridSearchCV(tree_model, grid_params, cv=10, scoring='accuracy')


In [22]:
gs_tree_model.fit(X_train, y_train)


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
             scoring='accuracy')

In [23]:
gs_tree_model.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [24]:
gs_tree_model.best_score_

0.817153097798259

In [25]:
# Melhor modelo GridSearch
gs_tree_model.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [26]:
pred_tree = gs_tree_model.predict(X_test);

In [27]:
accuracy_score(y_test, pred_tree)

0.7649253731343284

In [37]:
# Previsão no dataset de teste
pred_teste_final = gs_tree_model.predict(base_teste_final);

In [50]:
df_final = pd.DataFrame()
df_final['Survived'] = pred_teste_final
df_final['PassengerId'] = dados_teste['PassengerId']



In [51]:
df_final.to_csv('final_teste1.csv', index=False)

In [35]:
# Estimando o modelo SVM 
grid_params_svm = {
    'C':[0.001, 0.01, 0.1, 1, 10],
    'kernel':['linear', 'rbf', 'poly'],
    }
svm = SVC()
gs_svm_model = GridSearchCV(svm, grid_params_svm, cv=10, scoring='accuracy', n_jobs=-1)

In [36]:
#Fitando o modelo nos dados
gs_svm_model.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ['linear', 'rbf', 'poly']},
             scoring='accuracy')

In [37]:
gs_svm_model.best_estimator_

SVC(C=1)

In [38]:
gs_svm_model.best_score_

0.821838197644649

In [39]:
pred_teste_final_svm = gs_svm_model.predict(base_teste_final)

In [40]:
df_final_svm = pd.DataFrame()
df_final_svm['Survived'] = pred_teste_final_svm
df_final_svm['PassengerId'] = dados_teste['PassengerId']

In [41]:
df_final_svm.to_csv('pred_svm_standard.csv', index=False)

In [42]:
# Prevendo modelo GradientBoost
grid_params_GB = {
        "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}
gb = GradientBoostingClassifier()
gs_gb_model = GridSearchCV(gb, grid_params_GB, cv=10, n_jobs=-1, scoring='accuracy')


In [43]:
gs_gb_model.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 1, 10, 100],
                         'max_depth': [1, 3, 5, 7, 9],
                         'n_estimators': [5, 50, 250, 500]},
             scoring='accuracy')

In [24]:
gs_gb_model.best_estimator_

GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=250)

In [44]:
gs_gb_model.best_score_

0.8379928315412186

In [45]:
y_gb_model_pred = gs_gb_model.predict(X_test);
accuracy_score(y_test, y_gb_model_pred)

0.7947761194029851

In [46]:
df_final_gb = pd.DataFrame()
df_final_gb['Survived'] = gs_gb_model.predict(base_teste_final)
df_final_gb['PassengerId'] = dados_teste['PassengerId']
df_final_gb.to_csv('pred_gb_standarlization.csv', index=False)

In [47]:
knn = KNeighborsClassifier()
param_grid_knn = {
    'n_neighbors': range(1,50)
}
gs_knn_model = GridSearchCV(knn, param_grid_knn, cv=10, n_jobs=-1)

In [48]:
gs_knn_model.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': range(1, 50)})

In [49]:
gs_knn_model.best_estimator_

KNeighborsClassifier(n_neighbors=17)

In [50]:
gs_knn_model.best_score_

0.8107014848950334

In [51]:
y_pred_knn = gs_knn_model.predict(X_test)

In [52]:
accuracy_score(y_test, y_pred_knn)

0.7947761194029851

In [54]:
df_final_knn = pd.DataFrame()
df_final_knn['Survived'] = gs_knn_model.predict(base_teste_final)
df_final_knn['PassengerId'] = dados_teste['PassengerId']
df_final_knn.to_csv('pred_knn_standarlization.csv', index=False)