In [1]:
import numpy as np
import pandas as pd 



In [2]:
# Leitura dos dados
dados = pd.read_csv(r'dados\train.csv')
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
dados.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Tratamento de dados faltantes

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
# Tratamento de dados categóricos realizados pelo mais frequente
imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
dados['Age']=imp.fit_transform(dados['Age'].values.reshape(-1,1))
dados['Embarked'] = imp_cat.fit_transform(dados['Embarked'].values.reshape(-1,1))

In [5]:
# Utilizando variáveis dummie na coluna Embarked
dados = pd.concat([dados, pd.get_dummies(dados['Embarked'], drop_first=True)], axis=1) 


In [6]:
dados.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Q            891 non-null    uint8  
 13  S            891 non-null    uint8  
dtypes: float64(2), int64(5), object(5), uint8(2)
memory usage: 85.4+ KB


In [7]:
# Dados a serem utilizados para predição se um passageiro sobreviveu ou não
# Pclass, Sex, Age, Sibsp, Parch, Q, S
# Modelos para classificação: Regressão Logística, KNN, Árvores de Descisão, Gradient Boosting, SVM
# Separação dos features do target
y = dados['Survived']
X = dados.drop(['PassengerId', 'Name', 'Survived', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis=1)

In [8]:
X = pd.concat([X,pd.get_dummies(X['Sex'], drop_first=True)], axis=1)
X.drop('Sex', axis=1, inplace=True)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X['Age']=scaler.fit_transform(X['Age'].values.reshape(-1,1));

In [10]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Q,S,male
0,3,-0.565736,1,0,0,1,1
1,1,0.663861,1,0,0,0,0
2,3,-0.258337,0,0,0,1,0
3,1,0.433312,1,0,0,1,0
4,3,0.433312,0,0,0,1,1


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [13]:
grid_params = {'criterion': ['gini', 'entropy'], 
'max_depth': np.arange(1,10), 
'min_samples_leaf': np.linspace(0.1, 0.5, num=20)}
tree_model = DecisionTreeClassifier()
gs_tree_model = GridSearchCV(tree_model, grid_params, cv=10, scoring='accuracy')


In [14]:
gs_tree_model.fit(X_train, y_train)


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_leaf': array([0.1       , 0.12105263, 0.14210526, 0.16315789, 0.18421053,
       0.20526316, 0.22631579, 0.24736842, 0.26842105, 0.28947368,
       0.31052632, 0.33157895, 0.35263158, 0.37368421, 0.39473684,
       0.41578947, 0.43684211, 0.45789474, 0.47894737, 0.5       ])},
             scoring='accuracy')

In [15]:
gs_tree_model.best_params_

{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 0.1}

In [16]:
gs_tree_model.best_score_

0.7814337403889644

In [17]:
gs_tree_model.best_estimator_

DecisionTreeClassifier(max_depth=1, min_samples_leaf=0.1)

In [18]:
pred_tree = gs_tree_model.predict(X_test);

In [19]:
accuracy_score(y_test, pred_tree)

0.8026905829596412

In [20]:
teste_dados = pd.read_csv('dados/test.csv')

In [21]:
teste_dados.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [22]:
teste_tree = teste_dados.drop(['Name', 'Ticket', 'Cabin', 'Fare'], axis=1)

In [23]:
# Tratamento de dados categóricos realizados pelo mais frequente
teste_tree['Age']=imp.fit_transform(teste_tree['Age'].values.reshape(-1,1))
teste_tree['Embarked'] = imp_cat.fit_transform(teste_tree['Embarked'].values.reshape(-1,1))

In [24]:
teste_tree = pd.concat([teste_tree, pd.get_dummies(teste_tree['Embarked'], drop_first=True)], axis=1) 


In [25]:
teste_tree.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked,Q,S
0,892,3,male,34.5,0,0,Q,1,0
1,893,3,female,47.0,1,0,S,0,1
2,894,2,male,62.0,0,0,Q,1,0
3,895,3,male,27.0,0,0,S,0,1
4,896,3,female,22.0,1,1,S,0,1


In [26]:
X_teste = teste_tree.drop('PassengerId', axis=1)
X_teste = pd.concat([X_teste,pd.get_dummies(X_teste['Sex'], drop_first=True)], axis=1)

In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_teste['Age']=scaler.fit_transform(X_teste['Age'].values.reshape(-1,1));

In [28]:
X_teste.drop(['Embarked'], inplace=True, axis=1)

In [32]:
X_teste.drop('Sex', inplace=True, axis=1)

In [33]:
previsoes_teste = gs_tree_model.predict(X_teste)

In [35]:
teste_dados['Survived'] = previsoes_teste

In [38]:
df_final = teste_dados[['PassengerId', 'Survived']]

In [41]:
df_final.to_csv('final_teste.csv', index=False)

In [40]:
df_final.head(
)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
