<a href="https://colab.research.google.com/github/arenas-franklin/bootcampDataEngineer/blob/main/IntroducaoDataScience/modelo_arvore_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelo Titanic

## Pacotes Necessários

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


## Lendo o banco de dados

In [2]:
dados = pd.read_csv("/content/train.csv", sep=",")

In [3]:
dados.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
dados.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# remover algumas colunas que não será utlizada para analise 
dados = dados.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

In [6]:
dados.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


## Editando Chave e Variável Resposta

In [7]:
# tornando a coluna PassengerId como coluna de referencia uma chave primária 
dados = dados.set_index(['PassengerId'])
# mudar o nome da columa Survived para Target
dados = dados.rename(columns = {'Survived':'Target'}, inplace = False)

In [8]:
dados.head()

Unnamed: 0_level_0,Target,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,1,0,7.25
2,1,1,female,38.0,1,0,71.2833
3,1,3,female,26.0,0,0,7.925
4,1,1,female,35.0,1,0,53.1
5,0,3,male,35.0,0,0,8.05


## Descritiva

In [9]:
dados.describe()

Unnamed: 0,Target,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
dados.describe(include=['O'])

Unnamed: 0,Sex
count,891
unique,2
top,male
freq,577


## Transformação de dados

In [13]:
dados['Sex_f'] = np.where(dados['Sex'] == 'female', 1,0 )

dados['Pclass_1'] = np.where(dados['Pclass'] == 1, 1, 0)
dados['Pclass_2'] = np.where(dados['Pclass'] == 2, 1, 0)
dados['Pclass_3'] = np.where(dados['Pclass'] == 3, 1, 0)

In [14]:
dados = dados.drop(['Pclass', 'Sex'], axis = 1)

In [15]:
dados.head()

Unnamed: 0_level_0,Target,Age,SibSp,Parch,Fare,Sex_f,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,22.0,1,0,7.25,0,0,0,1
2,1,38.0,1,0,71.2833,1,1,0,0
3,1,26.0,0,0,7.925,1,0,0,1
4,1,35.0,1,0,53.1,1,1,0,0
5,0,35.0,0,0,8.05,0,0,0,1


In [16]:
# vendo quantos dados faltantes existem
dados.isnull().sum()

Target        0
Age         177
SibSp         0
Parch         0
Fare          0
Sex_f         0
Pclass_1      0
Pclass_2      0
Pclass_3      0
dtype: int64

In [17]:
# substituir os dados faltantes por 0
dados.fillna(0, inplace = True)

In [18]:
dados.isnull().sum()

Target      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex_f       0
Pclass_1    0
Pclass_2    0
Pclass_3    0
dtype: int64

## Amostragem 

In [21]:
x_train, x_test, y_train, y_test = train_test_split(dados.drop(['Target'], axis=1),
                                                     dados['Target'],
                                                     test_size = 0.3,
                                                     random_state = 1234)

[{'treino': x_train.shape}, {'teste': x_test.shape}]

[{'treino': (623, 8)}, {'teste': (268, 8)}]

## Modelo

**Random Forest**

In [24]:
rndforest =  RandomForestClassifier(n_estimators = 1000,
                                    criterion = 'gini',
                                    max_depth = 5)
rndforest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
probabilidade = rndforest.predict_proba(dados.drop('Target', axis=1))[:,1]
classificacao = rndforest.predict(dados.drop('Target', axis=1)) 

In [28]:
dados['Probabilidade'] = probabilidade
dados['Classificacao'] = classificacao

In [29]:
dados.head(10)

Unnamed: 0_level_0,Target,Age,SibSp,Parch,Fare,Sex_f,Pclass_1,Pclass_2,Pclass_3,Probabilidade,Classificacao
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,22.0,1,0,7.25,0,0,0,1,0.123844,0
2,1,38.0,1,0,71.2833,1,1,0,0,0.931202,1
3,1,26.0,0,0,7.925,1,0,0,1,0.445025,0
4,1,35.0,1,0,53.1,1,1,0,0,0.915795,1
5,0,35.0,0,0,8.05,0,0,0,1,0.137116,0
6,0,0.0,0,0,8.4583,0,0,0,1,0.147201,0
7,0,54.0,0,0,51.8625,0,1,0,0,0.368295,0
8,0,2.0,3,1,21.075,0,0,0,1,0.297467,0
9,1,27.0,0,2,11.1333,1,0,0,1,0.489343,0
10,1,14.0,1,0,30.0708,1,0,1,0,0.818137,1
