# Titanic: Machine Learning from Disaster
## Bibliotecas utilizadas

In [1]:
from collections import namedtuple
from numbers import Number

import pandas as pd

## Lendo o arquivo e exibindo suas extremidades

In [2]:
csv_train = pd.read_csv('datasets/train.csv')
csv_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
csv_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## Criando um novo dataframe
Esse novo dataframe terá apenas as colunas úteis para modelagem. Serão removidos o nome e o número do ticket, PassengerId será colocado como índice e as variáveis Sex e Embarked serão transformadas em dummy

In [19]:
train = csv_train.copy()  # Cria uma cópia do dataframe
train.set_index('PassengerId', inplace=True)  # Define PassengerId como novo índice das linhas
dummies = pd.get_dummies(train.loc[:, ['Sex', 'Embarked']])  # transforma Sex e Embarked em variáveis dummy
train = pd.concat([train, dummies], axis=1)  # Concatena train e dummies
train.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)  # Remove colunas inúteis

# Criando a variável CabinType

In [20]:
def gera_CabinType(cabin):
    if isinstance(cabin, Number):
        return 'X'
    elif ' ' in cabin:  # Mais de uma cabine
        cabins = cabin.split()  # Separa todas as cabines em uma lista
        cabins = [c[0] for c in cabins]  # Pega o primeiro caractere de cada cabine
        return ''.join(set(cabins))  # Remove duplicatas e junta todos os CabinTypes
    else:
        return cabin[0]

CabinTypes = train['Cabin'].apply(gera_CabinType)
CabinTypes.name = 'CabinType'
train = pd.concat([train, CabinTypes], axis=1)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,22.0,1,0,7.25,,0,1,0,0,1,X
2,1,1,38.0,1,0,71.2833,C85,1,0,1,0,0,C
3,1,3,26.0,0,0,7.925,,1,0,0,0,1,X
4,1,1,35.0,1,0,53.1,C123,1,0,0,0,1,C
5,0,3,35.0,0,0,8.05,,0,1,0,0,1,X


In [21]:
print(train.groupby('CabinType').apply(lambda x: x.Fare.min()))
print(train.groupby('CabinType').apply(lambda x: x.Fare.mean()))
print(train.groupby('CabinType').apply(lambda x: x.Fare.max()))

CabinType
A      0.0000
B      0.0000
C     26.5500
D     12.8750
E      8.0500
EF    22.3583
F      7.7500
FG     7.6500
G     10.4625
T     35.5000
X      0.0000
dtype: float64
CabinType
A      39.623887
B     113.505764
C     100.151341
D      57.244576
E      46.026694
EF     22.358300
F      21.972222
FG      7.650000
G      13.581250
T      35.500000
X      19.157325
dtype: float64
CabinType
A      81.8583
B     512.3292
C     263.0000
D     113.2750
E     134.5000
EF     22.3583
F      39.0000
FG      7.6500
G      16.7000
T      35.5000
X     512.3292
dtype: float64


## Analisando valores missing em Cabin
Comparando o preço e a classe dos passageiros sem número de cabine

In [7]:
PreçosCabines = namedtuple('PreçosCabines', 'min méd max')
com_cabine = train.loc[train.Cabin.notnull(), :]
sem_cabine = train.loc[train.Cabin.isnull(), :]
preços_sem_cabine = PreçosCabines(sem_cabine.Fare.min(),
                                  sem_cabine.Fare.mean(),
                                  sem_cabine.Fare.max())
preços_com_cabine = PreçosCabines(com_cabine.Fare.min(),
                                  com_cabine.Fare.mean(),
                                  com_cabine.Fare.max())
print(f'Preços dos tickets dos passageiros sem cabine: {preços_sem_cabine}')
print(f'Preços dos tickets dos passageiros com cabine: {preços_com_cabine}')

train.loc[train.Fare > 512, :]

Preços dos tickets dos passageiros sem cabine: PreçosCabines(min=0.0, méd=19.157325327510915, max=512.32920000000001)
Preços dos tickets dos passageiros com cabine: PreçosCabines(min=0.0, méd=76.141503921568628, max=512.32920000000001)


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
259,1,1,35.0,0,0,512.3292,,1,0,1,0,0,
680,1,1,36.0,0,1,512.3292,B51 B53 B55,0,1,1,0,0,B
738,1,1,35.0,0,0,512.3292,B101,0,1,1,0,0,B


## Próximo passo: prever o tipo de cabine