In [26]:
import pandas as pd
import numpy as np

In [27]:
# Load the sample submission file.
# The former `squeeze=True` argument is dropped: it was deprecated in pandas 1.4
# and removed in pandas 2.0 (where it raises TypeError), and it had no effect
# here anyway because the file has more than one column.
gender = pd.read_csv('titanic/gender_submission.csv', index_col=False)

# Display the frame without rows that contain NaN. The result is not assigned,
# so `gender` itself is left unchanged.
gender.dropna()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [28]:
# Read the training data and immediately drop every row that contains a NaN.
# The former `squeeze=True` argument is dropped: it was deprecated in pandas 1.4
# and removed in pandas 2.0 (where it raises TypeError), and it was a no-op for
# this multi-column file.
# NOTE(review): dropna() keeps only rows where *every* column is present
# (183 rows in the saved outputs below) -- confirm this heavy filtering is intended.
train = pd.read_csv('titanic/train.csv', index_col=False).dropna()

In [29]:
# Show the correlation matrix of the numeric attributes. "Fare" is positively
# correlated with "Survived" (about 0.13 in the saved output), so it is
# explored further below.
train.loc[
    :,
    ["PassengerId", "Survived", "Pclass", "Age", 'SibSp', "Parch", "Fare"],
].corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,0.148495,-0.089136,0.030933,-0.083488,-0.051454,0.02974
Survived,0.148495,1.0,-0.034542,-0.254085,0.106346,0.023582,0.134241
Pclass,-0.089136,-0.034542,1.0,-0.306514,-0.103592,0.047496,-0.315235
Age,0.030933,-0.254085,-0.306514,1.0,-0.156162,-0.271271,-0.092424
SibSp,-0.083488,0.106346,-0.103592,-0.156162,1.0,0.255346,0.286433
Parch,-0.051454,0.023582,0.047496,-0.271271,0.255346,1.0,0.38974
Fare,0.02974,0.134241,-0.315235,-0.092424,0.286433,0.38974,1.0


In [30]:
def toNumerical(df, attr):
    """Return integer codes for the values of column ``attr`` in ``df``.

    Codes come from ``pd.factorize``: each distinct value gets an integer in
    order of first appearance, and missing values are encoded as -1.
    Only the codes array is returned; the array of unique values is discarded.
    """
    codes, _uniques = pd.factorize(df[attr])
    return codes

# Convert some categorical attributes to numeric codes so they can take part in
# the correlation analysis. toNumerical uses pd.factorize, so codes follow the
# order in which each value first appears in the column.
# The columns are overwritten in place on `train`; the original string labels
# are no longer available afterwards.
train['Sex'] = toNumerical(train, 'Sex')
train['Embarked'] = toNumerical(train, 'Embarked')
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,1
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,1
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",0,4.0,1,1,PP 9549,16.7000,G6,1
11,12,1,1,"Bonnell, Miss. Elizabeth",0,58.0,0,0,113783,26.5500,C103,1
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",0,47.0,1,1,11751,52.5542,D35,1
872,873,0,1,"Carlsson, Mr. Frans Olof",1,33.0,0,0,695,5.0000,B51 B53 B55,1
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",0,56.0,0,1,11767,83.1583,C50,0
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,1


In [31]:
# Looking at the correlations, sex is clearly a factor that can be very useful
# for identifying survival (about -0.53 against "Survived" in the saved output).
# Next, try to extract further information from the dataset.
train.loc[
    :,
    ["PassengerId", "Survived", "Age", 'SibSp', "Fare", "Sex", "Embarked"],
].corr()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Fare,Sex,Embarked
PassengerId,1.0,0.148495,0.030933,-0.083488,0.02974,-0.025205,0.021576
Survived,0.148495,1.0,-0.254085,0.106346,0.134241,-0.532418,-0.108914
Age,0.030933,-0.254085,1.0,-0.156162,-0.092424,0.184969,-0.07779
SibSp,-0.083488,0.106346,-0.156162,1.0,0.286433,-0.104291,0.089959
Fare,0.02974,0.134241,-0.092424,0.286433,1.0,-0.130433,-0.227926
Sex,-0.025205,-0.532418,0.184969,-0.104291,-0.130433,1.0,0.059453
Embarked,0.021576,-0.108914,-0.07779,0.089959,-0.227926,0.059453,1.0


In [33]:
# Count rows per value of "Survived". In the saved run (after the NaN rows were
# dropped) this gives 60 non-survivors (0) and 123 survivors (1).
(
    train
    .groupby(by="Survived")["Survived"]
    .count()
)

Survived
0     60
1    123
Name: Survived, dtype: int64

In [51]:
# Now looking at the Fare attribute via the summary statistics.
# NOTE(review): the saved output of this cell already contains a "FareGroup"
# column, which is only created by a later cell; the cell was executed out of
# order, so a fresh Restart & Run All will not reproduce this exact table.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FareGroup
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,0.519126,35.674426,0.464481,0.47541,78.682469,0.655738,0.704918
std,247.052476,0.470725,0.515187,0.501005,15.643866,0.644159,0.754617,76.347843,0.498963,1.1046
min,2.0,0.0,1.0,0.0,0.92,0.0,0.0,0.0,0.0,0.0
25%,263.5,0.0,1.0,0.0,24.0,0.0,0.0,29.7,0.0,0.0
50%,457.0,1.0,1.0,1.0,36.0,0.0,0.0,57.0,1.0,0.0
75%,676.0,1.0,1.0,1.0,47.5,1.0,1.0,90.0,1.0,1.0
max,890.0,1.0,3.0,1.0,80.0,3.0,4.0,512.3292,2.0,7.0


In [47]:
# Judging by the mean and standard deviation of "Fare", the values can be split
# into groups roughly 70 wide. Doing that:

def agroupFare(row):
    """Map a row's "Fare" to an integer fare-group index from 0 to 7.

    Group ``i`` holds fares below the i-th upper bound (exclusive) that were
    not already placed in a lower group. Fares of 500 or more go to group 7.
    Note that the last bound is 500 rather than 490, so group 6 is slightly
    wider than the others.

    A NaN fare fails every ``<`` comparison and therefore also lands in group 7.
    """
    upper_bounds = (70, 140, 210, 280, 350, 420, 500)
    fare = row["Fare"]
    for group, bound in enumerate(upper_bounds):
        if fare < bound:
            return group
    return len(upper_bounds)

# Compute the fare group of every row (row-wise apply) and store it as a new
# "FareGroup" column on `train`, then display the resulting frame.
fare_groups = train.apply(agroupFare, axis=1)
train["FareGroup"] = fare_groups
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FareGroup
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,1,0
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",0,4.0,1,1,PP 9549,16.7000,G6,1,0
11,12,1,1,"Bonnell, Miss. Elizabeth",0,58.0,0,0,113783,26.5500,C103,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",0,47.0,1,1,11751,52.5542,D35,1,0
872,873,0,1,"Carlsson, Mr. Frans Olof",1,33.0,0,0,695,5.0000,B51 B53 B55,1,0
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",0,56.0,0,1,11767,83.1583,C50,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,1,0


In [50]:
# Select the columns that make up the modelling dataset and display it.
modelDataset = train[
    [
        "PassengerId",
        "Survived",
        "Age",
        'SibSp',
        "Fare",
        "FareGroup",
        "Sex",
        "Embarked",
    ]
]
modelDataset

Unnamed: 0,PassengerId,Survived,Age,SibSp,Fare,FareGroup,Sex,Embarked
1,2,1,38.0,1,71.2833,1,0,0
3,4,1,35.0,1,53.1000,0,0,1
6,7,0,54.0,0,51.8625,0,1,1
10,11,1,4.0,1,16.7000,0,0,1
11,12,1,58.0,0,26.5500,0,0,1
...,...,...,...,...,...,...,...,...
871,872,1,47.0,1,52.5542,0,0,1
872,873,0,33.0,0,5.0000,0,1,1
879,880,1,56.0,0,83.1583,1,0,0
887,888,1,19.0,0,30.0000,0,0,1
