In [132]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

import time

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn import svm as SVM
from sklearn.neural_network import MLPClassifier

In [133]:
gender = pd.read_csv('titanic/gender_submission.csv', index_col=False, squeeze=True)
gender.dropna()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [134]:
train = pd.read_csv('titanic/train.csv', index_col=False, squeeze=True)

#realiza leitura e já remove linhas com valores NaN
train["Age"] = train["Age"].fillna(train["Age"].mean())
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [135]:
#printando então a matriz de correlação, notamos que o atributo "Fare" tem uma boa correlação com
#os membros sobreviventes, vamos explorar melhor
train[["PassengerId", "Survived", "Pclass", "Age", 'SibSp', "Parch", "Fare"]].corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0


In [136]:
def toNumerical(df, attr):
    return pd.factorize(df[attr])[0]

#converte alguns atributos categóricos para numéricos para análise de correlação
train['Sex'] = toNumerical(train, 'Sex')
train['Embarked'] = toNumerical(train, 'Embarked')
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,B42,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,C148,1


In [137]:
#podemos notar claramente que o sexo é um fator que pode ser bastante útil para identificar a sobrevivência
#isso, olhando para as correlações. Vamos tentar identificar outras informações do dataset
train[["PassengerId", "Survived", "Age", "Pclass", 'SibSp', "Fare", "Sex", "Embarked"]].corr()

Unnamed: 0,PassengerId,Survived,Age,Pclass,SibSp,Fare,Sex,Embarked
PassengerId,1.0,-0.005007,0.033207,-0.035144,-0.057527,0.012658,-0.042939,-0.030323
Survived,-0.005007,1.0,-0.069809,-0.338481,-0.035322,0.257307,0.543351,0.101849
Age,0.033207,-0.069809,1.0,-0.331339,-0.232625,0.091566,-0.084153,0.001932
Pclass,-0.035144,-0.338481,-0.331339,1.0,0.083081,-0.5495,-0.1319,0.050992
SibSp,-0.057527,-0.035322,-0.232625,0.083081,1.0,0.159651,0.114631,-0.058008
Fare,0.012658,0.257307,0.091566,-0.5495,0.159651,1.0,0.182333,0.058462
Sex,-0.042939,0.543351,-0.084153,-0.1319,0.114631,0.182333,1.0,0.111249
Embarked,-0.030323,0.101849,0.001932,0.050992,-0.058008,0.058462,0.111249,1.0


In [138]:
#podemos notar então, que no dataset, temos 60 mortos e 123 sobreviventes
train.groupby("Survived")["Survived"].count()

Survived
0    549
1    342
Name: Survived, dtype: int64

In [139]:
#olhando agora para o atributo Passagem
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208,0.359147
std,257.353842,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,0.638707
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0,-1.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,0.0
50%,446.0,0.0,3.0,0.0,29.699118,0.0,0.0,14.4542,0.0
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0,1.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [140]:
#Como podemos perceber, pela média e desvio padrão, podemos dividir os atributos em grupos de 
#aproximadamente 70 em 70. Fazendo isso:

def agroupFare(row):
    fare = row["Fare"]
    if(fare < 70): 
        return 0
    if(fare < 140):
        return 1
    if(fare < 210):
        return 2
    if(fare < 280):
        return 3
    if(fare < 350):
        return 4
    if(fare < 420):
        return 5
    if(fare < 500):
        return 6
    return 7

train["FareGroup"] = train.apply (lambda row: agroupFare(row), axis=1)
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FareGroup
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,C85,1,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,C123,0,0
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,,0,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,B42,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,,0,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,C148,1,0


In [141]:
modelDataset = train[["PassengerId", "Survived", "Age", 'Pclass', "Fare", "FareGroup", "Sex", "Embarked"]]
modelDataset.corr()

Unnamed: 0,PassengerId,Survived,Age,Pclass,Fare,FareGroup,Sex,Embarked
PassengerId,1.0,-0.005007,0.033207,-0.035144,0.012658,0.01489,-0.042939,-0.030323
Survived,-0.005007,1.0,-0.069809,-0.338481,0.257307,0.208589,0.543351,0.101849
Age,0.033207,-0.069809,1.0,-0.331339,0.091566,0.08191,-0.084153,0.001932
Pclass,-0.035144,-0.338481,-0.331339,1.0,-0.5495,-0.438254,-0.1319,0.050992
Fare,0.012658,0.257307,0.091566,-0.5495,1.0,0.951401,0.182333,0.058462
FareGroup,0.01489,0.208589,0.08191,-0.438254,0.951401,1.0,0.149118,0.079204
Sex,-0.042939,0.543351,-0.084153,-0.1319,0.182333,0.149118,1.0,0.111249
Embarked,-0.030323,0.101849,0.001932,0.050992,0.058462,0.079204,0.111249,1.0


In [142]:
def classifier(X_train,X_test,Y_train,Y_test):
    #Aplicando KNN
    print(" KNN ")
    ini = time.time() #Inicio do calculo do tempo

    knn=KNeighborsClassifier()
    knn.fit(X_train,Y_train)

    end = time.time()
    knn_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,knn.predict(X_test)))
    print()
    
    #Aplicando Naive Bayes
    print("Naive Bayes")
    ini = time.time() #Inicio do calculo do tempo

    nb=BernoulliNB()
    nb.fit(X_train,Y_train)

    end = time.time()
    nb_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,nb.predict(X_test)))
    print()
    
    #Aplicando Arvore de Decisao
    print("Árvore de Decisão")
    ini = time.time() #Inicio do calculo do tempo

    dtc=tree.DecisionTreeClassifier()
    dtc.fit(X_train,Y_train)

    end = time.time()
    dtc_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,dtc.predict(X_test)))

    #Aplicando SVM
    # print("SVM")
    # ini = time.time() #Inicio do calculo do tempo

    # svm = SVM.SVC(kernel='linear', probability=True)
    # svm.fit(X_train, Y_train)

    # end = time.time()
    # svm_time = end - ini # Fim do calculo do tempo

    # print(classification_report(Y_test,svm.predict(X_test)))

    #Aplicando MLP
    print("MLP")
    ini = time.time() #Inicio do calculo do tempo

    mlp = MLPClassifier(alpha=1, max_iter=1000)
    mlp.fit(X_train, Y_train)

    end = time.time()
    mlp_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,mlp.predict(X_test)))

    return knn, nb, dtc, mlp

In [143]:
#partindo então para um análise preditiva com alguns métodos, temos
X_train,X_test,Y_train,Y_test=train_test_split(modelDataset[["Embarked", "Fare", "Sex", "Pclass"]],modelDataset['Survived'], test_size=0.2, random_state=0)

knn, nb, dtc, mlp = classifier(X_train,X_test,Y_train,Y_test)

 KNN 
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       110
           1       0.73      0.65      0.69        69

    accuracy                           0.77       179
   macro avg       0.76      0.75      0.75       179
weighted avg       0.77      0.77      0.77       179


Naive Bayes
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       110
           1       0.73      0.71      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179


Árvore de Decisão
              precision    recall  f1-score   support

           0       0.84      0.87      0.86       110
           1       0.78      0.74      0.76        69

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82   

In [144]:
test = pd.read_csv('titanic/test.csv', index_col=False, squeeze=True)
# test = test.dropna()
test["Fare"] = test["Fare"].fillna(0)
test['Sex'] = toNumerical(test, 'Sex')
test['Embarked'] = toNumerical(test, 'Embarked')
test['Survived'] = dtc.predict(test[["Embarked", "Fare", "Sex", "Pclass"]])
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0000,,1,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,0,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",0,,0,0,A.5. 3236,8.0500,,1,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,2,1
415,1307,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,1,0
416,1308,3,"Ware, Mr. Frederick",0,,0,0,359309,8.0500,,1,0


In [145]:
submission = test[["PassengerId", "Survived"]]
try:
    f = open("titanic/results.csv", "w+")
except:
    f = open("titanic/results.csv", "w+")
f.write(submission.to_csv(index=False))
f.close()