In [1]:


# manipulação dos dados
import pandas as pd     
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import plotly.express as px


# preprocessamento de dados e metricas
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron


# warnings
import warnings
# Silences every warning for the rest of the session, including deprecation
# notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Load the Titanic train and test splits from the local data directory.
train_path, test_path = './data-titanic/train.csv', './data-titanic/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# preview the first rows of the training set

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# dataset preparation

# Integer encodings for the categorical columns, shared by train and test.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 1, 'C': 2, 'Q': 3}

# Columns that will not be used further ('Sex' is replaced by 'sex_').
UNUSED_COLUMNS = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin']


def _prepare_passengers(frame, embarked_fill=1, fare_fill=1):
    """Return a cleaned copy of a raw Titanic passenger frame.

    Steps, in order:
      1. encode ``Sex`` into a new ``sex_`` column (male=0, female=1);
      2. encode ``Embarked`` as integers (S=1, C=2, Q=3);
      3. drop the columns listed in ``UNUSED_COLUMNS``;
      4. fill missing ``Age`` with the frame's own mean age;
      5. fill missing ``Embarked`` and ``Fare`` with the given constants.

    Every fill is written back by assignment. The earlier
    ``frame['col'].fillna(..., inplace=True)`` form operates on a temporary
    Series under pandas copy-on-write and leaves the frame unchanged; since
    this notebook ignores all warnings, that failure would go unnoticed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw frame containing at least ``Sex``, ``Embarked``, ``Age``, ``Fare``
        and the columns in ``UNUSED_COLUMNS``. It is not modified.
    embarked_fill : int, default 1
        Code used for missing ``Embarked`` values (1 is the code for 'S').
    fare_fill : float, default 1
        Value used for missing ``Fare`` values.

    Returns
    -------
    pandas.DataFrame
        New frame with the original column order minus the dropped columns,
        and ``sex_`` appended as the last column.

    Raises
    ------
    KeyError
        If an expected column is missing, e.g. when the frame was already
        prepared and ``Sex`` has been dropped.
    """
    prepared = frame.copy()

    # Replace category labels with numbers.
    prepared['sex_'] = prepared['Sex'].map(SEX_CODES)
    prepared['Embarked'] = prepared['Embarked'].map(EMBARKED_CODES)

    prepared = prepared.drop(columns=UNUSED_COLUMNS)

    # Impute by assignment rather than chained inplace calls.
    prepared['Age'] = prepared['Age'].fillna(prepared['Age'].mean())
    prepared['Embarked'] = prepared['Embarked'].fillna(embarked_fill)
    prepared['Fare'] = prepared['Fare'].fillna(fare_fill)
    return prepared


# Both frames go through the same pipeline. Filling 'Fare' in train and
# 'Embarked' in test is a no-op on this data (neither has missing values),
# so the results match the previous per-frame handling.
# Run once per load: 'Sex' is consumed here, so re-running this cell without
# reloading the CSVs raises KeyError.
train = _prepare_passengers(train)
test = _prepare_passengers(test)

In [5]:
# count missing values per column in the training set

train.isnull().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
sex_        0
dtype: int64

In [6]:
# count missing values per column in the test set

test.isnull().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
sex_        0
dtype: int64

In [7]:
# data standardization
# NOTE(review): this cell only displays the prepared training frame; no scaling is applied here.

train

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,sex_
0,0,3,22.000000,1,0,7.2500,1.0,0
1,1,1,38.000000,1,0,71.2833,2.0,1
2,1,3,26.000000,0,0,7.9250,1.0,1
3,1,1,35.000000,1,0,53.1000,1.0,1
4,0,3,35.000000,0,0,8.0500,1.0,0
...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,1.0,0
887,1,1,19.000000,0,0,30.0000,1.0,1
888,0,3,29.699118,1,2,23.4500,1.0,1
889,1,1,26.000000,0,0,30.0000,2.0,0
