# Dados de Treino e Teste

## Carregando Bibliotecas

In [10]:
# repr: notebook display options; caTools: provides sample.split(), used for
# the train/test split further down.
library(repr)
library(caTools)
# Make notebook plots 4 x 4 (units as defined by repr -- presumably inches).
options(repr.plot.width = 4, repr.plot.height = 4)

## Carregando os dados

In [5]:
# Read the Titanic training CSV; empty fields are parsed as NA. Then show the
# first rows as a sanity check.
df <- read.csv(
  file = "../datasets/titanic/train.csv",
  na.strings = ""
)
head(df)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


## Pré-Processamento de Dados

In [6]:
# Title extraction
# Collapse each passenger name into a coarse title group (Mr, Mrs, Miss,
# Master, Dr, Rev, Tripulacao). Names follow "Surname, Title. Given names", so
# the title is the text between the first comma and the following period.
# Matching the exact title (instead of searching for a substring anywhere in
# the name) keeps surnames such as "Collyer" or "Drew" from being read as
# "Col" or "Dr", and keeps "Dona" from being swallowed by the "Don" rule.
titulo <- sub("^[^,]*,\\s*([^.]*)\\..*$", "\\1", as.character(df$Name))
titulo <- sub("^the\\s+", "", trimws(titulo))  # "the Countess" -> "Countess"

grupo_por_titulo <- c(
  "Mr" = "Mr", "Don" = "Mr", "Sir" = "Mr", "Jonkheer" = "Mr",
  "Mrs" = "Mrs", "Dona" = "Mrs", "Countess" = "Mrs", "Mme" = "Mrs",
  "Lady" = "Mrs",
  "Miss" = "Miss", "Mlle" = "Miss", "Ms" = "Miss",
  "Master" = "Master",
  "Capt" = "Tripulacao", "Major" = "Tripulacao", "Col" = "Tripulacao",
  "Dr" = "Dr", "Rev" = "Rev"
)
grupo <- unname(grupo_por_titulo[titulo])
# Titles missing from the table keep their extracted text rather than
# becoming NA (a name that does not fit the pattern is kept unchanged).
df$Name <- ifelse(is.na(grupo), titulo, grupo)

# Encode a categorical vector as a factor whose labels are consecutive
# integers assigned in order of first appearance, starting at `primeiro`.
# The number of labels follows the data, so a different number of categories
# does not make factor() fail. NA values stay NA.
codificar <- function(x, primeiro = 1L) {
  x <- as.character(x)
  niveis <- unique(x[!is.na(x)])
  factor(x, levels = niveis, labels = seq_along(niveis) + (primeiro - 1L))
}

# Categorical to numeric
df$Name <- codificar(df$Name)

# Replace missing values
# NOTE(review): the imputation values and the scaling below are computed on
# the full data set, i.e. before the train/test split done later in this file.
# Consider deriving them from the training rows only.
media <- mean(df$Age, na.rm = TRUE)
df$Age[is.na(df$Age)] <- media

# Most frequent port of embarkation; which.max() returns a single value even
# when two ports are tied.
t <- table(df$Embarked)
moda <- names(t)[which.max(t)]

df$Embarked <- as.character(df$Embarked)
if (length(moda) == 1L) {
  df$Embarked[is.na(df$Embarked)] <- moda
}

# Categorical to numeric (Sex is labelled from 0, Embarked from 1)
df$Sex <- codificar(df$Sex, primeiro = 0L)

df$Embarked <- codificar(df$Embarked)

# Drop columns that are not used as features
df$PassengerId <- NULL
df$Ticket <- NULL
df$Cabin <- NULL

# Scaling: centre Age and Fare and divide by their standard deviation
df[, c("Age", "Fare")] <- scale(df[, c("Age", "Fare")])

head(df)

Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,0,-0.592148,1,0,-0.5021631,1
1,1,2,1,0.6384304,1,0,0.7864036,2
1,3,3,1,-0.2845034,0,0,-0.4885799,1
1,1,2,1,0.407697,1,0,0.4204941,1
0,3,1,0,0.407697,0,0,-0.4860644,1
0,3,1,0,0.0,0,0,-0.4778481,3


# Divisão dos dados em treino e teste

- A função **sample.split** da biblioteca **caTools** realiza a subdivisão dos dados.
- **Def**: Divide os dados do vetor Y em dois conjuntos, em uma proporção predefinida, preservando as proporções relativas dos diferentes rótulos em Y. É usada para dividir os dados de classificação em subconjuntos de treino e teste.
    - **SplitRatio** - Define a razão de divisão (proporção dos dados destinada ao treino).
- **set.seed(n)** - Fixa a semente do gerador aleatório, permitindo obter sempre as mesmas amostras aleatórias a cada execução.
- **sample.split** retorna um vetor lógico (TRUE/FALSE) que nos permite selecionar as linhas do dataframe com o auxílio da função **subset**.

In [17]:
# Fixed seed so the random split is the same on every run.
set.seed(0)
# Logical mask over the rows of df: TRUE marks a training row (75% of the
# rows), FALSE a test row.
divisao <- sample.split(df[["Survived"]], SplitRatio = 0.75)

# Partition the rows according to the mask.
treino <- df[which(divisao), , drop = FALSE]
teste <- df[which(!divisao), , drop = FALSE]

# Features: every column except the first one (Survived).
x_train <- treino[, -1]
x_test <- teste[, -1]

# Targets: the Survived column.
y_train <- treino[["Survived"]]
y_test <- teste[["Survived"]]

In [18]:
# Preview the first rows of the training features.
head(x_train)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
2,1,2,1,0.6384304,1,0,0.7864036,2
3,3,3,1,-0.2845034,0,0,-0.4885799,1
5,3,1,0,0.407697,0,0,-0.4860644,1
6,3,1,0,0.0,0,0,-0.4778481,3
7,1,1,0,1.8690089,0,0,0.3955914,1
9,3,2,1,-0.2075923,0,2,-0.424018,1


In [19]:
# Preview the first rows of the test features.
head(x_test)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
1,3,1,0,-0.592148,1,0,-0.50216314,1
4,1,2,1,0.407697,1,0,0.42049407,1
8,3,4,0,-2.1303711,3,1,-0.22395734,1
11,3,3,1,-1.9765488,1,1,-0.31199715,1
12,1,3,1,2.1766535,0,0,-0.1137818,1
14,3,1,0,0.7153416,1,5,-0.01869881,1


In [21]:
# Preview the first training labels (Survived).
head(y_train)

In [23]:
# Preview the first test labels (Survived).
head(y_test)