In [2]:
# Classificação Multiclasse com SVM - Prevendo Gastos com Cartão de Crédito em 3 Categorias

# Obs: Caso tenha problemas com a acentuação, consulte este link:
# https://support.rstudio.com/hc/en-us/articles/200532197-Character-Encoding

# Show the current working directory; the relative "dados/..." paths used later resolve against it
getwd()
# Uncomment and adjust to point the session at the project folder
#setwd("~/Dropbox/DSA/MachineLearning2.0/Cap11/R")

In [3]:
####  Definindo o Problema de Negócio #### 

# A identificação e a capacidade de classificar os clientes com base nos gastos sempre foram uma área de 
# interesse para instituições bancárias e empresas de cartão de crédito. É um aspecto importante no 
# gerenciamento de relacionamento com o cliente e ajuda a aumentar a receita com clientes existentes. Várias 
# tentativas foram feitas a esse respeito. Os emissores de cartões de crédito tradicionalmente têm como alvo 
# os consumidores usando informações sobre seus comportamentos e dados demográficos. 

# Nosso trabalho é classificar os clientes de cartão de crédito de acordo com seu comportamento de gastos. 
# A segmentação é um aspecto importante na compreensão do cliente e na execução de campanhas de marketing 
# eficazes e rentáveis. Usaremos o SVM como nosso modelo.

# Os dados demográficos, os detalhes sobre emprego e o estilo de vida dos clientes desempenham um papel vital na 
# maneira como eles gastam. Existem fatores ocultos, bem como semelhança com as compras. A máquina de vetores 
# de suporte pode ser usada para problemas de regressão e classificação. 

# Usaremos SVM com Kernel Linear Multiclasse como nosso modelo proposto para classificar a variável target. 
# No entanto, também avaliaremos outros Kernels, como RBF e Polinomial, para uma variedade de hiperparâmetros. 
# Também levamos em consideração o viés nos dados.

# Fonte dos dados: https://sorry.vse.cz/~berka/ (dados anônimos)

In [4]:
# Pacotes
install.packages("gains")
install.packages("pROC")
install.packages("ROSE")
install.packages("mice")
library(dplyr)
library(caret)
library(gains)
library(pROC)
library(ROCR)
library(ROSE)
library(e1071)
library(mice)

package 'gains' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\cassi\AppData\Local\Temp\RtmpE1OfuT\downloaded_packages
package 'pROC' successfully unpacked and MD5 sums checked


"restored 'pROC'"


The downloaded binary packages are in
	C:\Users\cassi\AppData\Local\Temp\RtmpE1OfuT\downloaded_packages
package 'ROSE' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\cassi\AppData\Local\Temp\RtmpE1OfuT\downloaded_packages
package 'mice' successfully unpacked and MD5 sums checked


"restored 'mice'"


The downloaded binary packages are in
	C:\Users\cassi\AppData\Local\Temp\RtmpE1OfuT\downloaded_packages


"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'caret' was built under R version 3.6.3"Loading required package: lattice
Loading required package: ggplot2
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
"package 'pROC' was built under R version 3.6.3"Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var

"package 'ROSE' was built under R version 3.6.3"Loaded ROSE 0.0-3

"package 'mice' was built under R version 3.6.3"
Attaching package: 'mice'

The following objects are masked from 'package:base':

    cbind, rbind



In [6]:
# Load the customer credit-card dataset.
# stringsAsFactors is passed explicitly because its default changed from TRUE
# to FALSE in R 4.0.0; later cells pick columns by is.factor(), so text columns
# such as birthmonth must be read as factors regardless of the R version.
dataset_clientes <- read.csv("dados/cartoes_clientes.csv", stringsAsFactors = TRUE)
head(dataset_clientes)

custid,townsize,gender,age,birthmonth,ed,jobcat,union,employ,retire,...,ownpda,ownpc,ownipod,owngame,ownfax,news,response_01,response_02,response_03,Customer_cat
3964-QJWTRG-NPN,2,1,20,September,15,1,1,0,0,...,0,0,1,1,0,0,0,1,0,low_spend_cust
0648-AIPJSP-UVM,5,0,22,May,17,2,0,0,0,...,1,1,1,1,1,1,0,0,0,low_spend_cust
5195-TLUDJE-HVO,4,1,67,June,14,2,0,16,0,...,0,0,0,0,0,1,0,0,0,low_spend_cust
4459-VLPQUH-3OL,3,0,23,May,16,2,0,0,0,...,0,1,1,1,0,1,1,0,0,medium_spend_cust
8158-SMTQFB-CNO,2,0,26,July,16,2,0,1,0,...,0,1,0,1,0,0,0,1,0,medium_spend_cust
9662-FUSYIM-1IV,4,0,64,August,17,3,0,22,0,...,1,0,0,0,0,0,0,1,0,medium_spend_cust


## Pré-Processamento dos Dados

In [7]:
# Drop the customer ID column: it identifies the row and is not needed as a predictor.
# Removing it by name (rather than by position with [-1]) keeps this cell
# idempotent: re-running it is a no-op instead of silently dropping a feature.
dataset_clientes$custid <- NULL
head(dataset_clientes)

townsize,gender,age,birthmonth,ed,jobcat,union,employ,retire,income,...,ownpda,ownpc,ownipod,owngame,ownfax,news,response_01,response_02,response_03,Customer_cat
2,1,20,September,15,1,1,0,0,31,...,0,0,1,1,0,0,0,1,0,low_spend_cust
5,0,22,May,17,2,0,0,0,15,...,1,1,1,1,1,1,0,0,0,low_spend_cust
4,1,67,June,14,2,0,16,0,35,...,0,0,0,0,0,1,0,0,0,low_spend_cust
3,0,23,May,16,2,0,0,0,20,...,0,1,1,1,0,1,1,0,0,medium_spend_cust
2,0,26,July,16,2,0,1,0,23,...,0,1,0,1,0,0,0,1,0,medium_spend_cust
4,0,64,August,17,3,0,22,0,107,...,1,0,0,0,0,0,0,1,0,medium_spend_cust


In [8]:
# Convert the selected columns of a data frame to factors.
#
# Args:
#   df:        data frame holding the columns to convert.
#   variables: column names (or positions) to convert; repeated entries are
#              processed only once.
#
# Returns:
#   A copy of `df` in which every selected column is a factor whose levels are
#   the text form of the original values.
#
# Raises:
#   An error naming any requested column name that is not present in `df`.
#
# NOTE: values go through paste(), so a missing value becomes the literal
# level "NA" and is no longer reported by is.na() afterwards.
to.factors <- function(df, variables){
  variables <- unique(variables)
  # Fail early with the offending names instead of the cryptic
  # "replacement has 0 rows" error raised by the assignment below
  if (is.character(variables)){
    absent <- variables[!variables %in% names(df)]
    if (length(absent) > 0){
      stop("to.factors: column(s) not found in df: ",
           paste(absent, collapse = ", "))
    }
  }
  for (variable in variables){
    df[[variable]] <- as.factor(paste(df[[variable]]))
  }
  return(df)
}

In [9]:
# Names of the categorical variables that must be converted to factors.
# Each name appears exactly once.
# NOTE(review): 'card2benefit' used to be listed twice; confirm that no other
# variable was meant to take the second slot.
categorical.vars <- c('townsize', 'jobcat', 'retire', 'hometype', 'addresscat', 
                      'cartype', 'carvalue', 'carbought', 'card2', 'gender', 'card2type', 
                      'card2benefit', 'bfast', 'internet', 'Customer_cat')

In [11]:
# Inspect the structure (column types) of the dataset
str(dataset_clientes)

'data.frame':	5000 obs. of  117 variables:
 $ townsize         : Factor w/ 6 levels "1","2","3","4",..: 2 5 4 3 2 4 5 4 3 2 ...
 $ gender           : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 2 2 1 ...
 $ age              : int  20 22 67 23 26 64 52 44 66 47 ...
 $ birthmonth       : Factor w/ 12 levels "April","August",..: 12 9 7 9 6 2 6 11 11 6 ...
 $ ed               : int  15 17 14 16 16 17 14 16 12 11 ...
 $ jobcat           : Factor w/ 6 levels "1","2","3","4",..: 1 2 2 2 2 3 1 1 1 6 ...
 $ union            : int  1 0 0 0 0 0 0 0 0 0 ...
 $ employ           : int  0 0 16 0 1 22 10 11 15 19 ...
 $ retire           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
 $ income           : int  31 15 35 20 23 107 77 97 16 84 ...
 $ debtinc          : num  11.1 18.6 9.9 5.7 1.7 5.6 1.9 14.4 2.6 4.1 ...
 $ creddebt         : num  1.2 1.22 0.93 0.02 0.21 1.06 0.5 5.95 0.1 1.77 ...
 $ lncreddebt       : num  0.18 0.2 -0.07 -3.78 -1.54 0.06 -0.69 1.78 -2.28 0.57 ...
 $ othdebt          

In [12]:
# Convert every categorical variable to a factor, then inspect the resulting types
dataset_clientes <- to.factors(dataset_clientes, categorical.vars)
str(dataset_clientes)

'data.frame':	5000 obs. of  117 variables:
 $ townsize         : Factor w/ 6 levels "1","2","3","4",..: 2 5 4 3 2 4 5 4 3 2 ...
 $ gender           : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 2 2 1 ...
 $ age              : int  20 22 67 23 26 64 52 44 66 47 ...
 $ birthmonth       : Factor w/ 12 levels "April","August",..: 12 9 7 9 6 2 6 11 11 6 ...
 $ ed               : int  15 17 14 16 16 17 14 16 12 11 ...
 $ jobcat           : Factor w/ 6 levels "1","2","3","4",..: 1 2 2 2 2 3 1 1 1 6 ...
 $ union            : int  1 0 0 0 0 0 0 0 0 0 ...
 $ employ           : int  0 0 16 0 1 22 10 11 15 19 ...
 $ retire           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
 $ income           : int  31 15 35 20 23 107 77 97 16 84 ...
 $ debtinc          : num  11.1 18.6 9.9 5.7 1.7 5.6 1.9 14.4 2.6 4.1 ...
 $ creddebt         : num  1.2 1.22 0.93 0.02 0.21 1.06 0.5 5.95 0.1 1.77 ...
 $ lncreddebt       : num  0.18 0.2 -0.07 -3.78 -1.54 0.06 -0.69 1.78 -2.28 0.57 ...
 $ othdebt          

In [13]:
# Preview the first rows after the factor conversion
head(dataset_clientes)

townsize,gender,age,birthmonth,ed,jobcat,union,employ,retire,income,...,ownpda,ownpc,ownipod,owngame,ownfax,news,response_01,response_02,response_03,Customer_cat
2,1,20,September,15,1,1,0,0,31,...,0,0,1,1,0,0,0,1,0,low_spend_cust
5,0,22,May,17,2,0,0,0,15,...,1,1,1,1,1,1,0,0,0,low_spend_cust
4,1,67,June,14,2,0,16,0,35,...,0,0,0,0,0,1,0,0,0,low_spend_cust
3,0,23,May,16,2,0,0,0,20,...,0,1,1,1,0,1,1,0,0,medium_spend_cust
2,0,26,July,16,2,0,1,0,23,...,0,1,0,1,0,0,0,1,0,medium_spend_cust
4,0,64,August,17,3,0,22,0,107,...,1,0,0,0,0,0,0,1,0,medium_spend_cust


In [14]:
# Spot-check one converted column
str(dataset_clientes$gender)

 Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 2 2 1 ...


In [15]:
# Missing-value imputation using the PMM (Predictive Mean Matching) method

# Number of missing values in each column
sapply(dataset_clientes, function(coluna) {
  sum(is.na(coluna))
})
# Number of missing values in the whole data frame
sum(is.na(dataset_clientes))

In [16]:
# A correspondência média preditiva (PMM) é uma maneira atraente de fazer imputação múltipla para dados 
# ausentes, especialmente para imputar variáveis quantitativas que não são normalmente distribuídas. 

# Variável dummy
# Variável sexo = 0 ou 1
# sexo_M = 1
# sexo_F = 0

# Comparado com métodos padrão baseados em regressão linear e distribuição normal, o PMM produz valores 
# imputados que são muito mais parecidos com valores reais. Se a variável original estiver inclinada, os 
# valores imputados também serão inclinados. Se a variável original estiver delimitada por 0 e 100, os 
# valores imputados também serão delimitados por 0 e 100. E se os valores reais forem discretos 
# (como número de filhos), os valores imputados também serão discretos. 

In [17]:
# Find which columns are factors so they can be left out of the imputation
fac_col <- 0L
# Names of all factor-typed columns, in data-frame order
facnames <- names(dataset_clientes)[vapply(dataset_clientes, is.factor, logical(1))]
# Write position used while filling fac_col
k <- 1

In [18]:
# Resolve every factor column name to its exact position in the data frame.
# match() compares whole names. grep() treated each name as a pattern, so a
# name like "card2" also hit "card2type" and "card2benefit", which raised a
# replacement-length warning and only worked because of the column order.
# This also removes the hard-coded limit of 16 factor columns.
fac_col <- match(facnames, colnames(dataset_clientes))

"número de itens para para substituir não é um múltiplo do comprimento do substituto"

In [19]:
# Positions of the factor-typed columns
fac_col

In [20]:
# Imputation

# Preview the dataset before slicing out the factor columns
head(dataset_clientes)

townsize,gender,age,birthmonth,ed,jobcat,union,employ,retire,income,...,ownpda,ownpc,ownipod,owngame,ownfax,news,response_01,response_02,response_03,Customer_cat
2,1,20,September,15,1,1,0,0,31,...,0,0,1,1,0,0,0,1,0,low_spend_cust
5,0,22,May,17,2,0,0,0,15,...,1,1,1,1,1,1,0,0,0,low_spend_cust
4,1,67,June,14,2,0,16,0,35,...,0,0,0,0,0,1,0,0,0,low_spend_cust
3,0,23,May,16,2,0,0,0,20,...,0,1,1,1,0,1,1,0,0,medium_spend_cust
2,0,26,July,16,2,0,1,0,23,...,0,1,0,1,0,0,0,1,0,medium_spend_cust
4,0,64,August,17,3,0,22,0,107,...,1,0,0,0,0,0,0,1,0,medium_spend_cust


In [21]:
# Preview only the non-factor columns (the ones that will be imputed)
head(dataset_clientes[, -fac_col])

age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,owncd,ownpda,ownpc,ownipod,owngame,ownfax,news,response_01,response_02,response_03
20,15,1,0,31,11.1,1.2,0.18,2.24,0.81,...,0,0,0,1,1,0,0,0,1,0
22,17,0,0,15,18.6,1.22,0.2,1.57,0.45,...,1,1,1,1,1,1,1,0,0,0
67,14,0,16,35,9.9,0.93,-0.07,2.54,0.93,...,1,0,0,0,0,0,1,0,0,0
23,16,0,0,20,5.7,0.02,-3.78,1.12,0.11,...,1,0,1,1,1,0,1,1,0,0
26,16,0,1,23,1.7,0.21,-1.54,0.18,-1.74,...,1,0,1,0,1,0,0,0,1,0
64,17,0,22,107,5.6,1.06,0.06,4.93,1.6,...,1,1,0,0,0,0,0,0,1,0


In [22]:
# Fit the imputation model on the non-factor columns only.
#   m      = 1     -> a single imputed dataset
#   maxit  = 50    -> number of iterations
#   method = 'pmm' -> predictive mean matching (spelled out in full instead of
#                     relying on partial argument matching of `meth`)
#   seed   = 500   -> makes the imputation reproducible
regra_imputacao <- mice(dataset_clientes[, -fac_col],
                        m = 1,
                        maxit = 50,
                        method = 'pmm',
                        seed = 500)


 iter imp variable
  1   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  2   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  3   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  4   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  5   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  6   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  7   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmon  cardten  lncardten  lnwiremon  lnwireten
  8   1  lncreddebt  lnothdebt  commutetime  longten  lntollmon  lntollten  lncardmo

"Number of logged events: 550"

In [24]:
# Apply the imputation model: extract the first (and only) completed dataset.
# The call is namespace-qualified so it cannot be shadowed by a `complete`
# exported by another attached package.
total_data <- mice::complete(regra_imputacao, 1)
head(total_data)

age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,owncd,ownpda,ownpc,ownipod,owngame,ownfax,news,response_01,response_02,response_03
20,15,1,0,31,11.1,1.2,0.18,2.24,0.81,...,0,0,0,1,1,0,0,0,1,0
22,17,0,0,15,18.6,1.22,0.2,1.57,0.45,...,1,1,1,1,1,1,1,0,0,0
67,14,0,16,35,9.9,0.93,-0.07,2.54,0.93,...,1,0,0,0,0,0,1,0,0,0
23,16,0,0,20,5.7,0.02,-3.78,1.12,0.11,...,1,0,1,1,1,0,1,1,0,0
26,16,0,1,23,1.7,0.21,-1.54,0.18,-1.74,...,1,0,1,0,1,0,0,0,1,0
64,17,0,22,107,5.6,1.06,0.06,4.93,1.6,...,1,1,0,0,0,0,0,0,1,0


In [25]:
# Re-attach the factor columns to the right of the imputed numeric columns
colunas_fator <- dataset_clientes[, fac_col]
dataset_clientes_final <- cbind(total_data, colunas_fator)
head(dataset_clientes_final)

age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,addresscat,cartype,carvalue,carbought,card2,card2type,card2benefit,bfast,internet,Customer_cat
20,15,1,0,31,11.1,1.2,0.18,2.24,0.81,...,1,0,14.3,0,5,3,1,3,0,low_spend_cust
22,17,0,0,15,18.6,1.22,0.2,1.57,0.45,...,1,1,6.8,0,4,1,3,1,4,low_spend_cust
67,14,0,16,35,9.9,0.93,-0.07,2.54,0.93,...,5,1,18.8,0,4,1,3,3,0,low_spend_cust
23,16,0,0,20,5.7,0.02,-3.78,1.12,0.11,...,2,1,8.7,0,3,2,4,1,2,medium_spend_cust
26,16,0,1,23,1.7,0.21,-1.54,0.18,-1.74,...,2,1,10.6,0,1,3,2,3,3,medium_spend_cust
64,17,0,22,107,5.6,1.06,0.06,4.93,1.6,...,5,-1,-1.0,-1,3,3,2,3,0,medium_spend_cust


In [26]:
# Dimensions (rows, columns) of the re-assembled dataset
dim(dataset_clientes_final)

In [27]:
# Column types of the re-assembled dataset
str(dataset_clientes_final)

'data.frame':	5000 obs. of  117 variables:
 $ age              : int  20 22 67 23 26 64 52 44 66 47 ...
 $ ed               : int  15 17 14 16 16 17 14 16 12 11 ...
 $ union            : int  1 0 0 0 0 0 0 0 0 0 ...
 $ employ           : int  0 0 16 0 1 22 10 11 15 19 ...
 $ income           : int  31 15 35 20 23 107 77 97 16 84 ...
 $ debtinc          : num  11.1 18.6 9.9 5.7 1.7 5.6 1.9 14.4 2.6 4.1 ...
 $ creddebt         : num  1.2 1.22 0.93 0.02 0.21 1.06 0.5 5.95 0.1 1.77 ...
 $ lncreddebt       : num  0.18 0.2 -0.07 -3.78 -1.54 0.06 -0.69 1.78 -2.28 0.57 ...
 $ othdebt          : num  2.24 1.57 2.54 1.12 0.18 4.93 0.96 8.02 0.31 1.67 ...
 $ lnothdebt        : num  0.81 0.45 0.93 0.11 -1.74 1.6 -0.04 2.08 -1.16 0.52 ...
 $ default          : int  1 1 0 1 0 0 0 0 0 0 ...
 $ jobsat           : int  1 1 4 2 1 2 2 5 2 4 ...
 $ marital          : int  0 0 1 1 1 0 0 1 0 0 ...
 $ spoused          : int  -1 -1 13 18 13 -1 -1 15 -1 -1 ...
 $ spousedcat       : int  -1 -1 2 4 2 -1 -1 3 -1 

In [28]:
# Confirm that a factor column kept its type after the cbind
str(dataset_clientes_final$gender)

 Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 2 2 1 ...


In [29]:
# Missing-value check after imputation

# Per-column count in the final dataset
sapply(dataset_clientes_final, function(coluna) {
  sum(is.na(coluna))
})
# Totals: final dataset, then the dataset as it was before imputation
sum(is.na(dataset_clientes_final))
sum(is.na(dataset_clientes))

In [30]:
# Make sure the target variable is stored as a factor
dataset_clientes_final[["Customer_cat"]] <- as.factor(dataset_clientes_final[["Customer_cat"]])
str(dataset_clientes_final[["Customer_cat"]])

 Factor w/ 3 levels "high_spend_cust",..: 2 2 2 3 3 3 3 3 3 2 ...


In [31]:
# Randomly split the dataset: 80% for training data and 20% for test data

# Seed so the same results can be reproduced
set.seed(100)

In [33]:
# Row indices of the training partition: 80% of the rows, drawn without replacement.
# The seed is set inside this cell so that re-running the cell on its own
# reproduces the same split instead of drawing a new one.
set.seed(100)
indice_divide_dados <- sample(x = nrow(dataset_clientes_final),
                              # floor() makes the truncation to a whole number of rows explicit
                              size = floor(0.8 * nrow(dataset_clientes_final)),
                              replace = FALSE)
head(indice_divide_dados)

In [34]:
# Apply the index: sampled rows form the training set, all remaining rows form the test set
dados_treino <- dataset_clientes_final[indice_divide_dados,]
dados_teste <- dataset_clientes_final[-indice_divide_dados,]

# Preview both partitions
head(dados_treino)
head(dados_teste)

Unnamed: 0,age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,addresscat,cartype,carvalue,carbought,card2,card2type,card2benefit,bfast,internet,Customer_cat
2387,74,14,0,13,10,19.4,0.45,-0.79,1.49,0.4,...,3,1,6.8,1,1,2,3,2,0,low_spend_cust
4977,53,12,0,16,73,11.7,4.59,1.52,3.95,1.37,...,4,0,41.9,0,4,2,1,3,0,low_spend_cust
2580,58,15,0,19,138,8.4,0.59,-0.53,11.0,2.4,...,5,1,39.0,1,3,2,1,2,2,medium_spend_cust
3067,77,15,1,25,26,11.9,0.97,-0.04,2.13,0.76,...,5,0,15.7,0,2,1,4,2,0,medium_spend_cust
10,47,11,0,19,84,4.1,1.77,0.57,1.67,0.52,...,4,0,41.0,1,2,2,1,3,0,low_spend_cust
922,47,17,0,4,73,1.5,0.3,-1.19,0.79,-0.23,...,4,1,42.5,0,1,4,2,2,3,medium_spend_cust


Unnamed: 0,age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,addresscat,cartype,carvalue,carbought,card2,card2type,card2benefit,bfast,internet,Customer_cat
2,22,17,0,0,15,18.6,1.22,0.2,1.57,0.45,...,1,1,6.8,0,4,1,3,1,4,low_spend_cust
3,67,14,0,16,35,9.9,0.93,-0.07,2.54,0.93,...,5,1,18.8,0,4,1,3,3,0,low_spend_cust
6,64,17,0,22,107,5.6,1.06,0.06,4.93,1.6,...,5,-1,-1.0,-1,3,3,2,3,0,medium_spend_cust
8,44,16,0,11,97,14.4,5.95,1.78,8.02,2.08,...,4,0,55.5,0,3,1,4,3,0,medium_spend_cust
11,59,19,0,8,47,8.6,1.36,0.31,2.68,0.99,...,3,0,28.0,0,1,1,4,3,3,medium_spend_cust
15,72,20,1,27,17,9.8,1.28,0.24,0.39,-0.94,...,3,1,9.3,0,4,4,3,3,0,medium_spend_cust


In [35]:
# Class balance of the target variable in the training set, as percentages
100 * prop.table(table(dados_treino$Customer_cat))


  high_spend_cust    low_spend_cust medium_spend_cust 
             2.35             29.75             67.90 

In [36]:
# Podemos ver que os dados apresentam um desequilíbrio alto com:
# 2% high_spend_cust, 30% low_spend_cust enquanto a maioria de 68% é medium_spend_cust
# Vamos balancear a classe usando Oversampling com SMOTE.

# Balanceamento de Classe com SMOTE
# Oversampling x Undersampling

# Seed for the random sampling done during class balancing
set.seed(301)

In [37]:
# Pacote
install.packages("DMwR")
library(DMwR)

package 'DMwR' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\cassi\AppData\Local\Temp\RtmpE1OfuT\downloaded_packages


"package 'DMwR' was built under R version 3.6.3"Loading required package: grid
Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 


In [38]:
# SMOTE - Synthetic Minority Oversampling Technique.
# The seed is set in the same cell as the sampling call so that re-running this
# cell alone reproduces the same balanced training set.
#   perc.over  = 3000 -> 30 synthetic cases for each original minority-class case
#   perc.under = 200  -> 2 majority-class cases kept for each synthetic case
# NOTE(review): only the rarest class (high_spend_cust) appears to be
# oversampled; the other two classes are sampled together as the "majority".
# Confirm this is the intended treatment for a 3-class target.
set.seed(301)
dados_treino_balanceados <- SMOTE(Customer_cat ~ ., dados_treino, perc.over = 3000, perc.under = 200)

In [39]:
# Class balance of the target variable after SMOTE, as percentages
100 * prop.table(table(dados_treino_balanceados$Customer_cat))


  high_spend_cust    low_spend_cust medium_spend_cust 
         34.06593          19.90881          46.02525 

In [40]:
# Saving the datasets after pre-processing: first confirm the class of each object
class(dados_treino_balanceados)
class(dados_teste)

In [41]:
# Persist the balanced training set and the test set as CSV files.
# No row.names argument is passed, so write.csv uses its default and each file
# gets a leading unnamed column holding the row names.
# NOTE(review): confirm whether downstream readers expect that extra column.
write.csv(dados_treino_balanceados, "dados/dados_treino_balanceados.csv")
write.csv(dados_teste, "dados/dados_teste.csv")

In [42]:
# Dimensions (rows, columns) of the saved training and test sets
dim(dados_treino_balanceados)
dim(dados_teste)

In [43]:
# Preview the first rows of the saved training and test sets
head(dados_treino_balanceados)
head(dados_teste)

Unnamed: 0,age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,addresscat,cartype,carvalue,carbought,card2,card2type,card2benefit,bfast,internet,Customer_cat
761,18,13,0,0,19,13.0,1.59,0.46,0.88,-0.13,...,1,0,9.0,0,3,4,4,3,0,medium_spend_cust
2852,20,15,0,0,42,5.3,0.67,-0.41,1.56,0.44,...,1,0,19.4,1,2,3,1,1,0,medium_spend_cust
4869,19,14,0,0,18,1.2,0.04,-3.15,0.17,-1.75,...,1,0,9.4,1,1,3,1,3,3,medium_spend_cust
1393,35,15,0,7,30,2.7,0.25,-1.37,0.56,-0.59,...,2,1,11.0,1,2,1,1,3,4,medium_spend_cust
3877,43,17,0,8,115,19.8,8.13,2.1,14.64,2.68,...,4,1,50.8,0,3,2,4,2,0,low_spend_cust
2609,48,17,0,15,80,3.4,1.52,0.42,1.2,0.19,...,4,0,48.0,0,4,4,1,2,0,low_spend_cust


Unnamed: 0,age,ed,union,employ,income,debtinc,creddebt,lncreddebt,othdebt,lnothdebt,...,addresscat,cartype,carvalue,carbought,card2,card2type,card2benefit,bfast,internet,Customer_cat
2,22,17,0,0,15,18.6,1.22,0.2,1.57,0.45,...,1,1,6.8,0,4,1,3,1,4,low_spend_cust
3,67,14,0,16,35,9.9,0.93,-0.07,2.54,0.93,...,5,1,18.8,0,4,1,3,3,0,low_spend_cust
6,64,17,0,22,107,5.6,1.06,0.06,4.93,1.6,...,5,-1,-1.0,-1,3,3,2,3,0,medium_spend_cust
8,44,16,0,11,97,14.4,5.95,1.78,8.02,2.08,...,4,0,55.5,0,3,1,4,3,0,medium_spend_cust
11,59,19,0,8,47,8.6,1.36,0.31,2.68,0.99,...,3,0,28.0,0,1,1,4,3,3,medium_spend_cust
15,72,20,1,27,17,9.8,1.28,0.24,0.39,-0.94,...,3,1,9.3,0,4,4,3,3,0,medium_spend_cust


In [44]:
# Final missing-value check: totals for both sets, then per-column counts for the test set
sum(is.na(dados_treino_balanceados))
sum(is.na(dados_teste))
sapply(dados_teste, function(coluna) {
  sum(is.na(coluna))
})