# Data Preprocessing Template

## Importing the dataset

In [29]:
dataset <- read.csv("../data/Data.csv")
dataset

Country,Age,Salary,Purchased
<chr>,<int>,<int>,<chr>
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


## Taking care of missing data

In [30]:
dataset$Age = ifelse(is.na(dataset$Age),
                     ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
                     dataset$Age)
dataset$Salary = ifelse(is.na(dataset$Salary),
                        ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)),
                        dataset$Salary)
dataset

Country,Age,Salary,Purchased
<chr>,<dbl>,<dbl>,<chr>
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,63777.78,Yes
France,35.0,58000.0,Yes
Spain,38.77778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


## Encoding categorical data

In [31]:
dataset$Country <- factor(dataset$Country,
                         levels = c('France', 'Spain', 'Germany'),
                         labels = c(1, 2, 3))
dataset$Purchased <- factor(dataset$Purchased,
                           levels = c('No', 'Yes'),
                           labels = c(0, 1))
dataset

Country,Age,Salary,Purchased
<fct>,<dbl>,<dbl>,<fct>
1,44.0,72000.0,0
2,27.0,48000.0,1
3,30.0,54000.0,0
2,38.0,61000.0,0
3,40.0,63777.78,1
1,35.0,58000.0,1
2,38.77778,52000.0,0
1,48.0,79000.0,1
3,50.0,83000.0,0
1,37.0,67000.0,1


## Splitting the dataset into the Training set and Test set

In [32]:
# install.packages("caTools")
library(caTools)
set.seed(123)
# change the dependent variable accordingly to the dataset
split <- sample.split(dataset$Purchased, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
training_set
test_set

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<fct>
1,1,44.0,72000.0,0
2,2,27.0,48000.0,1
3,3,30.0,54000.0,0
4,2,38.0,61000.0,0
5,3,40.0,63777.78,1
7,2,38.77778,52000.0,0
8,1,48.0,79000.0,1
10,1,37.0,67000.0,1


Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<fct>
6,1,35,58000,1
9,3,50,83000,0


## Feature Scaling


In [38]:
training_set$Age <- scale(training_set$Age)
training_set$Salary <- scale(training_set$Salary)
test_set$Salary <- scale(test_set$Salary)
test_set$Age <- scale(test_set$Age)
training_set
test_set

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,"<dbl[,1]>","<dbl[,1]>",<fct>
1,1,0.90101716,0.9392746,0
2,2,-1.58847494,-1.337116,1
3,3,-1.14915281,-0.7680183,0
4,2,0.02237289,-0.1040711,0
5,3,0.31525431,0.1594,1
7,2,0.13627122,-0.9577176,0
8,1,1.48678,1.6032218,1
10,1,-0.12406783,0.4650265,1


Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,"<dbl[,1]>","<dbl[,1]>",<fct>
6,1,-0.7071068,-0.7071068,1
9,3,0.7071068,0.7071068,0
