In [10]:
using DataFrames
using Statistics
using CSV

# Importing the dataset
# `DataFrame!` has been deprecated (and later removed) in DataFrames.jl. Calling the
# regular constructor with `copycols = false` is its documented replacement and, like
# `DataFrame!`, wraps the parsed columns without copying them.
df = DataFrame(CSV.File("Data.csv"); copycols = false);

In [26]:
# Taking care of missing data: impute each numeric column with the mean of its
# observed (non-missing) values.
# `skipmissing` iterates lazily over the non-missing entries, so no filtered copy of
# the column has to be allocated just to compute the mean.
mean_age    = mean(skipmissing(df.Age));
mean_salary = mean(skipmissing(df.Salary));

# `coalesce` returns its first non-missing argument: the original value when it is
# present, the column mean otherwise.
df.Age    = coalesce.(df.Age, mean_age);
df.Salary = coalesce.(df.Salary, mean_salary);

In [27]:
using CategoricalArrays

# Encoding categorical data
# Replace each label column with its categorical version. This is the explicit form
# of the deprecated `categorical!(df, cols)` helper, which is no longer available in
# current DataFrames.jl releases.
for col in (:Country, :Purchased)
    df[!, col] = categorical(df[!, col])
end

In [28]:
# Feature matrix: every column except the two categorical ones (`Country` is encoded
# separately, `Purchased` is the target). `!` references the columns without copying
# them first (`:` would copy); `Matrix` builds a new array either way.
X = df[!, Not([ :Country, :Purchased ])] |> Matrix;
# Target vector: the `Purchased` column itself.
y = df.Purchased;

In [29]:
using MLLabelUtils
using MLDataUtils

# One-Hot encoding for Country column
onehotcountries = convertlabel(LabelEnc.OneOfK, df.Country);
# The encoding is transposed so that it lines up row-by-row with X, then placed in
# front of the existing feature columns.
X = [transpose(onehotcountries) X];

In [30]:
# Show the assembled feature matrix (one-hot country columns followed by the
# remaining feature columns).
display(X)

10×5 Array{Real,2}:
 1  0  0  44       72000
 0  1  0  27       48000
 0  0  1  30       54000
 0  1  0  38       61000
 0  0  1  40       63777.8
 1  0  0  35       58000
 0  1  0  38.7778  52000
 1  0  0  48       79000
 0  0  1  50       83000
 1  0  0  37       67000

In [31]:
# Print the target labels taken from the `Purchased` column.
show(y)

CategoricalString{UInt32}["No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes"]

In [35]:
using Random

# Splitting the dataset into the Training set and Test set
# Fixed seed so that the shuffle below is intended to be repeatable between runs.
Random.seed!(0)
# Observations are rows (`obsdim = 1`): shuffle them first, then put the first 80 %
# into the training set and the remainder into the test set.
(X_train, y_train), (X_test, y_test) = let shuffled = shuffleobs((X, y); obsdim = 1)
    splitobs(shuffled; at = 0.8, obsdim = 1)
end;

In [36]:
using MLPreprocessing

# Feature scaling: the scaler is fitted on the training rows only and then applied to
# both the training and the test set. `operate_on` lists every column index, so all
# columns of the matrix (including the one-hot ones) are passed to the scaler.
scaler = fit(
    StandardScaler,
    X_train;
    obsdim = 1,
    operate_on = collect(axes(X_train, 2)),
)

X_train, X_test = transform(X_train, scaler), transform(X_test, scaler);