# COMMON DATA PREPROCESSING `WORKFLOWS`
---

```julia
VERSION # -> v"1.11.1"  (versioninfo() only prints a report and returns nothing)
```

In [1]:
# Work from the directory that contains this file, so the relative paths used
# later (`..` for the project, `../../Datasets/Data.csv` for the data) resolve.
cd(@__DIR__)

In [2]:
# Activate the project environment located one directory up.
using Pkg
Pkg.activate("..")

[32m[1m  Activating[22m[39m project at `~/Work/git-repos/AI-ML-DL/jlai/Codes/Julia/Part-2`


Import libraries

In [3]:
using CSV, DataFrames
using MLJ

Import data from CSV file

In [4]:
# Read the raw dataset into a DataFrame.
df = CSV.read("../../Datasets/Data.csv", DataFrame)

# Quick inspection: per-column summary, then (rows, columns).
describe(df)
size(df)

# Column names with their scientific and machine types (the displayed result).
schema(df)

┌───────────┬───────────────────────┬───────────────────────┐
│[22m names     [0m│[22m scitypes              [0m│[22m types                 [0m│
├───────────┼───────────────────────┼───────────────────────┤
│ Country   │ Textual               │ String7               │
│ Age       │ Union{Missing, Count} │ Union{Missing, Int64} │
│ Salary    │ Union{Missing, Count} │ Union{Missing, Int64} │
│ Purchased │ Textual               │ String3               │
└───────────┴───────────────────────┴───────────────────────┘


Scientific type coercion

In [5]:
# Tell MLJ how each column should be interpreted. Age and Salary contain
# missing values, so they end up as Union{Missing, Continuous} (see the Info
# messages emitted by `coerce`).
df_coerced = coerce(
    df,
    :Country => Multiclass, :Purchased => Multiclass,  # categorical columns
    :Age => Continuous, :Salary => Continuous,         # numeric columns
)
schema(df_coerced)

[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mTrying to coerce from `Union{Missing, Int64}` to `Continuous`.
[36m[1m└ [22m[39mCoerced to `Union{Missing,Continuous}` instead.
[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mTrying to coerce from `Union{Missing, Int64}` to `Continuous`.
[36m[1m└ [22m[39mCoerced to `Union{Missing,Continuous}` instead.


┌───────────┬────────────────────────────┬───────────────────────────────────┐
│[22m names     [0m│[22m scitypes                   [0m│[22m types                             [0m│
├───────────┼────────────────────────────┼───────────────────────────────────┤
│ Country   │ Multiclass{3}              │ CategoricalValue{String7, UInt32} │
│ Age       │ Union{Missing, Continuous} │ Union{Missing, Float64}           │
│ Salary    │ Union{Missing, Continuous} │ Union{Missing, Float64}           │
│ Purchased │ Multiclass{2}              │ CategoricalValue{String3, UInt32} │
└───────────┴────────────────────────────┴───────────────────────────────────┘


Missing values imputation

In [6]:
# Fit a FillImputer on the coerced table, then use it to fill the missing
# entries; afterwards Age and Salary no longer admit `Missing`.
imputer = FillImputer()
mach = fit!(machine(imputer, df_coerced))
df_imputed = MLJ.transform(mach, df_coerced)
schema(df_imputed)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(FillImputer(features = Symbol[], …), …).


┌───────────┬───────────────┬───────────────────────────────────┐
│[22m names     [0m│[22m scitypes      [0m│[22m types                             [0m│
├───────────┼───────────────┼───────────────────────────────────┤
│ Country   │ Multiclass{3} │ CategoricalValue{String7, UInt32} │
│ Age       │ Continuous    │ Float64                           │
│ Salary    │ Continuous    │ Float64                           │
│ Purchased │ Multiclass{2} │ CategoricalValue{String3, UInt32} │
└───────────┴───────────────┴───────────────────────────────────┘


In [7]:
#= CAN BE WRITTEN THIS WAY
df_imputed = machine(imputer, df_coerced) |> fit! |> MLJ.transform
=#

Features & target selection

In [8]:
# Features: the three predictor columns. Country is still a single categorical
# column here, not yet one column per level of levels(df.Country)
# (:Country__France, :Country__Germany, :Country__Spain).
X_imputed = df_imputed[:, [:Country, :Age, :Salary]]

# Target: kept as a one-column table rather than a bare vector.
y_imputed = df_imputed[:, [:Purchased]]

Row,Purchased
Unnamed: 0_level_1,Cat…
1,No
2,Yes
3,No
4,No
5,Yes
6,Yes
7,No
8,Yes
9,No
10,Yes


Feature encoding

In [9]:
# Separate encoders for features and target; they differ only in `drop_last`.
# The target encoder drops the last level's indicator column, leaving a single
# column for the two-level Purchased variable.
encoder_X = ContinuousEncoder(; drop_last=false)
encoder_y = ContinuousEncoder(; drop_last=true)

ContinuousEncoder(
  drop_last = true, 
  one_hot_ordered_factors = false)

In [10]:
#=
mach_X = machine(encoder_X, X_imputed) |> fit!
mach_y = machine(encoder_y, y_imputed) |> fit!
X = MLJ.transform(mach_X, X_imputed);
y = MLJ.transform(mach_y, y_imputed);
=#

In [11]:
# Fit each encoder on its own table and transform that same table; calling
# `MLJ.transform` on the machine alone applies it to the data it was bound to.
X = MLJ.transform(fit!(machine(encoder_X, X_imputed)))
y = MLJ.transform(fit!(machine(encoder_y, y_imputed)))
schema(X)
schema(y)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(ContinuousEncoder(drop_last = false, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(ContinuousEncoder(drop_last = true, …), …).


┌───────────────┬────────────┬─────────┐
│[22m names         [0m│[22m scitypes   [0m│[22m types   [0m│
├───────────────┼────────────┼─────────┤
│ Purchased__No │ Continuous │ Float64 │
└───────────────┴────────────┴─────────┘


Split data into train & test sets

In [12]:
# 80/20 split applied to X and y together (multi=true), with a fixed rng so
# the split is reproducible.
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8; multi=true, rng=123);

Standardizer

In [13]:
# A single unfitted Standardizer model; the following cells bind it to one
# column at a time (Age, then Salary), each in its own machine.
sc_ = Standardizer()

Standardizer(
  features = Symbol[], 
  ignore = false, 
  ordered_factor = false, 
  count = false)

In [14]:
# Learn the scaling from the training Age column only, then apply that same
# fitted machine to both the training and the test column.
sc_age = fit!(machine(sc_, Xtrain[!, :Age]))
Xtrain[!, :Age] = MLJ.transform(sc_age, Xtrain[!, :Age])
Xtest[!, :Age] = MLJ.transform(sc_age, Xtest[!, :Age])

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(Standardizer(features = Symbol[], …), …).


2-element Vector{Float64}:
 -2.0564091557255306
 -2.604784930585672

In [15]:
# Learn the scaling from the training Salary column only, then apply that same
# fitted machine to both the training and the test column.
sc_salary = fit!(machine(sc_, Xtrain[!, :Salary]))
Xtrain[!, :Salary] = MLJ.transform(sc_salary, Xtrain[!, :Salary])
Xtest[!, :Salary] = MLJ.transform(sc_salary, Xtest[!, :Salary])

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(Standardizer(features = Symbol[], …), …).


2-element Vector{Float64}:
 -1.1795791024447642
 -1.7401711511313849