In [1]:
import numpy as np
import pandas as pd

##### import data

In [2]:
df = pd.read_csv("../data/Purchased.csv")

# Every column except the last is a feature; the last column is the target.
# .to_numpy() drops the column names and the index and returns a plain ndarray.
# scikit-learn would also accept the DataFrame and convert it internally, but
# the later cells index x by position, so arrays are extracted here.
x, y = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()

In [3]:
# preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


##### handle missing data

In [4]:
# count the missing entries per column (isna is the canonical name, isnull is its alias)
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of the column it sits in.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Columns 1 and 2 of x are the numeric ones (Age and Salary in df.head()).
# NOTE(review): the imputer is fit on the full dataset, before the train/test
# split further down, so test rows contribute to the imputed means.
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

> other methods (these work on a DataFrame only, so apply them to `df`, not to the NumPy array `x`)

> df['Age'] = df['Age'].fillna(df['Age'].mean())

> df['Country'] = df['Country'].fillna(df['Country'].mode()[0])

##### encode categories

> use label encoder only on target variable

> for independent variables, use one hot encoder or ordinal encoder

> both of the above encoders can be used inside a ColumnTransformer, but LabelEncoder cannot, because it only accepts a 1-D target array

> the encoded columns are placed first in the output, followed by the passthrough columns

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# drop='first' keeps k-1 dummy variables for the encoded column
# more steps can be added: transformers=[(...), ('ss', StandardScaler(), [1, 2]), (...)]
# remainder='passthrough' keeps the non-encoded columns; the default ('drop') would discard them
# sparse_threshold=0 forces a dense ndarray result; the later cells slice x and
# assign into it in place, which a SciPy sparse matrix would not support
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# column 0 (Country) is replaced by its dummy columns, placed before the passthrough columns
x = ct.fit_transform(x)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct target labels first, then map each label to its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

##### split data

In [8]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

##### feature scaling


> Standardisation rescales to mean 0 and standard deviation 1, so most values fall within [-3, 3] ; works for most of the cases <br>
> Normalisation range = [0, 1] ; works where there is a normal distribution

> scale after splitting to avoid information leakage into the test set

In [9]:
from sklearn.preprocessing import StandardScaler

# The first two columns are the one-hot dummies and are left untouched (they are
# already 0/1); only the numeric columns from index 2 onward are standardised.
# The scaler learns its mean / std from the training rows only, and the same
# statistics are then applied to the test rows.
ss = StandardScaler().fit(x_train[:, 2:])

x_train[:, 2:] = ss.transform(x_train[:, 2:])
x_test[:, 2:] = ss.transform(x_test[:, 2:])

In [10]:
# show the training features after scaling (dummy columns first, then the scaled numeric columns)
print(x_train)

[[0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [0.0 0.0 1.1475343068237058 1.232653363453549]
 [1.0 0.0 1.4379472069688968 1.5749910381638885]
 [0.0 0.0 -0.7401495441200351 -0.5646194287757332]]
