In [None]:
import numpy as np
import pandas as pd

##### import data

In [None]:
df = pd.read_csv("../data/Purchased.csv")

# Positional split of the frame: every column except the last one forms the
# feature matrix x, the last column is the target vector y.
# .to_numpy() yields a plain NumPy array (no column names, no index) instead of
# a DataFrame/Series. scikit-learn would also accept a DataFrame and convert it
# internally, but the later cells index x positionally (e.g. x[:, [1, 2]]).
x, y = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()

In [None]:
# preview the first five rows of the raw frame
df.head(5)

##### handle missing data

In [None]:
# number of missing entries per column (isna is the same method as isnull)
df.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Fill NaNs in feature columns 1 and 2 with the mean of the respective column
# (presumably the numeric Age/Salary columns -- confirm against the CSV).
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split further down, so test rows contribute to the column means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# learn the column means first, then write the imputed values back into x
imputer.fit(x[:, [1, 2]])
x[:, [1, 2]] = imputer.transform(x[:, [1, 2]])

> other methods (these work on a DataFrame only, not on the NumPy array `x`)

> `df['Age'] = df['Age'].fillna(df['Age'].mean())`

> `df['Country'] = df['Country'].fillna(df['Country'].mode()[0])`

##### encode categories

> use label encoder only on target variable

> for independent variables, use one hot encoder or ordinal encoder

> both the above encoders can be used in column transformer, but not label encoder

> the encoded columns are placed first in the output, ahead of the passthrough columns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical feature in column 0.
# - drop='first' keeps k-1 dummy variables for a feature with k categories.
# - remainder='passthrough' keeps the columns that are not encoded; the
#   default ('drop') would discard them.
# - sparse_threshold=0 forces a dense NumPy array as output. With the default
#   threshold the stacked result may come back as a SciPy sparse matrix when
#   its density is low enough, which the later cells do not handle
#   (StandardScaler cannot centre sparse input).
# More transformers can be added to the list, e.g.
# transformers=[(...), ('ss', StandardScaler(), [1, 2]), (...)]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

x = ct.fit_transform(x)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the target labels to integer class codes. The fitted encoder stays
# available as `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

##### split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

##### feature scaling


> Standardisation: values mostly fall within about [-3, 3] ; works for most of the cases <br>
> Normalisation range = [0, 1] ; works where there is a normal distribution

> scale after splitting to avoid information leakage into the test set

In [None]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

# Columns 0-1 are the one-hot dummy variables produced by the encoder; they are
# left unscaled. Only the columns from index 2 onwards are standardised.
# The scaler learns mean/std from the training rows only and the same
# statistics are then applied to both splits.
ss.fit(x_train[:, 2:])
x_train[:, 2:] = ss.transform(x_train[:, 2:])
x_test[:, 2:] = ss.transform(x_test[:, 2:])

In [None]:
# Preview the first rows of the scaled training features instead of printing
# the whole array; a bare last expression is displayed by the notebook itself.
x_train[:5]