# Data Preprocessing Tools

In [2]:
# import libraries 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 
%matplotlib inline

In [3]:
# import the data 
data = pd.read_csv('../Data/Purchase.csv')
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,Germany,,,No
1,Germany,,,Yes
2,Germany,43.0,67611.0,Yes
3,Germany,37.0,,Yes
4,Spain,55.0,,Yes


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    200 non-null    object 
 1   Age        110 non-null    float64
 2   Salary     103 non-null    float64
 3   Purchased  200 non-null    object 
dtypes: float64(2), object(2)
memory usage: 6.4+ KB


The features are `Country`, `Age`, `Salary` and the dependent variable is `Purchased`

In [23]:
# separate the features from the dependent variable and convert to NumPy array
X = data.iloc[:, :-1].values
y = data.iloc[:,-1].values

# shape 
X.shape, y.shape

((200, 3), (200,))

In [24]:
X

array([['Germany', nan, nan],
       ['Germany', nan, nan],
       ['Germany', 43.0, 67611.0],
       ['Germany', 37.0, nan],
       ['Spain', 55.0, nan],
       ['Spain', 25.0, 58058.0],
       ['Spain', 40.0, 62617.0],
       ['France', 36.0, nan],
       ['Spain', nan, nan],
       ['Spain', nan, 42394.0],
       ['Germany', nan, nan],
       ['France', nan, nan],
       ['France', 50.0, 61711.0],
       ['Germany', nan, nan],
       ['France', 59.0, 47686.0],
       ['France', nan, 41322.0],
       ['France', 33.0, 82755.0],
       ['Germany', 37.0, nan],
       ['Spain', nan, nan],
       ['Germany', 58.0, nan],
       ['Spain', 35.0, nan],
       ['Germany', 54.0, nan],
       ['Germany', 41.0, 82872.0],
       ['Germany', 35.0, 81562.0],
       ['Germany', nan, 80776.0],
       ['France', nan, nan],
       ['Spain', nan, nan],
       ['France', 57.0, 48931.0],
       ['France', 60.0, 63567.0],
       ['France', 32.0, 51600.0],
       ['France', 37.0, 67000.0],
       ['France', 

## Dealing with Missing Values

In [None]:
# looking at number of missing values 
# note that the dtype is object - first convert it to float 

numeric_arr = X[:,[1,2]].astype('float')
np.sum(np.isnan(numeric_arr)) # total missing values 

np.int64(187)

In [29]:
# import necessary libraries 
from sklearn.impute import SimpleImputer

# Note, however, that while doing ML models first divide the dataset into train and test set and the do imputation

# create a instance of SimpleImputer class
imputer = SimpleImputer(missing_values=np.nan,
                        strategy='mean')

# fit and transform on the dataset (only include the numeric columns)
X[:,[1,2]] =  imputer.fit_transform(X=numeric_arr)

In [34]:
numeric_arr = X[:,[1,2]].astype('float')
np.sum(np.isnan(numeric_arr)) # total missing values 

np.int64(0)

In [39]:
# Retrieve the values used for imputation
print("Imputation values:", imputer.statistics_)

Imputation values: [4.16636364e+01 6.46245825e+04]


## One Hot Encoding - Transforming Categorical Variables 

In [None]:
# number of unique values in the Country columns 
np.unique(X[:,0])

array(['France', 'Germany', 'Spain'], dtype=object)

In [None]:
# import necessary libraries 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# create the ColumnTransformer object 
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough'
)

# fit and transform X and convert it to a numpy array (important for ML)
X = np.array(ct.fit_transform(X=X))

In [47]:
X

array([[0.0, 1.0, 0.0, 41.663636363636364, 64624.58252427184],
       [0.0, 1.0, 0.0, 41.663636363636364, 64624.58252427184],
       [0.0, 1.0, 0.0, 43.0, 67611.0],
       [0.0, 1.0, 0.0, 37.0, 64624.58252427184],
       [0.0, 0.0, 1.0, 55.0, 64624.58252427184],
       [0.0, 0.0, 1.0, 25.0, 58058.0],
       [0.0, 0.0, 1.0, 40.0, 62617.0],
       [1.0, 0.0, 0.0, 36.0, 64624.58252427184],
       [0.0, 0.0, 1.0, 41.663636363636364, 64624.58252427184],
       [0.0, 0.0, 1.0, 41.663636363636364, 42394.0],
       [0.0, 1.0, 0.0, 41.663636363636364, 64624.58252427184],
       [1.0, 0.0, 0.0, 41.663636363636364, 64624.58252427184],
       [1.0, 0.0, 0.0, 50.0, 61711.0],
       [0.0, 1.0, 0.0, 41.663636363636364, 64624.58252427184],
       [1.0, 0.0, 0.0, 59.0, 47686.0],
       [1.0, 0.0, 0.0, 41.663636363636364, 41322.0],
       [1.0, 0.0, 0.0, 33.0, 82755.0],
       [0.0, 1.0, 0.0, 37.0, 64624.58252427184],
       [0.0, 0.0, 1.0, 41.663636363636364, 64624.58252427184],
       [0.0, 1.0, 0.0, 

In [48]:
# using LabelEncoder for the dependent variable 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

y

array([0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0])

## Splitting the set into Train and Test Set

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True,
                                                    random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 5), (40, 5), (160,), (40,))

## Feature Scaling

In [54]:
from sklearn.preprocessing import StandardScaler

# create an object of the class 
sc = StandardScaler()

# fit only on the train set and baseds on that transform train and test set
# donot apply Standardization on the dummy variables 

X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] = sc.transform(X_test[:,3:])