# Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load Data

In [2]:
# Read the CSV into a pandas DataFrame (the path is relative to the working directory).
dataset = pd.read_csv('data/udemy_data.csv')

## Inspect data

In [3]:
# Display the whole frame (only 10 rows). Note the missing Salary in row 4 and the missing Age in row 6.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Summary statistics for the numeric columns (Age, Salary).
# count is 9 rather than 10 for each because each column has one missing value.
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [5]:
# Column dtypes and non-null counts: Age and Salary each contain one null.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [6]:
# Confirm that read_csv returned a DataFrame.
type(dataset)

pandas.core.frame.DataFrame

## loc and iloc

In [7]:
# Selecting all rows and all columns with iloc still yields a DataFrame.
type(dataset.iloc[:,:])

pandas.core.frame.DataFrame

In [8]:
# Selecting a single column by position yields a Series.
type(dataset.iloc[:,0])

pandas.core.series.Series

In [9]:
# .values exposes the underlying data as a NumPy array.
type(dataset.iloc[:,:].values)

numpy.ndarray

In [10]:
# Shape of that array: (rows, columns).
dataset.iloc[:,:].values.shape

(10, 4)

In [11]:
# loc selects by label. The row slice 1:3 includes BOTH endpoint labels, so rows 1, 2 and 3 come back.
dataset.loc[1:3, 'Country']

1      Spain
2    Germany
3      Spain
Name: Country, dtype: object

In [12]:
# Three rows come back for 1:3 because loc slices by LABEL and includes both endpoints
# (labels 1, 2 and 3). iloc, by contrast, slices by integer position and excludes the end,
# like ordinary Python slicing.
dataset.loc[1:3, ['Country', 'Age']]

Unnamed: 0,Country,Age
1,Spain,27.0
2,Germany,30.0
3,Spain,38.0


In [13]:
# iloc selects by integer position with end-exclusive slices: rows 1-2 and column 1 (Age) only.
dataset.iloc[1:3, 1:2]

Unnamed: 0,Age
1,27.0
2,30.0


## Extract values in numpy array

In [14]:
# Feature matrix: every column except the last one (Country, Age, Salary) as a NumPy array.
X = dataset.iloc[:, :-1].values

In [15]:
# Target vector: the last column (Purchased) as a NumPy array.
y = dataset.iloc[:, -1].values

In [16]:
# Inspect the features: mixing strings and floats gives dtype=object, and the two NaNs are still present.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [17]:
# Inspect the target: 'Yes'/'No' strings.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

## Take care of missing data

In [18]:
from sklearn.preprocessing import Imputer # Imputation transformer for completing missing values.

In [19]:
imp = Imputer(missing_values='NaN', copy=False, axis=0, strategy='mean')

In [20]:
# Fit the imputer on columns 1 and 2 (Age, Salary) and write the filled values back into X.
# NOTE(review): the column means are computed over all rows, i.e. before the train/test split
# performed later in this notebook.
X[:, 1:3] = imp.fit_transform(X[:, 1:3])

In [21]:
# The NaNs are now replaced by the column means (Age ~38.78, Salary ~63777.78).
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Take care of (encode) categorical data

In [22]:
# Column 0 (Country) still holds strings and needs to be encoded as numbers.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [23]:
# The 'Yes'/'No' strings need to be encoded as numbers.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

In [24]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [25]:
# One encoder per column: it maps each distinct label to an integer code.
lab_X = LabelEncoder()

In [26]:
# Replace the country names in column 0 with their integer codes.
X[:, 0] = lab_X.fit_transform(X[:, 0])

In [27]:
# Country is now an integer code: France=0, Germany=1, Spain=2.
# These codes imply no ordering between countries, which is why the column is one-hot encoded further down.
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [28]:
# A separate encoder for the target labels.
lab_y = LabelEncoder()

In [29]:
# Encode the target: 'No' -> 0, 'Yes' -> 1.
y = lab_y.fit_transform(y)

In [30]:
# The target is now an integer array of 0s and 1s.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [31]:
ohe = OneHotEncoder(categorical_features=[0])

In [32]:
X = ohe.fit_transform(X).toarray()

In [33]:
# Columns 0-2 are the one-hot Country dummies, followed by Age and Salary.
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [34]:
# TODO: remove the dummy variable trap by dropping one of the one-hot Country columns (not implemented in this notebook).

## Splitting data into training and testing sets

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Feature scaling

In [37]:
# Age and Salary are not on the same scale, and many ML algorithms are sensitive to that:
# the Euclidean distance between two observations would be dominated by the Salary feature.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [38]:
# Two approaches:
# 1. Standardisation: X_stand = (x1 - mean(x))/stdev(x)
# 2. Normalization: X_norm = (x1 - min(x))/(max(x) - min(x))

In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
# Scaler for the feature matrix (standardisation, the first of the two approaches listed above).
sc_X = StandardScaler()

In [41]:
# Learn the scaling statistics from the training rows only, then apply them to those same rows.
X_train = sc_X.fit(X_train).transform(X_train)

In [42]:
# Apply the statistics learned from the training set to the test set (transform only, no re-fit).
X_test = sc_X.transform(X_test)

In [43]:
# View the scaled test rows as a DataFrame for readability.
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4
0,-1.0,1.732051,-0.57735,2.182718,2.300892
1,-1.0,-0.57735,1.732051,-2.318628,-1.79681


In [44]:
# No need to scale y: it is a binary class label (0/1), not a continuous quantity.