# Data Preprocessing Tools

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Importing the dataset

In [2]:
# The CSV location is kept as a directory part and a file-name part so that
# either one can be changed without touching the other.
data_path = '../datasets/'
related_name = 'Data.csv'
full_path = f'{data_path}{related_name}'

# Read the raw table into a DataFrame.
dataset = pd.read_csv(full_path)

In [3]:
# Features: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].values
# Target: the last column, as a NumPy array.
y = dataset.iloc[:, -1].values

In [4]:
# Show the raw feature array: one text column followed by two numeric columns,
# with nan marking the missing entries.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Show the raw target array of 'Yes' / 'No' labels.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [6]:
# Display both shapes to check that X and y have the same number of rows.
X.shape, y.shape

((10, 3), (10,))

## Taking care of missing data

In [7]:
# Fill every NaN in the two numeric columns (indices 1 and 2) with the mean of
# its column. Column 0 holds text and is left out of the imputation.
# NOTE(review): the imputer is fit on all rows here, before the train/test
# split further down, so the test rows contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

## Encoding categorical data

### Encoding the Independent Variable

In [8]:
# One-hot encode column 0 and pass every other column through unchanged.
# The encoded indicator columns come first in the result, followed by the
# passed-through columns.
# NOTE(review): this cell overwrites X, so re-running it on its own would
# apply the encoder to the already-encoded array.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [9]:
# Print the encoded features: three one-hot indicator columns followed by the
# two numeric columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [10]:
# Convert the string class labels in y to integer codes. The fitted encoder
# is kept in `le`.
le = LabelEncoder()

y = le.fit_transform(y)

In [11]:
# Print the integer-encoded target.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [12]:


# Hold out 20% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

In [13]:
# Print the training features.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 30.0 54000.0]]


In [14]:
# Print the held-out test features.
print(X_test)

[[0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [15]:
# Print the training labels.
print(y_train)

[0 1 1 1 0 1 0 0]


In [16]:
# Print the held-out test labels.
print(y_test)

[0 1]


## Feature Scaling

In [17]:
# Standardise only the columns from index 3 onward (the numeric features).
# The one-hot indicator columns 0-2 are left untouched.
# The scaler learns its mean and standard deviation from the training rows
# only, and those same statistics are then applied to the test rows.
sc = StandardScaler().fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [18]:
# Show the training features after scaling; only the columns from index 3
# onward were rescaled.
X_train

array([[0.0, 0.0, 1.0, -0.07541774069184015, -1.0699949974060108],
       [0.0, 0.0, 1.0, -1.6352773530499043, -1.4031393857875578],
       [0.0, 1.0, 0.0, 0.08645448323210979, -0.08906985383812145],
       [1.0, 0.0, 0.0, 1.1459817670979646, 1.178729624169433],
       [0.0, 1.0, 0.0, 1.4108635880644282, 1.5118740125509802],
       [1.0, 0.0, 0.0, -0.31086824821758574, 0.1792964590247913],
       [1.0, 0.0, 0.0, 0.6162181251650372, 0.5957269445017253],
       [0.0, 1.0, 0.0, -1.2379546216002086, -0.9034228032152372]],
      dtype=object)

In [19]:
# Show the test features after applying the scaler that was fit on the
# training set.
X_test

array([[0.0, 0.0, 1.0, -0.1784273377343539, -0.3204201235475295],
       [1.0, 0.0, 0.0, -0.5757500691840495, -0.5702784148336899]],
      dtype=object)