In [1]:
# Mengimpor perpustakaan
import numpy as np # digunakan untuk menangani angka 
import pandas as pd # digunakan untuk menangani dataset
from sklearn.impute import SimpleImputer # digunakan untuk menangani data yang hilang
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # digunakan untuk encoding data kategorikal
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split # digunakan untuk memisahkan data pelatihan dan pengujian
from sklearn.preprocessing import StandardScaler # digunakan untuk penskalaan fitur

In [2]:
# Load the raw CSV into a DataFrame.
dataset = pd.read_csv('DataPreprocessing.csv')

# Split the columns by position: everything except the last column is a
# feature, and the last column is the class label.
feature_frame = dataset.iloc[:, :-1]
label_series = dataset.iloc[:, -1]
X = feature_frame.values  # independent attributes used to predict the class
Y = label_series.values   # dependent attribute / class

In [3]:
# Display the loaded DataFrame to inspect the raw rows before any preprocessing.
dataset

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes
5,India,40.0,69600.0,Yes
6,Brazil,,62400.0,No
7,India,53.0,94800.0,Yes
8,USA,55.0,99600.0,No
9,India,42.0,80400.0,Yes


In [4]:
# Handling missing data
# Missing entries are represented as np.nan. For every column of X after the
# first one, replace each missing entry with the mean of the observed values
# in that column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# further down, so the column means also include the rows that later end up
# in the test set -- confirm this is acceptable.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [5]:
# Inspect X after imputation: the previously missing Age and Income entries
# now hold the mean of their respective columns.
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, 76533.33333333333],
       ['India', 40.0, 69600.0],
       ['Brazil', 43.77777777777778, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

In [7]:
# Encode categorical data
# Replace the values in the first column of X with integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder,
# and the OneHotEncoder applied to this column in the following cell can
# consume the string categories directly -- this step looks redundant; confirm
# before removing it.
labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

In [8]:
# Encode the class labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# One-hot encode the first column of X. The 0:1 slice keeps the input 2-D as
# the encoder expects, and .toarray() converts the encoder's output into a
# dense array.
onehotencoder = OneHotEncoder()
category_encoded = onehotencoder.fit_transform(X[:, 0:1])
X_category = category_encoded.toarray()

In [9]:
# Rebuild the feature matrix: the one-hot columns come first, followed by
# every original column except the first (which the one-hot columns replace).
# This overwrites X, so re-running this cell on its own would prepend the
# one-hot columns a second time.
X = np.hstack((X_category, X[:, 1:]))

In [10]:
# Inspect the combined feature matrix: three one-hot columns followed by the
# Age and Income columns.
X

array([[0.0, 1.0, 0.0, 49.0, 86400.0],
       [1.0, 0.0, 0.0, 32.0, 57600.0],
       [0.0, 0.0, 1.0, 35.0, 64800.0],
       [1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 40.0, 69600.0],
       [1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 53.0, 94800.0],
       [0.0, 0.0, 1.0, 55.0, 99600.0],
       [0.0, 1.0, 0.0, 42.0, 80400.0]], dtype=object)

In [11]:
# Inspect the integer-encoded class labels (in the output below, "No" became 0
# and "Yes" became 1).
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [12]:
# Split the dataset into a training set and a test set.
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# makes the split reproducible across runs.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, Y_train, Y_test = split_parts

In [13]:
# Inspect the training features produced by the split.
X_train

array([[0.0, 0.0, 1.0, 45.0, 76533.33333333333],
       [0.0, 1.0, 0.0, 42.0, 80400.0],
       [1.0, 0.0, 0.0, 32.0, 57600.0],
       [1.0, 0.0, 0.0, 43.77777777777778, 62400.0],
       [0.0, 1.0, 0.0, 53.0, 94800.0],
       [1.0, 0.0, 0.0, 43.0, 73200.0],
       [0.0, 1.0, 0.0, 49.0, 86400.0],
       [0.0, 1.0, 0.0, 40.0, 69600.0]], dtype=object)

In [14]:
# Inspect the held-out test features produced by the split.
X_test

array([[0.0, 0.0, 1.0, 35.0, 64800.0],
       [0.0, 0.0, 1.0, 55.0, 99600.0]], dtype=object)

In [15]:
# Feature scaling
# The scaler is fitted on the training rows only, and the same fitted scaler
# is then applied to both the training and the test rows.
# NOTE(review): every column is standardised, including the one-hot indicator
# columns -- confirm that scaling the indicators is intended.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [16]:
# Inspect the training features after scaling.
X_train

array([[-0.77459667, -1.        ,  2.64575131,  0.26306757,  0.12381479],
       [-0.77459667,  1.        , -0.37796447, -0.25350148,  0.46175632],
       [ 1.29099445, -1.        , -0.37796447, -1.97539832, -1.53093341],
       [ 1.29099445, -1.        , -0.37796447,  0.05261351, -1.11141978],
       [-0.77459667,  1.        , -0.37796447,  1.64058505,  1.7202972 ],
       [ 1.29099445, -1.        , -0.37796447, -0.0813118 , -0.16751412],
       [-0.77459667,  1.        , -0.37796447,  0.95182631,  0.98614835],
       [-0.77459667,  1.        , -0.37796447, -0.59788085, -0.48214934]])

In [17]:
# Inspect the test features after applying the scaler fitted on the training set.
X_test

array([[-0.77459667, -1.        ,  2.64575131, -1.45882927, -0.90166297],
       [-0.77459667, -1.        ,  2.64575131,  1.98496442,  2.13981082]])