## Sample Dataset

In [1]:
import pandas as pd

sensus = {
    'tinggi': [158, 170, 183, 191, 155, 163, 180, 158, 178],
    'jk': ['pria', 'pria', 'pria', 'pria', 'wanita','wanita','wanita','wanita','wanita'],
    'berat': [64, 86, 84, 80, 49, 59, 67, 54, 67],
    
}

sensus_df = pd.DataFrame(sensus)
sensus_df

Unnamed: 0,tinggi,jk,berat
0,158,pria,64
1,170,pria,86
2,183,pria,84
3,191,pria,80
4,155,wanita,49
5,163,wanita,59
6,180,wanita,67
7,158,wanita,54
8,178,wanita,67


## Features & target

In [2]:
import numpy as np

X_train = np.array(sensus_df[['tinggi','jk']])
y_train = np.array(sensus_df['berat'])

print(f'X_train:\n {X_train}\n')
print(f'y_train: {y_train}')

X_train:
 [[158 'pria']
 [170 'pria']
 [183 'pria']
 [191 'pria']
 [155 'wanita']
 [163 'wanita']
 [180 'wanita']
 [158 'wanita']
 [178 'wanita']]

y_train: [64 86 84 80 49 59 67 54 67]


## Preprocess Dataset: Konversi Label menjadi Numerik Biner

In [4]:
X_train_transposed = np.transpose(X_train)

print(f'X_train:\n{X_train}\n')
print(f'X_train_transposed:\n{X_train_transposed}')

X_train:
[[158 'pria']
 [170 'pria']
 [183 'pria']
 [191 'pria']
 [155 'wanita']
 [163 'wanita']
 [180 'wanita']
 [158 'wanita']
 [178 'wanita']]

X_train_transposed:
[[158 170 183 191 155 163 180 158 178]
 ['pria' 'pria' 'pria' 'pria' 'wanita' 'wanita' 'wanita' 'wanita'
  'wanita']]


In [6]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
jk_binarised = lb.fit_transform(X_train_transposed[1])

print(f'jk: {X_train_transposed}')
print(f'jk_binarised\n{jk_binarised}')

jk: [[158 170 183 191 155 163 180 158 178]
 ['pria' 'pria' 'pria' 'pria' 'wanita' 'wanita' 'wanita' 'wanita'
  'wanita']]
jk_binarised
[[0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]]


In [8]:
jk_binarised = jk_binarised.flatten()
jk_binarised

array([0, 0, 0, 0, 1, 1, 1, 1, 1])

In [10]:
X_train_transposed[1] = jk_binarised
X_train = X_train_transposed.transpose()

print(f'X_train_transposed: {X_train_transposed}\n')
print(f'X_train: \n {X_train}')

X_train_transposed: [[158 170 183 191 155 163 180 158 178]
 [0 0 0 0 1 1 1 1 1]]

X_train: 
 [[158 0]
 [170 0]
 [183 0]
 [191 0]
 [155 1]
 [163 1]
 [180 1]
 [158 1]
 [178 1]]


## Training KNN Regression Model

In [12]:
from sklearn.neighbors import KNeighborsRegressor

K = 3
model = KNeighborsRegressor(n_neighbors=K)
model.fit(X_train, y_train)

## Prediksi Berat Badan

In [13]:
X_new = np.array([[155, 1]])
X_new

array([[155,   1]])

In [14]:
y_pred = model.predict(X_new)
y_pred

array([55.66666667])

## Evaluasi KNN Regression Model

In [15]:
X_test = np.array([[168, 0], [180, 0], [160, 1], [169, 1]])
y_test = np.array([65, 96, 52, 67])

print(f'X_test:\n{X_test}\n')
print(f'y_test:\n{y_test}')

X_test:
[[168   0]
 [180   0]
 [160   1]
 [169   1]]

y_test:
[65 96 52 67]


In [16]:
y_pred = model.predict(X_test)
y_pred

array([69.66666667, 72.66666667, 59.        , 70.66666667])

### Coefficient of Determination atau R^2

In [18]:
from sklearn.metrics import r2_score

r_squared = r2_score(y_test, y_pred)

print(f'R_squared: {r_squared}')

R_squared: 0.39200515796260493


### Mean Absolute Error (MAE) atau Mean Absolute Deviation (MAD)

In [21]:
from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_error(y_test, y_pred)

print(f'MAE: {MAE}')

MAE: 9.666666666666668


### Mean Squared Error (MSE) atau Mean Squared Deviation (MSD)

In [22]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test, y_pred)

print(f'MSE: {MSE}')

MSE: 157.16666666666663


## Permasalahan Scaling pada Features

In [24]:
from scipy.spatial.distance import euclidean

#Tinggi dalam milimeter
X_train = np.array([[1700, 0], [1600, 1]])
X_new = np.array([[1640, 0]])

[euclidean(X_new[0], d) for d in X_train]

[60.0, 40.01249804748511]

In [25]:
#Tinggi dalam meter
X_train = np.array([[1.7, 0], [1.6, 1]])
X_new = np.array([[1.64, 0]])

[euclidean(X_new[0], d) for d in X_train]

[0.06000000000000005, 1.0007996802557442]

## Menerapkan Standar Scaler (Standar Score atau Z-Score)

In [28]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [31]:
#Tinggi dalam milimeter
X_train = np.array([[1700, 0], [1600, 1]])
X_train_scaled = ss.fit_transform(X_train)
print(f'X_train_scaled:\n{X_train_scaled}\n')

X_new = np.array([[1640, 0]])
X_new_scaled = ss.transform(X_new)
print(f'X_new_scaled:{X_new_scaled}\n')

jarak = [euclidean(X_new_scaled[0], d) for d in X_train_scaled]
print(f'Jarak: {jarak}')

X_train_scaled:
[[ 1. -1.]
 [-1.  1.]]

X_new_scaled:[[-0.2 -1. ]]

Jarak: [1.2, 2.1540659228538015]


In [34]:
#Tinggi dalam meter
X_train = np.array([[1.7, 0], [1.6, 1]])
X_train_scaled = ss.fit_transform(X_train)
print(f'X_train_scaled:\n{X_train_scaled}\n')

X_new = np.array([[1.64, 0]])
X_new_scaled = ss.transform(X_new)
print(f'X_new_scaled:{X_new_scaled}\n')

jarak = [euclidean(X_new_scaled[0], d) for d in X_train_scaled]
print(f'Jarak: {jarak}')

X_train_scaled:
[[ 1. -1.]
 [-1.  1.]]

X_new_scaled:[[-0.2 -1. ]]

Jarak: [1.2000000000000026, 2.1540659228538006]


## Menerapkan Feature Scaling pada KNN

### Dataset

In [35]:
# Training set
X_train = np.array([[158, 1], [170, 0], [183, 0], [191, 0], [155, 1], [163, 1], [180, 1], [158, 1], [170, 1]])

y_train = np.array([64, 86, 84, 80, 49, 59, 67, 54, 67])

# Test set
X_test = np.array([[168, 0], [180, 0], [160, 1], [169, 1]])
y_test = np.array([65, 96, 52, 67])

## Features Scaling

In [36]:
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

print(f'X_train_scaled:\n{X_train_scaled}\n')
print(f'X_test_scaled:\n{X_test_scaled}\n')

X_train_scaled:
[[-0.9908706   0.70710678]
 [ 0.01869567 -1.41421356]
 [ 1.11239246 -1.41421356]
 [ 1.78543664 -1.41421356]
 [-1.24326216  0.70710678]
 [-0.57021798  0.70710678]
 [ 0.86000089  0.70710678]
 [-0.9908706   0.70710678]
 [ 0.01869567  0.70710678]]

X_test_scaled:
[[-0.14956537 -1.41421356]
 [ 0.86000089 -1.41421356]
 [-0.82260955  0.70710678]
 [-0.06543485  0.70710678]]



## Training & Evaluasi Model

In [37]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test_scaled)

MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)

print(f'MAE: {MAE}')
print(f'MSE: {MSE}')

MAE: 16.166666666666668
MSE: 463.9444444444445
