In [1]:
%store -r dt
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = dt.copy()

In [3]:
df.head()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,1,17,3.0,49,10.0,72.2,2,2
1,2,11,2.9,49,2.9,65.0,2,1
2,1,16,2.9,49,8.5,72.2,2,1
3,1,31,2.8,49,6.4,63.0,2,1
4,1,15,3.1,49,10.5,49.0,2,1


In [4]:
df.describe()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.3796,14.255,2.7576,49.1174,7.63199,68.85492,2.0,1.2045
std,0.485312,7.923285,0.291092,0.439814,1.720461,8.90664,0.0,0.403356
min,1.0,6.0,2.0,48.0,2.9,49.0,2.0,1.0
25%,1.0,10.0,2.8,49.0,6.4,65.0,2.0,1.0
50%,1.0,13.0,2.8,49.0,7.7,68.3,2.0,1.0
75%,2.0,15.0,2.9,49.0,9.0,72.2,2.0,1.0
max,2.0,48.0,3.1,50.0,10.5,92.7,2.0,2.0


### Drop duplicate

In [5]:
print(df.shape)
df.drop_duplicates(dt, inplace=True)
df.reset_index(inplace=True)
df.drop(['index'],axis=1, inplace=True)
print(df.shape)

(10000, 8)
(7573, 8)


### Drop Outliers

In [6]:
out = df.copy()

In [7]:
def count_outs(df, col):
# Above Upper bound
    Q1 = np.percentile(df[col], 25, method='midpoint')
    Q3 = np.percentile(df[col], 75, method='midpoint')
    IQR = Q3 - Q1
    upper=Q3+1.5*IQR
    upper_array=np.array(df[col]>=upper)
    print(col)
    print("Upper Outs: ",upper,' | ' ,upper_array.sum())
    
    #Below Lower bound
    lower=Q1-1.5*IQR
    lower_array=np.array(df[col]<=lower)
    print("Lower Outs: ",lower, ' | ', lower_array.sum(), '\n')

In [475]:
count_outs(out, 'Body Weight')
count_outs(out, 'Body Length')
count_outs(out, 'Age')

Body Weight
Upper Outs:  12.0  |  0
Lower Outs:  4.0  |  509 

Body Length
Upper Outs:  83.0  |  489
Lower Outs:  54.199999999999996  |  609 

Age
Upper Outs:  25.0  |  702
Lower Outs:  1.0  |  0 



In [8]:
def drop_outs(df, col):
    Q1 = np.percentile(df[col], 25, method='midpoint')
    Q3 = np.percentile(df[col], 75, method='midpoint')
    IQR = Q3 - Q1
    upper=Q3+1.5*IQR
    lower=Q1-1.5*IQR

    upper_array = np.where(df[col]>=upper)[0]
    lower_array = np.where(df[col]<=lower)[0]
    df.drop(index=upper_array, inplace=True)
    df.drop(index=lower_array, inplace=True)
    df.reset_index(inplace=True)
    df.drop(['index'],axis=1, inplace=True)
    print('Dropping ',col, ' Outliers')

In [9]:
print(len(out))
drop_outs(out, 'Body Weight')
drop_outs(out, 'Body Length')
drop_outs(out, 'Age')
print(len(out))

7573
Dropping  Body Weight  Outliers
Dropping  Body Length  Outliers
Dropping  Age  Outliers
5465


In [10]:
print(len(out))
fill = np.array([True] * len(out))
for col in ['Age', 'Body Weight', 'Body Length']:
    zscore = abs(stats.zscore(out[col]))
    fill = (zscore < 3) & fill
out = out[fill]
print(len(out))

5465
5451


In [480]:
count_outs(out, 'Body Weight')
count_outs(out, 'Body Length')
count_outs(out, 'Age')

Body Weight
Upper Outs:  12.0  |  0
Lower Outs:  4.0  |  0 

Body Length
Upper Outs:  83.0  |  0
Lower Outs:  54.199999999999996  |  0 

Age
Upper Outs:  22.5  |  0
Lower Outs:  2.5  |  0 



In [481]:
out.shape

(5451, 8)

In [11]:
#buang data umur lebih dari 25
out = out[out['Age'] < 25]
out.shape

(5451, 8)

In [483]:
out.describe()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
count,5451.0,5451.0,5451.0,5451.0,5451.0,5451.0,5451.0,5451.0
mean,1.358099,12.405797,2.755641,49.101449,7.96197,69.24878,2.0,1.227114
std,0.479486,3.419919,0.298729,0.463116,1.269166,4.759325,0.0,0.419005
min,1.0,6.0,2.0,48.0,6.2,63.0,2.0,1.0
25%,1.0,10.0,2.8,49.0,7.0,65.0,2.0,1.0
50%,1.0,12.0,2.8,49.0,7.7,69.0,2.0,1.0
75%,2.0,15.0,3.0,49.0,9.0,72.2,2.0,1.0
max,2.0,22.0,3.1,50.0,10.5,80.0,2.0,2.0


### Normalisasi range feature dan pindah Weight dan Length ke metrik umum

In [484]:
out['Body Length'].max()

80.0

In [485]:
out['Body Length'].min()

63.0

In [486]:
out

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,1,17,3.0,49,10.0,72.2,2,2
1,1,16,2.9,49,8.5,72.2,2,1
2,2,11,2.8,49,8.5,65.0,2,2
3,2,17,2.8,49,8.0,63.0,2,1
4,2,10,2.7,49,8.4,73.5,2,2
...,...,...,...,...,...,...,...,...
5460,1,13,2.9,50,6.4,73.5,2,1
5461,1,12,2.7,49,6.2,69.0,2,1
5462,1,11,2.8,48,10.5,73.5,2,2
5463,2,12,2.8,48,7.7,63.0,2,2


In [487]:
out[out['Body Length'] * 2.54 > 193]

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
27,2,14,2.8,49,10.5,80.0,2,1
29,1,12,2.9,49,6.2,76.0,2,1
41,2,14,2.8,49,10.0,76.0,2,1
46,1,11,3.0,49,7.1,76.0,2,1
52,1,17,2.9,49,9.0,76.0,2,1
...,...,...,...,...,...,...,...,...
5438,2,14,2.8,49,8.5,80.0,2,1
5439,1,11,3.0,49,7.0,76.0,2,1
5453,1,20,2.0,49,10.0,76.0,2,1
5456,1,11,2.8,50,9.0,76.0,2,2


In [488]:
(((out['Body Length'] - np.min(out['Body Length'])) / (np.max(out['Body Length']) - np.min(out['Body Length']))) * (193 - 100) + 100).tail()

5460    157.441176
5461    132.823529
5462    157.441176
5463    100.000000
5464    193.000000
Name: Body Length, dtype: float64

In [489]:
out['Body Weight'].max()

10.5

In [490]:
out['Body Weight'].min()

6.2

In [491]:
out[['Age', 'Body Weight', 'Body Length']][out['Age'] < 20]

Unnamed: 0,Age,Body Weight,Body Length
0,17,10.0,72.2
1,16,8.5,72.2
2,11,8.5,65.0
3,17,8.0,63.0
4,10,8.4,73.5
...,...,...,...
5460,13,6.4,73.5
5461,12,6.2,69.0
5462,11,10.5,73.5
5463,12,7.7,63.0


In [492]:
(((out['Body Weight'] - np.min(out['Body Weight'])) / (np.max(out['Body Weight']) - np.min(out['Body Weight']))) * (100 - 45) + 45)

0        93.604651
1        74.418605
2        74.418605
3        68.023256
4        73.139535
           ...    
5460     47.558140
5461     45.000000
5462    100.000000
5463     64.186047
5464     64.186047
Name: Body Weight, Length: 5451, dtype: float64

In [12]:
def scal_range(df, col, min, max):

    res = (((df[col] - np.min(df[col]))) / (np.max(df[col]) - np.min(df[col]))) * (max - min) + min
    res = round(res, 1)
    
    return res


In [13]:
out['Body Length'] = scal_range(out, 'Body Length', 193, 125)
out['Body Weight'] = scal_range(out, 'Body Weight', 120, 20)

In [14]:
#feature engine
out['BMI'] = out['Body Weight'] / np.power(out['Body Length']/100, 1)

In [15]:
out['Stunting'][out['BMI'] >= 18.5][out['BMI'] < 25] = 0
out['Stunting'][out['BMI'] < 18.5][out['BMI'] >= 25] = 1
# bisa penambahan label obesitas, overwirght, underweight, ideal
# satu stunting dan 0 tidak stunting

In [16]:
out.head()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting,BMI
0,1,17,3.0,49,31.6,156.2,2,2,20.230474
1,1,16,2.9,49,66.5,156.2,2,1,42.573624
2,2,11,2.8,49,66.5,185.0,2,2,35.945946
3,2,17,2.8,49,78.1,193.0,2,1,40.466321
4,2,10,2.7,49,68.8,151.0,2,2,45.562914


### Scaling data

In [498]:
MinMaxScaler().fit_transform(out['Body Length'].values.reshape(len(out), 1))

array([[0.45882353],
       [0.45882353],
       [0.88235294],
       ...,
       [0.38235294],
       [1.        ],
       [0.        ]])

In [499]:
after_scaler = out.copy()
for col in out.columns:
    after_scaler[col] = MinMaxScaler().fit_transform(after_scaler[col].values.reshape(len(after_scaler), 1))
after_scaler.head()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting,BMI
0,0.0,0.6875,0.909091,0.5,0.116,0.458824,0.0,1.0,0.115228
1,0.0,0.625,0.818182,0.5,0.465,0.458824,0.0,0.0,0.376132
2,1.0,0.3125,0.727273,0.5,0.465,0.882353,0.0,1.0,0.29874
3,1.0,0.6875,0.727273,0.5,0.581,1.0,0.0,0.0,0.351525
4,1.0,0.25,0.636364,0.5,0.488,0.382353,0.0,1.0,0.411038


In [500]:
after_scaler.corr()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting,BMI
Gender,1.0,-0.026308,-0.04549,-0.104138,-0.022819,0.06082,,-0.094368,-0.037955
Age,-0.026308,1.0,0.045569,-0.068283,-0.105158,-0.061249,,-0.195319,-0.079744
Birth Weight,-0.04549,0.045569,1.0,-0.087892,-0.031552,-0.030692,,-0.116222,-0.017815
Birth Length,-0.104138,-0.068283,-0.087892,1.0,0.053746,0.022616,,0.213137,0.039506
Body Weight,-0.022819,-0.105158,-0.031552,0.053746,1.0,-0.006924,,0.130158,0.942842
Body Length,0.06082,-0.061249,-0.030692,0.022616,-0.006924,1.0,,0.06321,-0.316381
Breastfeeding,,,,,,,,,
Stunting,-0.094368,-0.195319,-0.116222,0.213137,0.130158,0.06321,,1.0,0.097163
BMI,-0.037955,-0.079744,-0.017815,0.039506,0.942842,-0.316381,,0.097163,1.0


In [501]:
after_scaler.shape

(5451, 9)

In [17]:
%store out

Stored 'out' (DataFrame)


In [18]:
%store after_scaler

UsageError: Unknown variable 'after_scaler'


## **NOTE**
```
data asli (10000,8), setelah preprocessing jadi (5451, 9)
3000an duplicate
1000an outliers
300an diluar usia 25
penambahan feature BMI
korelasi pada weght dan length naik, tapi birth length turun
```