# Data Cleaning

In [82]:
import pandas as pd
import numpy as np

In [83]:
data = pd.read_csv('abalone.data.csv')

In [84]:
data.shape

(199, 9)

In [85]:
data.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [86]:
data.tail(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
194,F,0.51,0.4,0.14,0.8145,0.459,0.1965,0.195,10
195,M,0.5,0.405,0.155,0.772,0.346,0.1535,0.245,12
196,F,0.505,0.41,0.15,0.644,0.285,0.145,0.21,11
197,M,0.64,0.5,0.185,1.3035,0.4445,0.2635,0.465,16
198,M,0.56,0.45,0.16,0.922,0.432,0.178,0.26,15


In [87]:
data.describe() #descriptive statistics

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,192.0,192.0,194.0,196.0,195.0,191.0,199.0,199.0
mean,0.508385,0.397734,0.134098,0.777349,0.309018,0.168207,0.252704,10.738693
std,0.117404,0.096054,0.039054,0.485326,0.183426,0.104823,0.173279,3.588993
min,0.17,0.13,0.04,0.03,0.0105,0.0065,0.01,4.0
25%,0.44,0.34875,0.105,0.406,0.17125,0.0865,0.135,8.0
50%,0.53,0.415,0.135,0.748,0.295,0.1595,0.22,10.0
75%,0.595,0.47,0.16,1.03075,0.42025,0.2335,0.33,13.0
max,0.725,0.575,0.23,2.55,1.0705,0.541,1.005,21.0


In [88]:
data['Sex'].value_counts()

F    84
M    82
I    29
Name: Sex, dtype: int64

In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 9 columns):
Sex               195 non-null object
Length            192 non-null float64
Diameter          192 non-null float64
Height            194 non-null float64
Whole weight      196 non-null float64
Shucked weight    195 non-null float64
Viscera weight    191 non-null float64
Shell weight      199 non-null float64
Rings             199 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 14.1+ KB


## Imputasi Missing Value

In [90]:
np.sum(data.isnull()) #number of NA for each variable

Sex               4
Length            7
Diameter          7
Height            5
Whole weight      3
Shucked weight    4
Viscera weight    8
Shell weight      0
Rings             0
dtype: int64

In [91]:
data.isnull().sum()

Sex               4
Length            7
Diameter          7
Height            5
Whole weight      3
Shucked weight    4
Viscera weight    8
Shell weight      0
Rings             0
dtype: int64

In [92]:
data.isnull().sum().sum()

38

In [93]:
#imputasi missing value data numerik
data['Length'] = data['Length'].fillna((data['Length'].mean())) 
data['Diameter'] = data['Diameter'].fillna((data['Diameter'].mean()))
data['Height'] = data['Height'].fillna((data['Height'].median()))
data['Whole weight'] = data['Whole weight'].fillna((data['Whole weight'].median()))
data['Shucked weight'] = data['Shucked weight'].fillna((data['Shucked weight'].median()))
data['Viscera weight'] = data['Viscera weight'].fillna((data['Viscera weight'].median()))
data['Shell weight'] = data['Shell weight'].fillna((data['Shell weight'].median()))
data.isnull().any()

Sex                True
Length            False
Diameter          False
Height            False
Whole weight      False
Shucked weight    False
Viscera weight    False
Shell weight      False
Rings             False
dtype: bool

In [94]:
#imputasi data kategorik dengan modus
data['Sex'] = data.fillna(data['Sex'].value_counts().index[0])
data.isnull().any()

Sex               False
Length            False
Diameter          False
Height            False
Whole weight      False
Shucked weight    False
Viscera weight    False
Shell weight      False
Rings             False
dtype: bool

## Encode Labels

In [95]:
from sklearn.preprocessing import LabelEncoder

In [101]:
# Encode untuk Sex tanpa membuat kolom baru
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,15
1,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7
2,0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,9
3,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,10
4,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7


## Standardize

In [97]:
from sklearn.preprocessing import StandardScaler

In [98]:
data['Length'] = StandardScaler().fit(data[['Length']]).transform(data[['Length']])
data['Diameter'] = StandardScaler().fit(data[['Diameter']]).transform(data[['Diameter']])
data['Height'] = StandardScaler().fit(data[['Height']]).transform(data[['Height']])
data['Whole weight'] = StandardScaler().fit(data[['Whole weight']]).transform(data[['Whole weight']])
data['Shucked weight'] = StandardScaler().fit(data[['Shucked weight']]).transform(data[['Shucked weight']])
data['Viscera weight'] = StandardScaler().fit(data[['Viscera weight']]).transform(data[['Viscera weight']])
data['Shell weight'] = StandardScaler().fit(data[['Shell weight']]).transform(data[['Shell weight']])
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2,-0.46414,-0.347855,-1.017149,-0.547224,-0.465091,-0.652645,-0.594201,15
1,2,-1.377023,-1.410514,-1.14715,-1.147719,-1.15525,-1.165141,-1.057048,7
2,0,0.18792,0.236608,0.022865,-0.20795,-0.28841,-0.257291,-0.247065,9
3,2,-0.594551,-0.347855,-0.237139,-0.543062,-0.514782,-0.525741,-0.565273,10
4,0,-1.550905,-1.51678,-1.407154,-1.190388,-1.210463,-1.252998,-1.143832,7


# Normalize

In [99]:
from sklearn.preprocessing import Normalizer 

In [100]:
data['Length'] = Normalizer().fit(data[['Length']]).transform(data[['Length']])
data['Diameter'] = Normalizer().fit(data[['Diameter']]).transform(data[['Diameter']])
data['Height'] = Normalizer().fit(data[['Height']]).transform(data[['Height']])
data['Whole weight'] = Normalizer().fit(data[['Whole weight']]).transform(data[['Whole weight']])
data['Shucked weight'] = Normalizer().fit(data[['Shucked weight']]).transform(data[['Shucked weight']])
data['Viscera weight'] = Normalizer().fit(data[['Viscera weight']]).transform(data[['Viscera weight']])
data['Shell weight'] = Normalizer().fit(data[['Shell weight']]).transform(data[['Shell weight']])
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,15
1,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7
2,0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,9
3,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,10
4,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7
