In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['font.size'] = 12
plt.rcParams['figure.figsize'] = [5,5]
plt.rcParams['axes.unicode_minus'] = False


df = pd.read_csv('https://raw.githubusercontent.com/algoboni/pythoncodebook1-1/main/2-1_wine.csv', index_col=0)
df.head()

#실습에 사용할 열만 추출
df = df.filter(['Alcohol', 'Malicacid', 'Ash', 'Magnesium', 'Hue'])

#결측치 삽입
from numpy.random import randint
dfm = df.copy()

for col in ['Alcohol', 'Ash']:
    rand_idx = randint(0, len(dfm), 25)
    dfm.loc[rand_idx, col] = np.nan

In [2]:
# -------------------------
# Constant Imputation
# -------------------------
dfm_imp = dfm.fillna(1000)

# -------------------------
# 대표값 Imputation
# -------------------------
from sklearn.impute import SimpleImputer

sImp = SimpleImputer(missing_values=np.nan, strategy='mean')
dfm_imp1 = sImp.fit_transform(dfm)
dfm_imp1 = pd.DataFrame(dfm_imp1, columns=dfm.columns)

print(sImp.statistics_)
dfm_imp1

[12.99503226  2.33634831  2.36636364 99.74157303  0.95744944]


Unnamed: 0,Alcohol,Malicacid,Ash,Magnesium,Hue
0,14.230000,1.71,2.43,127.0,1.04
1,13.200000,1.78,2.14,100.0,1.05
2,13.160000,2.36,2.67,101.0,1.03
3,14.370000,1.95,2.50,113.0,0.86
4,13.240000,2.59,2.87,118.0,1.04
...,...,...,...,...,...
173,12.995032,5.65,2.45,95.0,0.64
174,13.400000,3.91,2.48,102.0,0.70
175,13.270000,4.28,2.26,120.0,0.59
176,13.170000,2.59,2.37,120.0,0.60


In [3]:
# -------------------------
# 단순확률 대치법
# -------------------------
dfm_imp2 = dfm.copy()

for col in ['Alcohol', 'Ash']:
    miss_idx = dfm[dfm[col].isna()==True].index
    dfm_imp2.loc[miss_idx, col] = df[col].sample(len(miss_idx)).values

dfm_imp2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178 entries, 0 to 177
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Alcohol    178 non-null    float64
 1   Malicacid  178 non-null    float64
 2   Ash        178 non-null    float64
 3   Magnesium  178 non-null    int64  
 4   Hue        178 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 12.4 KB


In [6]:
# -------------------------
# 다른 변수들로 모델링하여 결측값을 예측
# -------------------------
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
dfm_imp3 = imputer.fit_transform(dfm)
dfm_imp3 = pd.DataFrame(dfm_imp3, columns=dfm.columns)
dfm_imp3.isna().sum()

Alcohol      0
Malicacid    0
Ash          0
Magnesium    0
Hue          0
dtype: int64

In [7]:
# -------------------------
# 보간법(Interpolation)
#   Linear
#       series의 데이터를 하나의 직선 상에 있다고 가정한다.
#   Polynomial
#       series의 데이터를 하나의 다항식 상에 있다고 가정하고, 가장 적합한 다항식을 찾아낸다.
#   Spline (zero:0차, slinear:1차, quadratic:2차, cubic:3차)
#       series의 데이터를 여러 개의 구간의 분할하고 각 구간에 대해, 
#       구간 내 series에 적합하면서 연결점에서 앞뒤 구간에 일치하는 다항식을 찾아낸다.
# -------------------------
dfm_imp4 = dfm.interpolate(method='linear')
dfm_imp4.isna().sum()

Alcohol      0
Malicacid    0
Ash          0
Magnesium    0
Hue          0
dtype: int64

In [None]:
# Alcohol 특성을 기준으로 전처리 성능을 비교

# 실제값과 각 결측치처리법 별 대치값을 Dataframe으로 만든다.

# 실제값과