# 결측치 실습

### 학습 목표
- 결측치에 대한 처리법을 익힌다.

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from numpy import nan as NA

In [2]:
data = DataFrame([[1, 6, 3], [1, np.nan, NA], [NA, NA, NA], [NA, 5, 3]])
data

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
2,,,
3,,5.0,3.0


In [27]:
# 결측치가 있는 행, 열 처리하기
# column별 결측치 수 확인 --> isnull()
data.isnull().sum()

0    2
1    2
2    2
dtype: int64

In [36]:
# 결측치가 있는 행, 열 처리하기
# 0번 column에서 결측치 index 찾기
data[data[0].isnull()].index

Index([2, 3], dtype='int64')

In [37]:
# 결측치가 있는 행, 열 처리하기
# 결측치 지우기 --> dropna()
data.dropna()  # axis = 0 가 defalt

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [38]:
data

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
2,,,
3,,5.0,3.0


In [40]:
# 특정 column을 기준으로 결측치 있는 행을 삭제, subset = ['column명']
data.dropna(subset = [0,2]) # 0 , 2 열에 결측치가 있는 행만 지워라


Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [41]:
# how = any 하나라도 결측치가 있는 행은 지워라
data.dropna(how = 'any')

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [42]:
# how = all 모든 값이 결측치인 행은 지워라
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
3,,5.0,3.0


In [46]:
# data[4] = np.array([1,2,3,4])
data[4] = [1,2,3,4]
data

Unnamed: 0,0,1,2,4
0,1.0,6.0,3.0,1
1,1.0,,,2
2,,,,3
3,,5.0,3.0,4


In [47]:
# 결축치 채우기 --> fillna()
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.0,3.0,1
1,1.0,0.0,0.0,2
2,0.0,0.0,0.0,3
3,0.0,5.0,3.0,4


In [56]:
# 결축치 채우기 --> fillna()
# 1 column의 결측치를 4 column의 평균값으로 채우기
# data[1].fillna(value=data[4].mean(), inplace=True)
data[1].fillna(data[4].mean(), inplace=True)
data

Unnamed: 0,0,1,2,4
0,1.0,6.0,3.0,1
1,1.0,2.5,,2
2,,2.5,,3
3,,5.0,3.0,4


In [57]:
# thresh --> 임계치, thresh = n, 정상치가 n개 이상인 행이나 열을 살려라
data.dropna(axis=0, thresh=3)


Unnamed: 0,0,1,2,4
0,1.0,6.0,3.0,1
1,1.0,2.5,,2
3,,5.0,3.0,4


In [60]:
df = DataFrame(np.random.rand(7,3)) 
df.iloc[:4, 1]= np.nan
df.iloc[:2, 2]= np.nan
df

Unnamed: 0,0,1,2
0,0.422628,,
1,0.519237,,
2,0.801085,,0.257024
3,0.772264,,0.692742
4,0.23738,0.355273,0.037066
5,0.906791,0.71819,0.374988
6,0.29814,0.530074,0.587328


In [63]:
# 결측치 채우기
df.fillna(method='bfill')


Unnamed: 0,0,1,2
0,0.422628,0.355273,0.257024
1,0.519237,0.355273,0.257024
2,0.801085,0.355273,0.257024
3,0.772264,0.355273,0.692742
4,0.23738,0.355273,0.037066
5,0.906791,0.71819,0.374988
6,0.29814,0.530074,0.587328


In [64]:
# 결측치 있는 label 제거
df.dropna()


Unnamed: 0,0,1,2
4,0.23738,0.355273,0.037066
5,0.906791,0.71819,0.374988
6,0.29814,0.530074,0.587328


In [65]:
# 정상치가 2개 이상인 행은 살리기
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.801085,,0.257024
3,0.772264,,0.692742
4,0.23738,0.355273,0.037066
5,0.906791,0.71819,0.374988
6,0.29814,0.530074,0.587328


In [11]:
# 결측치가 2개 이상인 label만 제거



In [66]:
# 결측치를 0으로 채우기

df.fillna(0)

Unnamed: 0,0,1,2
0,0.422628,0.0,0.0
1,0.519237,0.0,0.0
2,0.801085,0.0,0.257024
3,0.772264,0.0,0.692742
4,0.23738,0.355273,0.037066
5,0.906791,0.71819,0.374988
6,0.29814,0.530074,0.587328


In [70]:
# 1번 열의 결측치는 0.5로, 2번 열의 결측치는 0으로 채우기 --> Dictionary
fill_dict = {1:0.5, 2:0 }
# df.fillna(value=fill_dict)
df.fillna(fill_dict)


Unnamed: 0,0,1,2
0,0.422628,0.5,0.0
1,0.519237,0.5,0.0
2,0.801085,0.5,0.257024
3,0.772264,0.5,0.692742
4,0.23738,0.355273,0.037066
5,0.906791,0.71819,0.374988
6,0.29814,0.530074,0.587328


In [76]:
# 1번 열의 결측치는 0.5로, 2번 열의 결측치는 0으로 채우기 --> Dictionary
# fill_dict = {1:0.5, 2:0 }
# df.fillna(value=fill_dict)
df.fillna({1:0.5, 2: df[0].mean()})

Unnamed: 0,0,1,2
0,0.726993,0.73557,0.575909
1,0.979014,0.174442,0.13562
2,0.526764,0.5,0.885329
3,0.697189,0.5,0.0139
4,0.016229,0.5,0.581511
5,0.542877,0.5,0.581511


In [74]:
df = DataFrame(np.random.rand(6,3)) 
df.iloc[2:, 1]= np.nan
df.iloc[4:, 2]= np.nan
df

Unnamed: 0,0,1,2
0,0.726993,0.73557,0.575909
1,0.979014,0.174442,0.13562
2,0.526764,,0.885329
3,0.697189,,0.0139
4,0.016229,,
5,0.542877,,


In [75]:
# 열 전체 결측치를 ffill로 채우기

df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.726993,0.73557,0.575909
1,0.979014,0.174442,0.13562
2,0.526764,0.174442,0.885329
3,0.697189,0.174442,0.0139
4,0.016229,0.174442,0.0139
5,0.542877,0.174442,0.0139


In [80]:
# 결측치 ffill로 채우기
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.726993,0.73557,0.575909
1,0.979014,0.174442,0.13562
2,0.526764,0.174442,0.885329
3,0.697189,0.174442,0.0139
4,0.016229,,0.0139
5,0.542877,,0.0139


In [91]:
# 이상치 제거하기 

# 설문조사(성별 --> 0, 1,만족도(5점 척도) --> 1 ~ 5)
df = pd.DataFrame({'sex': [0,1,1,1,3,1], 'score':[1, 5, 4, 3, 2, 7]})
df

Unnamed: 0,sex,score
0,0,1
1,1,5
2,1,4
3,1,3
4,3,2
5,1,7


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   sex     6 non-null      int64
 1   score   6 non-null      int64
dtypes: int64(2)
memory usage: 224.0 bytes


In [86]:
# np.where() --> np.where(조건, 4, 2)
df['sex'] = np.where((df['sex'] != 0) & (df['sex'] != 1), np.nan, df['sex'])
df

Unnamed: 0,sex,score
0,0.0,1
1,1.0,5
2,1.0,4
3,1.0,3
4,,2
5,1.0,7


In [87]:
df['score'] = np.where(df['score'] > 5, np.nan, df['score'])
df

Unnamed: 0,sex,score
0,0.0,1.0
1,1.0,5.0
2,1.0,4.0
3,1.0,3.0
4,,2.0
5,1.0,


In [89]:
# df.dropna()
df.dropna(subset=['sex', 'score'])

Unnamed: 0,sex,score
0,0.0,1.0
1,1.0,5.0
2,1.0,4.0
3,1.0,3.0


In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sex     5 non-null      float64
 1   score   5 non-null      float64
dtypes: float64(2)
memory usage: 224.0 bytes


In [95]:
df = pd.read_csv('../Data/auto-mpg.csv')

In [97]:
df

Unnamed: 0,18.0,8,307.0,130.0,3504.,12.0,70,1,chevrolet chevelle malibu
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
394,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
395,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   18.0                       397 non-null    float64
 1   8                          397 non-null    int64  
 2   307.0                      397 non-null    float64
 3   130.0                      397 non-null    object 
 4   3504.                      397 non-null    float64
 5   12.0                       397 non-null    float64
 6   70                         397 non-null    int64  
 7   1                          397 non-null    int64  
 8   chevrolet chevelle malibu  397 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.0+ KB
