# 누락값 확인하기

In [1]:
from numpy import NaN, NAN, nan

In [3]:
# 누락값은 데이터 자체가 없다는 것
print(NaN==True)
print(NaN==False)
print(NaN==0)
print(NaN=='')

print(NaN==NaN)
print(NaN==nan)

False
False
False
False
False
False


In [4]:
# isnull() : 누락값 확인
import pandas as pd

print(pd.isnull(NaN))

True


In [5]:
# notnull() : 누락값이 아닌 경우 확인
print(pd.notnull(NaN))
print(pd.notnull(42))

False
True


# 누락값을 포함한 데이터를 불러올 때

In [6]:
visited = pd.read_csv("../data/survey_visited.csv")
survey = pd.read_csv("../data/survey_survey.csv")

print(visited)
print(survey)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22
    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25


# 데이터 집합을 연결할 때 누락값이 발생하는 경우

In [7]:
vs = visited.merge(survey, left_on='ident',right_on='taken')
print(vs)

    ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
16    752   DR

# 데이터를 입력할 때 누락값이 발생하는 경우

In [9]:
# 시리즈를 생성할 때 데이터 프레임에 없는 열과 행 데이터를 입력하여 누락값이 생긴 경우
num_legs=pd.Series({'goat':4,'amoeba':nan})
print(num_legs)

goat      4.0
amoeba    NaN
dtype: float64


In [11]:
scientists = pd.DataFrame({
    'Name':['Rosline','William'],
    'Occupation':['Chemist','Statistician'],
    'Born':['1920-07-25','1876-06-13'],
    'Died':['1958-04-16','1937-10-16'],
    'missing':[NaN,nan]
})

print(scientists)

      Name    Occupation        Born        Died  missing
0  Rosline       Chemist  1920-07-25  1958-04-16      NaN
1  William  Statistician  1876-06-13  1937-10-16      NaN


# 인덱스를 다시 만들 때 누락값이 발생하는 경우

In [14]:
gapminder = pd.read_csv("../data/gapminder.tsv", sep='\t')

In [16]:
life_exp = gapminder.groupby(['year'])['lifeExp'].mean()
print(life_exp)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [17]:
# 데이터프레임에 존재하지 않는 데이터 추출
print(life_exp.loc[range(2000,2010),])

year
2000          NaN
2001          NaN
2002    65.694923
2003          NaN
2004          NaN
2005          NaN
2006          NaN
2007    67.007423
2008          NaN
2009          NaN
Name: lifeExp, dtype: float64


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [18]:
# 해결법 : 불린 추출
print(life_exp[life_exp.index>2000])

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


# 누락값의 개수 구하기

In [19]:
ebola = pd.read_csv("../data/country_timeseries.csv")

In [20]:
print(ebola.count())

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64


In [21]:
# count()이용하여 누락값의 개수 구하기
num_rows = ebola.shape[0]
num_missing = num_rows-ebola.count()
print(num_missing)

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64


In [22]:
# count_nonzero()와 isnull() 조합이용하기
# count_nonzero() : 배열에서 0이 아닌 값의 개수를 셈

import numpy as np

print(np.count_nonzero(ebola.isnull()))

print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

1214
29


In [23]:
# value_counts : 지정한 열의 빈도를 구함 (시리즈)
print(ebola.Cases_Guinea.value_counts(dropna=False).head())

NaN       29
 86.0      3
 495.0     2
 112.0     2
 390.0     2
Name: Cases_Guinea, dtype: int64


# 누락값을 다른 값으로 변경하기

In [24]:
# fillna(0) : 누락값 0으로 변경
print(ebola.fillna(0).iloc[0:10,0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            0.0            10030.0
1    1/4/2015  288        2775.0            0.0             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286           0.0         8157.0                0.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0            0.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0            0.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [25]:
# fillna(method='ffill') : 누락값이 나타나기 전의 값으로 변경
print(ebola.fillna(method='ffill').iloc[:10,:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2769.0         8157.0             9722.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         8018.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7977.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [26]:
# fillna(method='bfill') : 누락값 다음 값의 값으로 변경
print(ebola.fillna(method='bfill').iloc[:10,:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0         8166.0            10030.0
1    1/4/2015  288        2775.0         8166.0             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2730.0         8157.0             9633.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7977.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7862.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [27]:
# interpolate() : 누락값 양쪽 값의 중간값으로 변경
print(ebola.interpolate().iloc[:10,:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2749.5         8157.0             9677.5
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7997.5             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7919.5             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


# 누락값 삭제하기

In [28]:
# dropna() : 누락값이 포함된 행 삭제
ebola_dropna = ebola.dropna()

print(ebola_dropna.shape)

(1, 18)


# 누락값이 포함된 데이터 계산하기

In [30]:
# 누락값이 포함하여 계산하면 결과가 NaN이 됨 
# skipna()사용하여 계산

print(ebola.Cases_Guinea.sum(skipna=True))

print(ebola.Cases_Guinea.sum(skipna=False))

84729.0
nan
