# 누락값 확인하기

In [3]:
# NumPy is a Python library built for mathematical and scientific computing.
# NumPy 2.0 removed the `NaN` and `NAN` aliases, so only `nan` is imported and
# the two older spellings are recreated locally for the cells that use them.
from numpy import nan

NaN = NAN = nan

* 누락값은 0, ' ' 와 같은 값과는 다른 개념
* 데이터 자체가 없다는 의미
* '같다'라는 개념도 없다.
* 누락값과 True, False, 0, ' '을 비교한 결과

In [3]:
# A missing value is not equal to True.
print(NaN == True)

False


In [4]:
# A missing value is not equal to False either.
print(NaN == False)

False


In [6]:
# A missing value is different from the number 0.
print(NaN == 0)

False


In [8]:
# A missing value is different from a blank string.
print(NaN == ' ')

False


In [9]:
# There is no value at all, so NaN compares unequal even to itself.
print(NaN == NaN)

False


In [10]:
# The lowercase spelling `nan` behaves the same way: the comparison is False.
print(NaN == nan)

False


In [11]:
# The uppercase spelling `NAN` behaves the same way: the comparison is False.
print(NaN == NAN)

False


* 누락값 확인하는 메서드 isnull, notnull

In [2]:
import pandas as pd

# pd.isnull tests whether a value is missing. This cell needs `NaN` from the
# numpy import cell at the top of the notebook, so run that cell first.
print(pd.isnull(NaN))

True

In [15]:
# isnull reports True for a missing value.
print(pd.isnull(nan))

True


In [16]:
# The `NAN` spelling is also detected as missing.
print(pd.isnull(NAN))

True


In [17]:
# notnull is the opposite check: it is False for a missing value.
print(pd.notnull(NaN))

False


In [18]:
# An ordinary number is not missing, so notnull is True.
print(pd.notnull(42))

True


In [19]:
# The string 'missing' is a real value, not a missing one, so notnull is True.
print(pd.notnull('missing'))

True


# 누락값이 생기는 이유

## 1. 누락값이 있는 데이터 집합을 연결할 때 누락값이 생기는 경우

In [22]:
# Load the two survey tables that are joined later in this section.
visited = pd.read_csv('../data/survey_visited.csv')
survey = pd.read_csv('../data/survey_survey.csv')

# The visited table already contains a missing value in its `dated` column.
print(visited)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [23]:
# The survey table has missing values in its `person` column.
print(survey)

    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25


In [24]:
# Join each visit to its readings by matching visited.ident with survey.taken.
# Missing values present in either input are carried into the joined table.
vs = visited.merge(survey, left_on = 'ident', right_on = 'taken')
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading
0,619,DR-1,1927-02-08,619,dyer,rad,9.82
1,619,DR-1,1927-02-08,619,dyer,sal,0.13
2,622,DR-1,1927-02-10,622,dyer,rad,7.8
3,622,DR-1,1927-02-10,622,dyer,sal,0.09
4,734,DR-3,1939-01-07,734,pb,rad,8.41
5,734,DR-3,1939-01-07,734,lake,sal,0.05
6,734,DR-3,1939-01-07,734,pb,temp,-21.5
7,735,DR-3,1930-01-12,735,pb,rad,7.22
8,735,DR-3,1930-01-12,735,,sal,0.06
9,735,DR-3,1930-01-12,735,,temp,-26.0


## 2. 데이터를 입력할 때 누락값이 생기는 경우

In [26]:
# Build the Series from parallel value/label lists; the nan entry is the
# missing value that was typed in directly.
leg_counts = [4, nan]
animals = ['goat', 'amoeba']
num_legs = pd.Series(leg_counts, index=animals)

print(num_legs)
print(type(num_legs))

goat      4.0
amoeba    NaN
dtype: float64
<class 'pandas.core.series.Series'>


In [29]:
# Describe the frame column by column first, then hand the mapping to pandas.
# The 'missing' column consists only of missing values entered by hand.
scientist_columns = dict(
    Name=['Rosaline Franklin', 'William Gosset'],
    Occupation=['Chemist', 'Statistician'],
    Born=['1920-07-25', '1876-06-13'],
    Died=['1958-04-16', '1937-10-16'],
    missing=[NaN, nan],
)
scientists = pd.DataFrame(scientist_columns)

print(scientists)
print(type(scientists))

                Name    Occupation        Born        Died  missing
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16      NaN
1     William Gosset  Statistician  1876-06-13  1937-10-16      NaN
<class 'pandas.core.frame.DataFrame'>


## 3. 범위를 지정하여 데이터를 추출할 때 누락값이 생기는 경우

In [32]:
# Load the tab-separated Gapminder data set and display it.
gapminder = pd.read_csv('../data/gapminder.tsv', sep='\t')
gapminder

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [33]:
# Average life expectancy for each survey year.
rows_by_year = gapminder.groupby('year')
life_exp = rows_by_year['lifeExp'].mean()
print(life_exp)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [34]:
# Keep only the years after 2000 by filtering with a boolean mask on the index.
# NOTE(review): this filter only returns labels that exist, so no NaN appears here;
# the section title presumably refers to asking for a range of labels that are absent — confirm.
after_2000 = life_exp.index > 2000
y2000 = life_exp.loc[after_2000]
print(y2000)

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


# 누락값의 개수 구하기

### 전체 행 수에서 누락값이 아닌 값의 개수를 빼는 방법

In [6]:
# Load the Ebola country time-series data.
ebola = pd.read_csv('../data/country_timeseries.csv')
# info() writes its summary itself and returns None, so print() adds a trailing "None".
print(ebola.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null object
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            12 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 17.3+ KB
None


In [5]:
# count() gives the number of non-missing values in each column.
print(ebola.count())

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64


In [7]:
ebola.shape # (number of rows, number of columns)

(122, 18)

In [9]:
# Missing values per column = total number of rows minus that column's non-null count.
num_rows = len(ebola)
print(num_rows)
non_null_per_column = ebola.count()
num_missing = num_rows - non_null_per_column
print(num_missing)

122
Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64


### numpy를 이용한 방법

In [10]:
# Combine isnull with numpy's count_nonzero to count missing values.
import numpy as np

# isnull() marks every missing cell as True; counting the non-zero (True)
# cells gives the number of missing values in the whole frame.
missing_mask = ebola.isnull()
print(np.count_nonzero(missing_mask))

1214


In [11]:
# The same count restricted to the Cases_Guinea column.
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

29


In [17]:
# value_counts reports how often each value occurs in the given column;
# with dropna=False the missing values get their own row in the result.
print(ebola.Cases_Guinea.value_counts(dropna = False).head(10))

NaN      29
86.0      3
495.0     2
112.0     2
390.0     2
506.0     1
812.0     1
771.0     1
648.0     1
607.0     1
Name: Cases_Guinea, dtype: int64


In [16]:
# With dropna=True the missing values are left out of the frequency table.
print(ebola.Cases_Guinea.value_counts(dropna = True).head())

86.0      3
112.0     2
390.0     2
495.0     2
2597.0    1
Name: Cases_Guinea, dtype: int64


# 누락값 처리하기 - 변경, 삭제

## 1. 누락값 변경하기

In [18]:
# Passing 0 to fillna replaces every missing value with 0.
# Original note: used when the DataFrame is very large and memory must be used efficiently.
# NOTE(review): that remark presumably refers to fillna's in-place option, which this call does not use — confirm.
print(ebola.fillna(0).iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            0.0            10030.0
1    1/4/2015  288        2775.0            0.0             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286           0.0         8157.0                0.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0            0.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0            0.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [19]:
# ffill: each missing value is replaced by the last valid value before it in
# the same column. When a column starts with missing values there is nothing
# to copy from, so those leading entries stay NaN.
# DataFrame.ffill() is called directly because fillna(method='ffill') is
# deprecated in recent pandas releases.
print(ebola.ffill().iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2769.0         8157.0             9722.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         8018.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7977.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [21]:
# bfill: each missing value is replaced by the next valid value after it in
# the same column. When a column ends with missing values there is nothing
# to copy from, so those trailing entries cannot be filled.
# DataFrame.bfill() is called directly because fillna(method='bfill') is
# deprecated in recent pandas releases.
print(ebola.bfill().iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0         8166.0            10030.0
1    1/4/2015  288        2775.0         8166.0             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2730.0         8157.0             9633.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7977.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7862.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [22]:
# interpolate fills a missing value with an intermediate value computed from
# the values on both sides of it; leading missing values have no earlier
# neighbour and stay NaN.
# NOTE(review): newer pandas releases may warn when interpolate() is called on a
# frame that still contains text columns such as Date — confirm on the installed version.

print(ebola.interpolate().iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2749.5         8157.0             9677.5
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7997.5             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7919.5             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


## 2. 누락값 삭제하기

In [23]:
# Size of the data before any rows are dropped.
print(ebola.shape)

(122, 18)


In [25]:
# dropna deletes missing values: every row that contains a missing value is
# removed, so a large share of the data can be lost.

ebola_dropna = ebola.dropna()
print(ebola_dropna.shape)

(1, 18)


In [26]:
# Show the rows that survived dropna.
print(ebola_dropna)

          Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone  \
19  11/18/2014  241        2047.0         7082.0             6190.0   

    Cases_Nigeria  Cases_Senegal  Cases_UnitedStates  Cases_Spain  Cases_Mali  \
19           20.0            1.0                 4.0          1.0         6.0   

    Deaths_Guinea  Deaths_Liberia  Deaths_SierraLeone  Deaths_Nigeria  \
19         1214.0          2963.0              1267.0             8.0   

    Deaths_Senegal  Deaths_UnitedStates  Deaths_Spain  Deaths_Mali  
19             0.0                  1.0           0.0          6.0  


# 누락값이 포함된 데이터 계산하기

In [27]:
# Element-wise total of the three case columns. The column key spelling
# 'Cases_multipie' is kept as-is because a later cell selects it by that name.
case_total = (
    ebola['Cases_Guinea']
    + ebola['Cases_Liberia']
    + ebola['Cases_SierraLeone']
)
ebola['Cases_multipie'] = case_total

In [30]:
# A row with even one missing operand yields NaN in the computed column.
case_columns = ['Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone', 'Cases_multipie']
ebola_subset = ebola.loc[:, case_columns]

print(ebola_subset.head(10))

   Cases_Guinea  Cases_Liberia  Cases_SierraLeone  Cases_multipie
0        2776.0            NaN            10030.0             NaN
1        2775.0            NaN             9780.0             NaN
2        2769.0         8166.0             9722.0         20657.0
3           NaN         8157.0                NaN             NaN
4        2730.0         8115.0             9633.0         20478.0
5        2706.0         8018.0             9446.0         20170.0
6        2695.0            NaN             9409.0             NaN
7        2630.0         7977.0             9203.0         19810.0
8        2597.0            NaN             9004.0             NaN
9        2571.0         7862.0             8939.0         19372.0


In [32]:
# sum with skipna=True excludes missing values from the total.

print(ebola.Cases_Guinea.sum(skipna = True))

84729.0
