## 기상청 종관지상관측 데이터 (ASOS)

* https://data.kma.go.kr

In [2]:
import pandas as pd
import os, sys, time
import glob

In [46]:
#FILENAME = 'FZ_Data/ASOS_All.xlsx'
FILENAME = 'FZ_Data/ASOS_All_csv.csv'

if not os.path.exists(FILENAME):
    assert False, 'cannot find FILENAME: {}'.format(FILENAME)
else:
    print('found : {}'.format(FILENAME))

found : FZ_Data/ASOS_All_csv.csv


### load ASOS 데이터

In [83]:
if FILENAME.endswith('.xlsx'):
    df = pd.read_excel(FILENAME, 
                       encoding='euc-kr',
                       converters={'지점':str, '지면상태(지면상태코드)':str, '현상번호(국내식)':str})
elif FILENAME.endswith('.csv'):
    df = pd.read_csv(FILENAME, 
                       encoding='euc-kr',
                       converters={'지점':str, '지면상태(지면상태코드)':str, '현상번호(국내식)':str})

In [84]:
df.shape

(29015, 37)

In [85]:
df.columns

Index(['지점', '지점명', '일시', '기온(°C)', '기온 QC플래그', '강수량(mm)', '강수량 QC플래그',
       '풍속(m/s)', '풍속 QC플래그', '풍향(16방위)', '풍향 QC플래그', '습도(%)', '습도 QC플래그',
       '증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '현지기압 QC플래그', '해면기압(hPa)',
       '해면기압 QC플래그', '일조(hr)', '일조 QC플래그', '일사(MJ/m2)', '적설(cm)', '3시간신적설(cm)',
       '전운량(10분위)', '중하층운량(10분위)', '운형(운형약어)', '최저운고(100m )', '시정(10m)',
       '지면상태(지면상태코드)', '현상번호(국내식)', '지면온도(°C)', '지면온도 QC플래그', '5cm 지중온도(°C)',
       '10cm 지중온도(°C)', '20cm 지중온도(°C)', '30cm 지중온도(°C)'],
      dtype='object')

### 24개 컬럼이 분석 대상

* 데이터품질에 대한 내용인 QC플래그 제외
* 특정지점에서만 데이터를 취합하는 지중온도는 제외 

In [86]:
use_columns = ['지점', '지점명', '일시', '기온(°C)', '강수량(mm)', 
       '풍속(m/s)', '풍향(16방위)', '습도(%)', 
       '증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)',
       '일조(hr)', '일사(MJ/m2)', '적설(cm)', '3시간신적설(cm)',
       '전운량(10분위)', '중하층운량(10분위)', '운형(운형약어)', '최저운고(100m )', '시정(10m)',
       '지면상태(지면상태코드)', '현상번호(국내식)', '지면온도(°C)']

In [87]:
len(use_columns)

24

In [88]:
df = df[use_columns]

### 결측치 확인

In [89]:
df.isnull().sum()

지점                  0
지점명                 0
일시                  0
기온(°C)              0
강수량(mm)         27447
풍속(m/s)             0
풍향(16방위)            0
습도(%)               1
증기압(hPa)        14513
이슬점온도(°C)       14513
현지기압(hPa)       14514
해면기압(hPa)       14514
일조(hr)          15001
일사(MJ/m2)       20261
적설(cm)          22727
3시간신적설(cm)      28455
전운량(10분위)       16067
중하층운량(10분위)     16081
운형(운형약어)        20738
최저운고(100m )     22252
시정(10m)         16054
지면상태(지면상태코드)        0
현상번호(국내식)           0
지면온도(°C)        18142
dtype: int64

In [90]:
df.groupby('현상번호(국내식)').size()

현상번호(국내식)
              19422
1               576
10                8
1005              7
100601            2
              ...  
71704001          1
7170401901        1
7201              1
8                10
805               1
Length: 112, dtype: int64

In [91]:
df.head()

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),...,적설(cm),3시간신적설(cm),전운량(10분위),중하층운량(10분위),운형(운형약어),최저운고(100m ),시정(10m),지면상태(지면상태코드),현상번호(국내식),지면온도(°C)
0,100,대관령,2006-01-01 0:00,-5.7,,4.2,270,96.0,3.6,-7.0,...,1.0,,4.0,4.0,Sc,4.0,1200.0,,,
1,100,대관령,2006-01-01 1:00,-5.4,,5.0,270,97.0,,,...,,,,,,,,,,
2,100,대관령,2006-01-01 2:00,-5.3,,6.8,270,96.0,,,...,,,,,,,,,,
3,100,대관령,2006-01-01 3:00,-5.1,,7.3,270,95.0,3.8,-6.4,...,1.0,,4.0,4.0,Sc,4.0,1200.0,16.0,,-5.3
4,100,대관령,2006-01-01 4:00,-5.2,,4.1,250,91.0,,,...,1.0,,,,,,,,,


### 결측치 처리: 숫자가 0인 것들이 데이터가 없는 경우들

* 강수량(mm)이 확인 안 된 경우 
* 일조(hr)이 확인 안 된 경우
* 적설(cm)이 확인 안 된 경우
* 3시간신적설(cm)이 확인 안 된 경우

In [92]:
for col in ['강수량(mm)', '일조(hr)', '일사(MJ/m2)', '적설(cm)', '3시간신적설(cm)']:
    df[col] = df[col].fillna(0)

In [93]:
df[['강수량(mm)', '일조(hr)', '일사(MJ/m2)','적설(cm)', '3시간신적설(cm)']].isnull().sum()

강수량(mm)       0
일조(hr)        0
일사(MJ/m2)     0
적설(cm)        0
3시간신적설(cm)    0
dtype: int64

In [94]:
df_copy = df.copy()

### 결측치 처리: 주변값을 활용하여 비슷한 것으로 처리

* 습도(%)
* 3시간마다 측정하는 값들: 증기압(hPa), 이슬점온도(°C), 현지기압(hPa), 해면기압(hPa), 전운량(10분위), 중하층운량(10분위), 운형(운형약어),
  최저운고(100m ), 시정(10m), 지면상태(지면상태코드), 지면온도(°C)

In [95]:
IMPUTE_METHOD=['ffill', 'bfill']

In [96]:
def impute_columns(df, cols, impute_methods):
    if isinstance(cols, str):
        cols = [cols]
    if isinstance(impute_methods, str):
        impute_methods = [impute_methods]
    for col in cols:
        if col not in df.columns:
            print('cannot find such column: {}'.format(col))
            continue
            
        for method in impute_methods:
            before = df[col].isnull().sum()
            if before > 0:
                if method == 'ffill':
                    df[col] = df[col].fillna(method='ffill')
                    after = df[col].isnull().sum()
                    print('** impute: {} column {} -> {}'.format(col, before, after))
                elif method == 'bfill':
                    df[col] = df[col].fillna(method='bfill')
                    after = df[col].isnull().sum()
                    print('** impute: {} column {} -> {}'.format(col, before, after))
                elif method == 'linear_interpolate':
                    df[col] = df[col].interpolate(method='linear')
                    after = df[col].isnull().sum()
                    print('** impute: {} column {} -> {}'.format(col, before, after))
                else:
                    print('cannot find such impute method')     

In [97]:
impute_columns(df, ['습도(%)'], IMPUTE_METHOD)

** impute: 습도(%) column 1 -> 0


In [98]:
df['습도(%)'].isnull().sum()

0

In [99]:
impute_columns(df, ['증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)', '전운량(10분위)', 
                    '중하층운량(10분위)', '운형(운형약어)', '최저운고(100m )', '시정(10m)', '지면상태(지면상태코드)', 
                    '지면온도(°C)'], IMPUTE_METHOD)

** impute: 증기압(hPa) column 14513 -> 0
** impute: 이슬점온도(°C) column 14513 -> 0
** impute: 현지기압(hPa) column 14514 -> 0
** impute: 해면기압(hPa) column 14514 -> 0
** impute: 전운량(10분위) column 16067 -> 0
** impute: 중하층운량(10분위) column 16081 -> 0
** impute: 운형(운형약어) column 20738 -> 0
** impute: 최저운고(100m ) column 22252 -> 0
** impute: 시정(10m) column 16054 -> 0
** impute: 지면온도(°C) column 18142 -> 3
** impute: 지면온도(°C) column 3 -> 0


In [100]:
df[['증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)', '전운량(10분위)', 
   '중하층운량(10분위)', '운형(운형약어)', '최저운고(100m )', '시정(10m)', '지면상태(지면상태코드)', '지면온도(°C)']].isnull().sum()

증기압(hPa)        0
이슬점온도(°C)       0
현지기압(hPa)       0
해면기압(hPa)       0
전운량(10분위)       0
중하층운량(10분위)     0
운형(운형약어)        0
최저운고(100m )     0
시정(10m)         0
지면상태(지면상태코드)    0
지면온도(°C)        0
dtype: int64

In [101]:
df.isnull().sum()

지점              0
지점명             0
일시              0
기온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(16방위)        0
습도(%)           0
증기압(hPa)        0
이슬점온도(°C)       0
현지기압(hPa)       0
해면기압(hPa)       0
일조(hr)          0
일사(MJ/m2)       0
적설(cm)          0
3시간신적설(cm)      0
전운량(10분위)       0
중하층운량(10분위)     0
운형(운형약어)        0
최저운고(100m )     0
시정(10m)         0
지면상태(지면상태코드)    0
현상번호(국내식)       0
지면온도(°C)        0
dtype: int64

### 현상번호(국내식)을 FZ_flag로 변환

* 3, 7, 12 : 어는 비(Freezing Rain)
* 15, 18 : 얼음침 

In [104]:
def is_FZ_rain(x):
    if pd.isna(x):
        return 0
    x = str(x)
    if len(x) <= 2:
        if x in ['3', '7', '12', '15', '18']:
            return 1
    else:
        xx = [x[i:i+2] for i in range(0, len(x), 2)]
        for x in xx:
            if x in ['03', '07', '12', '15', '18']:
                return 1
    return 0

In [105]:
df['FZ_flag'] = df['현상번호(국내식)'].map(is_FZ_rain)

In [106]:
df.groupby('FZ_flag').size()

FZ_flag
0    28946
1       69
dtype: int64

### ffill 방식으로 처리한 것을 저장

* FZ_all_ffill.csv

In [107]:
df.to_csv('FZ_Data/FZ_all_ffill.csv', index=False, encoding='euc-kr')

### interpolate 사용

In [108]:
df_copy.isnull().sum()

지점                  0
지점명                 0
일시                  0
기온(°C)              0
강수량(mm)             0
풍속(m/s)             0
풍향(16방위)            0
습도(%)               1
증기압(hPa)        14513
이슬점온도(°C)       14513
현지기압(hPa)       14514
해면기압(hPa)       14514
일조(hr)              0
일사(MJ/m2)           0
적설(cm)              0
3시간신적설(cm)          0
전운량(10분위)       16067
중하층운량(10분위)     16081
운형(운형약어)        20738
최저운고(100m )     22252
시정(10m)         16054
지면상태(지면상태코드)        0
현상번호(국내식)           0
지면온도(°C)        18142
dtype: int64

In [109]:
IMPUTE_METHOD='linear_interpolate'

In [110]:
impute_columns(df_copy, ['습도(%)'], IMPUTE_METHOD)

** impute: 습도(%) column 1 -> 0


In [111]:
impute_columns(df_copy, ['증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)', '전운량(10분위)', 
                    '중하층운량(10분위)', '운형(운형약어)', '최저운고(100m )', '시정(10m)', '지면상태(지면상태코드)', 
                    '지면온도(°C)'], IMPUTE_METHOD)

** impute: 증기압(hPa) column 14513 -> 0
** impute: 이슬점온도(°C) column 14513 -> 0
** impute: 현지기압(hPa) column 14514 -> 0
** impute: 해면기압(hPa) column 14514 -> 0
** impute: 전운량(10분위) column 16067 -> 0
** impute: 중하층운량(10분위) column 16081 -> 0
** impute: 운형(운형약어) column 20738 -> 20738
** impute: 최저운고(100m ) column 22252 -> 0
** impute: 시정(10m) column 16054 -> 0
** impute: 지면온도(°C) column 18142 -> 3


In [115]:
df_copy.isnull().sum()

지점                  0
지점명                 0
일시                  0
기온(°C)              0
강수량(mm)             0
풍속(m/s)             0
풍향(16방위)            0
습도(%)               0
증기압(hPa)            0
이슬점온도(°C)           0
현지기압(hPa)           0
해면기압(hPa)           0
일조(hr)              0
일사(MJ/m2)           0
적설(cm)              0
3시간신적설(cm)          0
전운량(10분위)           0
중하층운량(10분위)         0
운형(운형약어)        20738
최저운고(100m )         0
시정(10m)             0
지면상태(지면상태코드)        0
현상번호(국내식)           0
지면온도(°C)            3
FZ_flag             0
dtype: int64

### numeric type이 아닌 것은 interpolate가 동작하지 않음

In [116]:
impute_columns(df_copy, ['운형(운형약어)'], ['ffill', 'bfill'])

** impute: 운형(운형약어) column 20738 -> 0


In [120]:
df_copy.isnull().sum()

지점              0
지점명             0
일시              0
기온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(16방위)        0
습도(%)           0
증기압(hPa)        0
이슬점온도(°C)       0
현지기압(hPa)       0
해면기압(hPa)       0
일조(hr)          0
일사(MJ/m2)       0
적설(cm)          0
3시간신적설(cm)      0
전운량(10분위)       0
중하층운량(10분위)     0
운형(운형약어)        0
최저운고(100m )     0
시정(10m)         0
지면상태(지면상태코드)    0
현상번호(국내식)       0
지면온도(°C)        3
FZ_flag         0
dtype: int64

In [121]:
impute_columns(df_copy, ['지면온도(°C)'], ['ffill', 'bfill'])

** impute: 지면온도(°C) column 3 -> 3
** impute: 지면온도(°C) column 3 -> 0


In [122]:
df_copy.isnull().sum()

지점              0
지점명             0
일시              0
기온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(16방위)        0
습도(%)           0
증기압(hPa)        0
이슬점온도(°C)       0
현지기압(hPa)       0
해면기압(hPa)       0
일조(hr)          0
일사(MJ/m2)       0
적설(cm)          0
3시간신적설(cm)      0
전운량(10분위)       0
중하층운량(10분위)     0
운형(운형약어)        0
최저운고(100m )     0
시정(10m)         0
지면상태(지면상태코드)    0
현상번호(국내식)       0
지면온도(°C)        0
FZ_flag         0
dtype: int64

In [117]:
df_copy['FZ_flag'] = df_copy['현상번호(국내식)'].map(is_FZ_rain)

In [118]:
df_copy.groupby('FZ_flag').size()

FZ_flag
0    28946
1       69
dtype: int64

In [123]:
df_copy.to_csv('FZ_Data/FZ_all_interpolate.csv', index=False, encoding='euc-kr')

### Test codes

In [244]:
df['증기압(hPa)'].head(10)

0    3.6
1    NaN
2    NaN
3    3.8
4    NaN
5    NaN
6    3.8
7    NaN
8    NaN
9    3.8
Name: 증기압(hPa), dtype: float64

In [245]:
#df['증기압(hPa)'] = df['증기압(hPa)'].interpolate(method='linear')
df['증기압(hPa)'] = df['증기압(hPa)'].fillna(method='ffill')

In [246]:
df['증기압(hPa)'].head(10)

0    3.600000
1    3.666667
2    3.733333
3    3.800000
4    3.800000
5    3.800000
6    3.800000
7    3.800000
8    3.800000
9    3.800000
Name: 증기압(hPa), dtype: float64

In [247]:
df['시정(10m)'].head(10)

0    1200.0
1       NaN
2       NaN
3    1200.0
4       NaN
5       NaN
6    1200.0
7       NaN
8       NaN
9    1000.0
Name: 시정(10m), dtype: float64

In [248]:
df['시정(10m)'] = df['시정(10m)'].interpolate(method='linear')

In [249]:
df['시정(10m)'].head(10)

0    1200.000000
1    1200.000000
2    1200.000000
3    1200.000000
4    1200.000000
5    1200.000000
6    1200.000000
7    1133.333333
8    1066.666667
9    1000.000000
Name: 시정(10m), dtype: float64

In [251]:
df['일사(MJ/m2)'].head(20)

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
8     0.00
9     0.12
10    0.42
11    0.31
12    0.36
13    0.56
14    0.47
15    0.64
16    0.64
17    0.15
18    0.01
19     NaN
Name: 일사(MJ/m2), dtype: float64

In [252]:
df['일사(MJ/m2)'] = df['일사(MJ/m2)'].fillna(0)

In [253]:
df['일사(MJ/m2)'].head(20)

0     0.00
1     0.00
2     0.00
3     0.00
4     0.00
5     0.00
6     0.00
7     0.00
8     0.00
9     0.12
10    0.42
11    0.31
12    0.36
13    0.56
14    0.47
15    0.64
16    0.64
17    0.15
18    0.01
19    0.00
Name: 일사(MJ/m2), dtype: float64

In [254]:
df['3시간신적설(cm)'].head(20)

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
16   NaN
17   NaN
18   NaN
19   NaN
Name: 3시간신적설(cm), dtype: float64

In [256]:
dfdf.groupby(['운형(운형약어)']).size()

운형(운형약어)
Ac         133
AcCc         1
AcCi       161
AcCs        10
As         189
AsCi         6
AsCs         3
CbNs         4
CbStNs       2
Cc           1
CcCi         1
Ci        1669
Cs          80
CsCi         3
Cu           5
CuCi         2
CuCs         1
CuNs         2
CuSc         2
Sc        2979
ScAc       146
ScAcCi      30
ScAcCs       2
ScAs      1068
ScAsCs       1
ScCc         1
ScCi       380
ScCs        14
ScNs        64
St         179
StAc         5
StAcCi       1
StAs       138
StCi         5
StNs       981
StSc         3
StScAs       3
StScCi       1
StScNs       1
dtype: int64

In [258]:
df['운형(운형약어)'].head(20)

0       Sc
1      NaN
2      NaN
3       Sc
4      NaN
5      NaN
6       Sc
7      NaN
8      NaN
9       Sc
10     NaN
11     NaN
12    StAs
13     NaN
14     NaN
15    StAs
16     NaN
17     NaN
18    StAs
19     NaN
Name: 운형(운형약어), dtype: object

In [259]:
df['운형(운형약어)'] = df['운형(운형약어)'].fillna(method='ffill')

In [266]:
df['운형(운형약어)'].isnull().sum()

0

In [260]:
df['운형(운형약어)'].head(20)

0       Sc
1       Sc
2       Sc
3       Sc
4       Sc
5       Sc
6       Sc
7       Sc
8       Sc
9       Sc
10      Sc
11      Sc
12    StAs
13    StAs
14    StAs
15    StAs
16    StAs
17    StAs
18    StAs
19    StAs
Name: 운형(운형약어), dtype: object

In [261]:
df['지면상태(지면상태코드)'].head(20)

0     NaN
1     NaN
2     NaN
3      16
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9      16
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15     16
16    NaN
17    NaN
18    NaN
19    NaN
Name: 지면상태(지면상태코드), dtype: object

In [262]:
df['지면상태(지면상태코드)'] = df['지면상태(지면상태코드)'].fillna(method='ffill')

In [265]:
df['지면상태(지면상태코드)'].isnull().sum()

3

In [264]:
df['지면상태(지면상태코드)'].head(20)

(0     NaN
 1     NaN
 2     NaN
 3      16
 4      16
 5      16
 6      16
 7      16
 8      16
 9      16
 10     16
 11     16
 12     16
 13     16
 14     16
 15     16
 16     16
 17     16
 18     16
 19     16
 Name: 지면상태(지면상태코드), dtype: object, 3)

In [267]:
df['지면상태(지면상태코드)'] = df['지면상태(지면상태코드)'].fillna(method='bfill')

In [268]:
df['지면상태(지면상태코드)'].isnull().sum()

0

In [269]:
df['이슬점온도(°C)'].head(20)

0    -7.0
1     NaN
2     NaN
3    -6.4
4     NaN
5     NaN
6    -6.4
7     NaN
8     NaN
9    -6.4
10    NaN
11    NaN
12   -5.8
13    NaN
14    NaN
15   -6.6
16    NaN
17    NaN
18   -7.1
19    NaN
Name: 이슬점온도(°C), dtype: float64

In [None]:
IMPUTE_METHOD=['ffill', 'bfill']

In [276]:
def impute_columns(dataframes, cols, impute_methods):
    if isinstance(cols, str):
        cols = [cols]
    if isinstance(impute_methods, str):
        impute_methods = [impute_methods]
    for col in cols:
        if col not in dataframes.columns:
            print('cannot find such column: {}'.format(col))
            continue
            
        for method in impute_methods:
            before = dataframes[col].isnull().sum()
            if before > 0:
                if method == 'ffill':
                    dataframes[col] = dataframes[col].fillna(method='ffill')
                    after = dataframes[col].isnull().sum()
                    print('** impute: {} column {} -> {}'.format(col, before, after))
                elif method == 'bfill':
                    dataframes[col] = dataframes[col].fillna(method='bfill')
                    after = dataframes[col].isnull().sum()
                    print('** impute: {} column {} -> {}'.format(col, before, after))
                elif method == 'linear_interpolate':
                    dataframes[col] = dataframes[col].interpolate(method='linear')
                    after = dataframes[col].isnull().sum()
                    print('** impute: {} column {} -> {}'.format(col, before, after))
                else:
                    print('cannot find such impute method')                

In [277]:
impute_columns(df, ['이슬점온도(°C)'], ['ffill', 'bfill'])

In [278]:
df['이슬점온도(°C)'].isnull().sum()

0

In [280]:
impute_columns(df, '해면기압(hPa)', 'linear_interpolate')

** impute: 해면기압(hPa) column 14514 -> 0


In [168]:
for col in ['이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)']:
    df[col] = df[col].fillna(method='ffill')

In [169]:
for col in ['지면온도(°C)', '전운량(10분위)', '중하층운량(10분위)', '최저운고(100m )']:
    #df[col] = df[col].fillna(method='ffill')
    df[col] = df[col].interpolate(method='linear')

In [170]:
df.isnull().sum()

지점                  0
지점명                 0
일시                  0
기온(°C)              0
강수량(mm)             0
풍속(m/s)             0
풍향(16방위)            0
습도(%)               0
증기압(hPa)            0
이슬점온도(°C)           0
현지기압(hPa)           0
해면기압(hPa)           0
일조(hr)              0
일사(MJ/m2)           8
적설(cm)              0
3시간신적설(cm)        219
전운량(10분위)           0
중하층운량(10분위)         0
운형(운형약어)        20738
최저운고(100m )         0
시정(10m)             0
지면상태(지면상태코드)    24171
현상번호(국내식)       19422
지면온도(°C)            3
dtype: int64

In [171]:
for col in ['일사(MJ/m2)', '3시간신적설(cm)', '지면상태(지면상태코드)', '지면온도(°C)']:
    df[col] = df[col].fillna(method='bfill')

In [172]:
df.groupby(['현상번호(국내식)']).size()

현상번호(국내식)
1             576
10              8
1005            7
100601          2
1105            2
             ... 
71704001        1
7170401901      1
7201            1
8              10
805             1
Length: 111, dtype: int64

In [173]:
def is_fzrain(x):
    if pd.isna(x):
        return 0
    x = str(x)
    if len(x) <= 2:
        if x in ['3', '7', '12', '15', '18']:
            return 1
    else:
        xx = [x[i:i+2] for i in range(0, len(x), 2)]
        for x in xx:
            if x in ['03', '07', '12', '15', '18']:
                return 1
    return 0

In [174]:
is_fzrain('1')

0

In [175]:
is_fzrain('3'), is_fzrain('03')

(1, 0)

In [176]:
is_fzrain('12'), is_fzrain('7112401901')

(1, 1)

In [177]:
df['FZ_flag'] = df['현상번호(국내식)'].map(is_fzrain)

In [178]:
df.groupby(['FZ_flag']).size()

FZ_flag
0    28946
1       69
dtype: int64

In [180]:
df.to_csv('FZ_Data/FZ_all2.csv', index=False, encoding='euc-kr')

In [181]:
df.describe()

Unnamed: 0,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),일조(hr),일사(MJ/m2),적설(cm),3시간신적설(cm),전운량(10분위),중하층운량(10분위),최저운고(100m ),시정(10m),지면온도(°C),FZ_flag
count,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0,29015.0
mean,-0.226562,0.047317,2.738394,195.293469,63.176364,4.033472,-7.423405,972.53989,1022.45948,0.237577,0.277348,2.241151,0.472886,4.210495,2.775427,10.222247,1249.360951,0.829071,0.002378
std,6.622321,0.464496,2.406592,115.410034,21.600016,2.22836,7.508438,36.958702,6.466127,0.398648,0.588484,6.795927,0.854326,3.865776,3.183531,7.940813,611.47586,5.953886,0.048708
min,-26.2,0.0,0.0,0.0,0.0,0.5,-30.0,903.3,993.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,-25.0,0.0
25%,-4.5,0.0,0.9,70.0,46.0,2.3,-12.7,929.9,1018.4,0.0,0.01,0.0,0.0,0.0,0.0,4.0,800.0,-2.591667,0.0
50%,-0.2,0.0,2.2,250.0,65.0,3.7,-6.8,990.9,1022.8,0.0,0.01,0.0,0.2,3.333333,1.333333,9.0,1200.0,-0.1,0.0
75%,4.1,0.0,3.9,270.0,82.0,5.3,-1.9,1002.0,1026.9,0.4,0.11,0.0,0.5,8.0,5.666667,10.736068,1800.0,3.866667,0.0
max,20.6,23.5,17.7,360.0,99.0,16.4,14.4,1028.1,1044.3,1.0,3.26,74.7,24.5,10.0,10.0,55.0,3000.0,30.6,1.0


In [182]:
df1 = df.loc[df['지점명']=='대관령']

In [183]:
df1.corr()

Unnamed: 0,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),일조(hr),일사(MJ/m2),적설(cm),3시간신적설(cm),전운량(10분위),중하층운량(10분위),최저운고(100m ),시정(10m),지면온도(°C),FZ_flag
기온(°C),1.0,0.045373,0.116781,-0.005081,-0.117022,0.732829,0.758031,-0.128245,-0.5137,0.217457,0.305539,-0.315643,0.061974,0.305766,0.246366,-0.04886,-0.192951,0.835789,-0.035718
강수량(mm),0.045373,1.0,-0.032803,-0.073225,0.132005,0.127902,0.115174,-0.051573,-0.076955,-0.067745,-0.055161,0.068394,0.237297,0.165399,0.169739,-0.075008,-0.174839,0.046486,-0.008023
풍속(m/s),0.116781,-0.032803,1.0,0.490506,-0.392252,-0.099483,-0.124846,-0.491337,-0.491315,0.22654,0.235784,-0.0709,0.050728,-0.167707,-0.090867,-0.06624,0.143741,0.046127,-0.075246
풍향(16방위),-0.005081,-0.073225,0.490506,1.0,-0.376514,-0.201945,-0.211507,-0.258867,-0.251138,0.100927,0.073085,-0.071878,-0.001519,-0.267718,-0.23191,0.058476,0.245938,-0.084683,-0.050515
습도(%),-0.117022,0.132005,-0.392252,-0.376514,1.0,0.501223,0.530531,0.119583,0.1116,-0.557177,-0.519808,0.060918,0.056366,0.455826,0.50748,-0.252757,-0.620754,-0.02966,0.078898
증기압(hPa),0.732829,0.127902,-0.099483,-0.201945,0.501223,1.0,0.958671,-0.048811,-0.373483,-0.185516,-0.090462,-0.233381,0.083416,0.521521,0.516775,-0.209567,-0.524831,0.666629,-0.002678
이슬점온도(°C),0.758031,0.115174,-0.124846,-0.211507,0.530531,0.958671,1.0,-0.035256,-0.378474,-0.187157,-0.090042,-0.228746,0.092166,0.535085,0.520987,-0.204351,-0.552898,0.673698,0.010401
현지기압(hPa),-0.128245,-0.051573,-0.491337,-0.258867,0.119583,-0.048811,-0.035256,1.0,0.804627,-0.010112,-0.057298,0.15704,-0.055766,-0.078677,-0.109802,0.100856,0.15332,-0.119571,0.031756
해면기압(hPa),-0.5137,-0.076955,-0.491315,-0.251138,0.1116,-0.373483,-0.378474,0.804627,1.0,-0.054764,-0.124412,0.216696,-0.121929,-0.192402,-0.218962,0.150205,0.226063,-0.455549,0.026621
일조(hr),0.217457,-0.067745,0.22654,0.100927,-0.557177,-0.185516,-0.187157,-0.010112,-0.054764,1.0,0.855257,0.064012,-0.045332,-0.311778,-0.318724,0.100895,0.364177,0.200433,-0.032814


In [184]:
df2 = df.loc[df['지점명']=='영월']

In [185]:
df2.corr()

Unnamed: 0,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),일조(hr),일사(MJ/m2),적설(cm),3시간신적설(cm),전운량(10분위),중하층운량(10분위),최저운고(100m ),시정(10m),지면온도(°C),FZ_flag
기온(°C),1.0,0.04631922,0.2472334,0.07565673,-0.2604086,0.6758182,0.6942386,-0.3978142,-0.4958314,0.2538284,-2.940459e-15,-0.1179418,0.0005821016,0.2812954,0.2301877,-0.09697794,-0.08123657,0.927577,-0.03518735
강수량(mm),0.04631922,1.0,0.004128413,-0.003512626,0.1067985,0.1355751,0.1124092,-0.125337,-0.1238865,-0.04984238,-2.251991e-15,0.06090804,0.0679758,0.1299611,0.1188123,-0.06410478,-0.1107361,0.03781147,-0.002743537
풍속(m/s),0.2472334,0.004128413,1.0,0.4306414,-0.5663141,-0.1239536,-0.1664374,-0.326,-0.3352652,0.3665496,2.715629e-15,0.06127728,0.00167441,-0.03176325,-0.00173951,-0.08859017,0.2583784,0.2792412,-0.02213104
풍향(16방위),0.07565673,-0.003512626,0.4306414,1.0,-0.3328353,-0.1246091,-0.1606233,-0.1882465,-0.184127,0.1882431,-1.917378e-16,0.03445881,0.008853798,-0.02740496,-0.002514852,-0.03755633,0.1134231,0.1049336,-0.01545824
습도(%),-0.2604086,0.1067985,-0.5663141,-0.3328353,1.0,0.4470354,0.4818033,0.03030213,0.05898783,-0.5513788,-2.752279e-14,0.08965554,0.1000788,0.3287117,0.3730387,-0.1639824,-0.6201905,-0.2780216,0.04009884
증기압(hPa),0.6758182,0.1355751,-0.1239536,-0.1246091,0.4470354,1.0,0.9520029,-0.3694213,-0.4300696,-0.1809358,-9.872383e-15,-0.05195648,0.06249805,0.4696932,0.4678029,-0.2096496,-0.4791182,0.5872174,-0.01639831
이슬점온도(°C),0.6942386,0.1124092,-0.1664374,-0.1606233,0.4818033,0.9520029,1.0,-0.3366572,-0.40235,-0.1903435,-8.781977e-16,-0.03940612,0.0772968,0.4888895,0.4766734,-0.2030921,-0.5202185,0.5971626,-0.01329728
현지기압(hPa),-0.3978142,-0.125337,-0.326,-0.1882465,0.03030213,-0.3694213,-0.3366572,1.0,0.9922816,0.07221409,-6.453264e-13,-0.03371252,-0.08451452,-0.2649038,-0.2659699,0.2016229,0.1704177,-0.351964,0.03743966
해면기압(hPa),-0.4958314,-0.1238865,-0.3352652,-0.184127,0.05898783,-0.4300696,-0.40235,0.9922816,1.0,0.04988154,-1.716242e-12,-0.01538818,-0.0786892,-0.2810351,-0.2764243,0.2025406,0.1648743,-0.442253,0.04036361
일조(hr),0.2538284,-0.04984238,0.3665496,0.1882431,-0.5513788,-0.1809358,-0.1903435,0.07221409,0.04988154,1.0,3.008382e-15,-0.02182006,-0.06167921,-0.308914,-0.3097024,0.04597726,0.2633054,0.3609345,0.001655139


In [186]:
df3 = df.loc[df['지점명']=='서울']

In [233]:
df3.corr()

Unnamed: 0,Temp,Rain,WindSpeed,WindDir,Moist,hPa,DewTemp,CurhPa,SeahPa,Daylight,DaylightMJ,SnowCm,Snow3hr,A,B,C,D,SurfaceTemp,FZ_flag
Temp,1.0,0.032257,0.019254,-0.155702,0.038879,0.720154,0.753834,-0.417711,-0.450568,0.075163,0.170294,-0.388921,0.091403,0.3188,0.256045,-0.076491,-0.231956,0.849569,0.001352
Rain,0.032257,1.0,0.009126,-0.048988,0.171938,0.138139,0.116685,-0.122265,-0.121266,-0.057089,-0.050574,0.042535,0.134337,0.140459,0.13662,-0.076724,-0.137957,0.028585,0.003894
WindSpeed,0.019254,0.009126,1.0,0.319842,-0.248831,-0.1058,-0.142642,-0.190911,-0.18864,0.254924,0.275412,-0.05525,0.01353,-0.007498,0.03387,-0.049889,0.183956,0.179317,0.033008
WindDir,-0.155702,-0.048988,0.319842,1.0,-0.254369,-0.222321,-0.274018,-0.007744,-0.001246,0.159584,0.140601,0.01883,0.016538,-0.239637,-0.199721,-0.026708,0.137289,-0.017405,-0.035058
Moist,0.038879,0.171938,-0.248831,-0.254369,1.0,0.634665,0.662555,-0.284697,-0.281831,-0.503928,-0.491137,0.182119,0.028132,0.422373,0.485244,-0.312099,-0.665075,-0.087734,-0.015787
hPa,0.720154,0.138139,-0.1058,-0.222321,0.634665,1.0,0.949548,-0.475266,-0.49602,-0.252353,-0.18964,-0.183714,0.073202,0.457296,0.478817,-0.249794,-0.571944,0.552173,-0.01827
DewTemp,0.753834,0.116685,-0.142642,-0.274018,0.662555,0.949548,1.0,-0.482941,-0.505037,-0.278655,-0.204545,-0.177812,0.077805,0.483955,0.481672,-0.246277,-0.587438,0.548544,-0.014364
CurhPa,-0.417711,-0.122265,-0.190911,-0.007744,-0.284697,-0.475266,-0.482941,1.0,0.999289,0.131515,0.066907,0.077931,-0.251224,-0.348655,-0.357994,0.2103,0.297254,-0.373164,-0.003954
SeahPa,-0.450568,-0.121266,-0.18864,-0.001246,-0.281831,-0.49602,-0.505037,0.999289,1.0,0.127404,0.060051,0.091425,-0.24881,-0.355561,-0.362374,0.210496,0.301296,-0.400795,-0.003873
Daylight,0.075163,-0.057089,0.254924,0.159584,-0.503928,-0.252353,-0.278655,0.131515,0.127404,1.0,0.844521,-0.001215,-0.013863,-0.345573,-0.337355,0.079628,0.249289,0.288947,-0.017892


In [188]:
from sklearn.linear_model import LogisticRegression

In [200]:
feature_cols = ['기온(°C)', '강수량(mm)', '풍속(m/s)', '풍향(16방위)',  '습도(%)', 
       '증기압(hPa)', '이슬점온도(°C)', '현지기압(hPa)', '해면기압(hPa)',
       '일조(hr)', '일사(MJ/m2)', '적설(cm)', '3시간신적설(cm)',
       '전운량(10분위)', '중하층운량(10분위)', '최저운고(100m )', '시정(10m)', '지면온도(°C)']

In [213]:
df3.isnull().sum()

지점                 0
지점명                0
일시                 0
Temp               0
Rain               0
WindSpeed          0
WindDir            0
Moist              0
hPa                0
DewTemp            0
CurhPa             0
SeahPa             0
Daylight           0
DaylightMJ         0
SnowCm             0
Snow3hr            0
A                  0
B                  0
운형(운형약어)        4615
C                  0
D                  0
지면상태(지면상태코드)       0
현상번호(국내식)       4845
SurfaceTemp        0
FZ_flag            0
dtype: int64

In [201]:
X = df3.loc[:, feature_cols]
y = df3['FZ_flag'].values

In [202]:
logreg = LogisticRegression().fit(X, y)



In [203]:
df3.rename(columns = {'기온(°C)': 'Temp', 
                      '강수량(mm)': 'Rain', 
                      '풍속(m/s)': 'WindSpeed', 
                      '풍향(16방위)': 'WindDir',  
                      '습도(%)': 'Moist',
                      '증기압(hPa)': 'hPa', 
                      '이슬점온도(°C)': 'DewTemp', 
                      '현지기압(hPa)': 'CurhPa', 
                      '해면기압(hPa)': 'SeahPa',
                      '일조(hr)': 'Daylight', 
                      '일사(MJ/m2)': 'DaylightMJ', 
                      '적설(cm)': 'SnowCm', 
                      '3시간신적설(cm)': 'Snow3hr',
                      '전운량(10분위)': 'A', 
                      '중하층운량(10분위)': 'B', 
                      '최저운고(100m )': 'C', 
                      '시정(10m)': 'D', 
                      '지면온도(°C)': 'SurfaceTemp'
                     }, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [212]:
df3.columns

Index(['지점', '지점명', '일시', 'Temp', 'Rain', 'WindSpeed', 'WindDir', 'Moist',
       'hPa', 'DewTemp', 'CurhPa', 'SeahPa', 'Daylight', 'DaylightMJ',
       'SnowCm', 'Snow3hr', 'A', 'B', '운형(운형약어)', 'C', 'D', '지면상태(지면상태코드)',
       '현상번호(국내식)', 'SurfaceTemp', 'FZ_flag'],
      dtype='object')

In [206]:
import statsmodels.api as sm

In [210]:
#model = sm.Logit.from_formula('FZ_flag ~ Temp + Rain + WindSpeed + WindDir + Moist + hPa + DewTemp + CurhPa + SeahPa + Daylight + DaylightMJ + SnowCm + Snow3hr + A + B + C + D + SurfaceTemp', df3)

In [228]:
df3.groupby('SurfaceTemp').size()

SurfaceTemp
-13.2    3
-13.1    3
-13.0    1
-12.9    1
-12.8    1
        ..
 23.9    1
 24.1    1
 24.3    1
 24.9    1
 25.0    1
Length: 1810, dtype: int64

In [229]:
df3.groupby('FZ_flag').size()

FZ_flag
0    7241
1       7
dtype: int64

In [231]:
df.dtypes

지점                      object
지점명                     object
일시              datetime64[ns]
기온(°C)                 float64
강수량(mm)                float64
풍속(m/s)                float64
풍향(16방위)                 int64
습도(%)                  float64
증기압(hPa)               float64
이슬점온도(°C)              float64
현지기압(hPa)              float64
해면기압(hPa)              float64
일조(hr)                 float64
일사(MJ/m2)              float64
적설(cm)                 float64
3시간신적설(cm)             float64
전운량(10분위)              float64
중하층운량(10분위)            float64
운형(운형약어)                object
최저운고(100m )            float64
시정(10m)                float64
지면상태(지면상태코드)            object
현상번호(국내식)               object
지면온도(°C)               float64
FZ_flag                  int64
dtype: object

In [220]:
model = sm.Logit.from_formula('FZ_flag ~ Temp + Rain + WindSpeed + Moist + hPa + CurhPa + SurfaceTemp', df3)

In [230]:
result_med = model.fit(maxiter=100)
print(result_med.summary())

Optimization terminated successfully.
         Current function value: 0.005518
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:                FZ_flag   No. Observations:                 7248
Model:                          Logit   Df Residuals:                     7240
Method:                           MLE   Df Model:                            7
Date:                Tue, 11 Feb 2020   Pseudo R-squ.:                  0.2806
Time:                        17:50:47   Log-Likelihood:                -39.996
converged:                       True   LL-Null:                       -55.595
Covariance Type:            nonrobust   LLR p-value:                 5.719e-05
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      43.0153     79.042      0.544      0.586    -111.903     197.934
Temp            1.0219   