In [1]:
# module import

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.impute import KNNImputer
import datetime
import glob

# Train

In [14]:
# Set the train file paths.
# global_path: glob pattern handed to glob.glob() by load_data().
# region_path: directory prefix where split_region() writes, and imputation()
#              reads, the per-region CSV files.
# knn_path:    directory prefix where imputation() writes the KNN-imputed CSVs.
# These are module-level names read by the functions at call time.
global_path = './uv_data_train/*'
region_path = './region_train/'
knn_path = './KNN_train/'


In [15]:
def load_data():
    """Read every file matched by ``global_path`` into one DataFrame.

    Each file is parsed with ``pd.read_csv`` and its columns are renamed
    positionally to the 28-name train schema below, so every input file must
    have exactly 28 columns.

    Returns:
        pd.DataFrame: the rows of all files stacked in sorted-path order.
        Each file keeps its own row labels (no ``ignore_index``). An empty
        DataFrame is returned when the pattern matches no file.
    """
    columns = ["Unnamed: 0", "yyyymmdd", "hhnn", "stn", "lon", "lat", "uv",
               "band1", "band2", "band3", "band4", "band5", "band6", "band7", "band8",
               "band9", "band10", "band11", "band12", "band13", "band14", "band15", "band16",
               "solarza", "sateza", "esr", "height", "landtype"]

    frames = []
    # glob.glob() returns paths in arbitrary order; sort for a reproducible result.
    for path in sorted(glob.glob(global_path)):
        df = pd.read_csv(path)
        df.columns = columns
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # NOTE: 'Unnamed: 0' is deliberately kept. The previous code called
    # all_data.drop('Unnamed: 0', axis=1) without using the result (a no-op),
    # and imputation() reads the region CSVs with index_col=0, i.e. it expects
    # this column to be the first one in the file.
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames)

In [16]:
def split_region(all_data):
    """Label each row with its region and write one CSV per region.

    The caller's frame is modified: 'lon' is rounded to two decimals and a
    'region' column is added by looking the rounded longitude up in the table
    below. Rows whose longitude is not in the table get NaN as region and end
    up in no file. Every region in the table produces a file
    ``{region_path}{region}.csv`` (written without the index), even when it
    has no rows.

    Args:
        all_data: DataFrame with a numeric 'lon' column.
    """
    lon_to_region = {
        126.16: "고산", 128.89: "강릉", 126.96: "서울", 126.62: "인천",
        130.9: "울릉도독도", 127.44: "청주", 126.33: "안면도", 127.37: "대전",
        129.38: "포항", 128.65: "대구", 127.12: "전주", 129.33: "울산",
        126.89: "광주", 129.03: "부산", 126.38: "목포",
    }

    rounded_lon = all_data['lon'].round(2)
    all_data['lon'] = rounded_lon
    all_data['region'] = rounded_lon.map(lon_to_region)

    # dicts keep insertion order, so files are written in the table's order.
    for name in lon_to_region.values():
        rows = all_data.loc[all_data['region'] == name]
        rows.to_csv(f'{region_path}{name}.csv', index=False)

In [17]:
# Zero-pad the hhnn column to a fixed width
def fit_hhnn(data):
    """Left-pad every value of ``data['hhnn']`` with zeros to four characters.

    The column is converted to strings and replaced in ``data`` itself, e.g.
    5 -> '0005', 130 -> '0130', 2359 -> '2359'. Values whose string form is
    already four or more characters long are kept as they are (as strings).

    The previous implementation assigned through ``data['hhnn'][i]`` row by
    row: a chained assignment that is not guaranteed to write back to
    ``data`` and that looks rows up by label, so it required a 0..n-1 index.
    The vectorised form works for any index.

    Args:
        data: DataFrame with an 'hhnn' column; modified in place.

    Returns:
        None.
    """
    data['hhnn'] = data['hhnn'].astype(str).str.zfill(4)

In [18]:
# Convert the date and time columns into a single datetime column
def to_datetime(data):
    """Merge 'yyyymmdd' and 'hhnn' into a datetime column 'yyyymmddhhnn'.

    ``data`` is modified in place: both columns are cast to int, 'hhnn' is
    zero-padded by ``fit_hhnn``, the new 'yyyymmddhhnn' column is parsed with
    the format ``%Y%m%d%H%M``, the rows are sorted by it in ascending order,
    and the two source columns are dropped.

    Args:
        data: DataFrame with 'yyyymmdd' and 'hhnn' columns convertible to int.

    Returns:
        pd.DataFrame: the same ``data`` object. The previous code returned the
        result of ``drop(..., inplace=True)``, which is always None; callers
        that ignore the return value are unaffected.
    """
    # Cast to int first so the string form carries no decimal part.
    data['yyyymmdd'] = data['yyyymmdd'].astype(int)
    data['hhnn'] = data['hhnn'].astype(int)

    # Zero-pad hhnn to four characters
    fit_hhnn(data)

    # Concatenate date and time as strings (vectorised, no row-wise apply)
    stamp = data['yyyymmdd'].astype(str) + data['hhnn'].astype(str)

    # Parse str -> datetime
    data['yyyymmddhhnn'] = pd.to_datetime(stamp, format='%Y%m%d%H%M')
    data.sort_values(by=['yyyymmddhhnn'], inplace=True, ascending=True)

    data.drop(columns=['yyyymmdd', 'hhnn'], inplace=True)
    return data

In [19]:
def imputation(df):
    """KNN-impute one region's train CSV and write the result under ``knn_path``.

    Reads ``{region_path}{df}.csv`` (first column used as the index), treats
    -999 as missing, fills missing values with ``KNNImputer(n_neighbors=5)``,
    converts the date/time columns to a single datetime column via
    ``to_datetime`` and writes ``{knn_path}{df}_knn_train.csv`` without the
    index.

    NOTE: only 'region', 'hhnn' and 'yyyymmdd' are removed before imputing, so
    every other column of the file (including 'uv') takes part in the
    imputation and is itself imputed.

    Args:
        df: region name (a string, despite the parameter name); it selects the
            input file and names the output file.

    Returns:
        None.
    """
    region = df
    raw = pd.read_csv(f'{region_path}{region}.csv', index_col=0)

    # Set the date and time columns aside so they are not fed to the imputer.
    yyyymmdd = raw['yyyymmdd'].to_numpy()
    hhnn = raw['hhnn'].to_numpy()

    # Treat -999 as missing (np.nan: the np.NaN alias no longer exists in NumPy 2).
    features = raw.replace(-999, np.nan).drop(columns=['region', 'hhnn', 'yyyymmdd'])

    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(features)

    # KNNImputer omits columns that are entirely NaN from its output, so label
    # the result with the surviving columns only, then restore the full column
    # set (omitted columns come back as NaN) to keep the schema stable.
    if imputed.shape[1] == features.shape[1]:
        kept_cols = features.columns
    else:
        kept_cols = features.columns[features.notna().any().to_numpy()]
    df_imputed = pd.DataFrame(imputed, columns=kept_cols).reindex(columns=features.columns)

    # Re-attach the columns that were set aside (positional, same row order).
    df_imputed['yyyymmdd'] = yyyymmdd
    df_imputed['hhnn'] = hhnn

    # Convert date and time to datetime; relies on to_datetime() modifying
    # df_imputed in place, its return value is not used.
    to_datetime(df_imputed)

    df_imputed.to_csv(f'{knn_path}{region}_knn_train.csv', index=False)
    

In [20]:
# Run the train preprocessing
all_data = load_data()

# Split by region (writes one CSV per region under region_path)
split_region(all_data)
regions = ['고산','강릉','서울','인천','울릉도독도','청주','안면도','대전','포항','대구','전주','울산','광주','부산','목포']

# imputation() loads each region's CSV itself, so the file is not read here
# (the previous loop read it into the loop variable and discarded it).
for region in regions:
    imputation(region)

----------------------------------------

# Test

In [2]:
# Set the test file paths.
# NOTE: this rebinds the same module-level names used for the train run, so
# every function that reads global_path / region_path / knn_path works on the
# test locations from here on.
global_path = './uv_data_test/*'
region_path = './region_test/'
knn_path = './KNN_test/'


In [3]:
def load_data():
    """Read every test file matched by ``global_path`` into one DataFrame.

    NOTE: this definition replaces any earlier ``load_data`` in the notebook.

    Each file is parsed with ``pd.read_csv`` and its columns are renamed
    positionally to the 26-name test schema below, so every input file must
    have exactly 26 columns.

    Returns:
        pd.DataFrame: the rows of all files stacked in sorted-path order.
        Each file keeps its own row labels (no ``ignore_index``). An empty
        DataFrame is returned when the pattern matches no file.
    """
    columns = ["YearMonthDayHourMinute", "stn", "lon", "lat", "uv",
               "band1", "band2", "band3", "band4", "band5", "band6", "band7", "band8",
               "band9", "band10", "band11", "band12", "band13", "band14", "band15", "band16",
               "solarza", "sateza", "esr", "height", "landtype"]

    frames = []
    # glob.glob() returns paths in arbitrary order; sort for a reproducible result.
    for path in sorted(glob.glob(global_path)):
        df = pd.read_csv(path)
        df.columns = columns
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames)

In [4]:
def imputation(df):
    """KNN-impute one region's test CSV and write the result under ``knn_path``.

    NOTE: this definition replaces any earlier ``imputation`` in the notebook.

    Reads ``{region_path}{df}.csv``, treats -999 as missing, removes 'region',
    'uv' and 'YearMonthDayHourMinute', fills missing values in the remaining
    columns with ``KNNImputer(n_neighbors=5)``, adds a constant ``uv = 0``
    column and a datetime column 'yyyymmddhhnn' parsed from
    'YearMonthDayHourMinute', and writes ``{knn_path}{df}_knn_test.csv``
    without the index.

    Args:
        df: region name (a string, despite the parameter name); it selects the
            input file and names the output file.

    Returns:
        None.
    """
    region = df
    raw = pd.read_csv(f'{region_path}{region}.csv')

    # Set the timestamp column aside so it is not fed to the imputer.
    index = raw['YearMonthDayHourMinute'].to_numpy()

    # Treat -999 as missing (np.nan: the np.NaN alias no longer exists in NumPy 2).
    features = raw.replace(-999, np.nan).drop(
        columns=['region', 'uv', 'YearMonthDayHourMinute'])

    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(features)

    # KNNImputer omits columns that are entirely NaN from its output, so label
    # the result with the surviving columns only, then restore the full column
    # set (omitted columns come back as NaN) to keep the schema stable.
    if imputed.shape[1] == features.shape[1]:
        kept_cols = features.columns
    else:
        kept_cols = features.columns[features.notna().any().to_numpy()]
    df_imputed = pd.DataFrame(imputed, columns=kept_cols).reindex(columns=features.columns)

    # Re-attach what was set aside; uv is written as a constant 0 column.
    df_imputed['uv'] = 0
    df_imputed['yyyymmddhhnn'] = index

    # Convert the timestamp to datetime
    df_imputed['yyyymmddhhnn'] = pd.to_datetime(df_imputed['yyyymmddhhnn'], format='%Y%m%d%H%M')

    df_imputed.to_csv(f'{knn_path}{region}_knn_test.csv', index=False)

In [7]:
# Run the test preprocessing
all_data = load_data()

# Split by region (writes one CSV per region under region_path)
split_region(all_data)
regions = ['고산','강릉','서울','인천','울릉도독도','청주','안면도','대전','포항','대구','전주','울산','광주','부산','목포']

# imputation() loads each region's CSV itself, so the file is not read here
# (the previous loop read it into the loop variable and discarded it).
for region in regions:
    imputation(region)