In [19]:
from datetime import datetime
from datetime import timedelta
import requests  # requests 모듈 임포트
import csv
import numpy as np
from io import StringIO
import pandas as pd
from time import sleep
import logging
import math
import urllib3
import random
from statsmodels.tsa.stattools import adfuller

In [31]:
class aws:
    
    code_needed = ["402", "403", "413", "415", "421", "510", "889"]
    colnames_AWS = ["time", "temp", "windDir", "windSpd", "precip"]
    code_dict = {"강동": "402", "광진": "413", "송파": "403", "용산": "415", "성동": "421", "영등포": "510", "현충원": "889"}
    columns = ['STN', 'windDir', 'windSpd', 'precip', 'time']
    
    data_dict = dict()
    
    def __init__(self):
        
        self.aws_data = pd.DataFrame(columns = self.columns)
        
    def update_data(self):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        
        self.get_past_data()
        
        tm2 = datetime.now()
        tm1 = self.data_dict[self.code_needed[0]]['time'].iloc[len(self.data_dict[self.code_needed[0]])-1]
        tm1 = datetime.strptime(tm1, '%Y-%m-%d %H:%M:%S')
        tm1 += timedelta(hours = 1)
        
        hourly_times = []
        
        cur = tm1
        while cur <= tm2:
            hourly_times.append(cur)
            cur += timedelta(hours=1)

        for time in hourly_times:
            time = time.strftime("%Y%m%d%H%M")
            print("------"+time+"------")
            temp_data = self.make_aws(time)
            # print(temp_data[temp_data.iloc[:,1] in ["402", "403", "413", "415", "421", "510", "889"]])
            self.aws_data = pd.concat([self.aws_data, self.data_preprocessing(temp_data)], axis = 0)
        
        if hourly_times: # Additional data has joined
            self.data_merge()
            self.save_data()
    
    @classmethod
    def get_past_data(cls):
        for code in cls.code_needed:
            path = "./dataset/climate_data_"+str(code)+".csv"
            cls.data_dict[code] = pd.read_csv(path, header = 0, names = cls.colnames_AWS)
    
    @staticmethod
    def make_aws(time, num_of_seconds_to_wait = 3):
        url = f"https://apihub.kma.go.kr/api/typ01/url/awsh.php?tm={time}&help=0&authKey=ri4L36zbQcuuC9-s27HL3A"
        try:
            response_csv = requests.get(url, verify = False)
            print(response_csv.status_code)
            if response_csv.status_code not in (200, 204, 202):
                random_num_of_seconds = 0
                if random_num_of_seconds <= 50:
                    random_num_of_seconds = random.randint(num_of_seconds_to_wait, num_of_seconds_to_wait + 3)
                    sleep(random_num_of_seconds)
                    return AWS.make_aws(time, num_of_seconds_to_wait = num_of_seconds_to_wait + 3)
                else:
                    raise Exception(f'Your request failed with the following error: {response_csv.status_code}')
            else:
                aws_data = pd.read_csv(StringIO(response_csv.text))
                return aws_data
        except Exception as e:
            logging.warning(f'Http request failed with url={url}')
            logging.warning(e)
            raise e
    
    @staticmethod
    def data_preprocessing(df):
        df = pd.DataFrame(df)
        # code_needed = ["402", "403", "413", "415", "421", "510", "889"]
        
        column_name = (df.iloc[0, 0].strip().split())[1:]
        data_split = [x.strip().split() for x in df.iloc[2:, 0]]
        aws_data = pd.DataFrame(data_split, columns = column_name)
        aws_data.drop(aws_data.shape[0]-1, axis=0, inplace=True)
        aws_data.reset_index()

        aws_data['time'] = pd.to_datetime(aws_data['YYMMDDHHMI'], format = '%Y%m%d%H%M%S', errors='coerce')
        aws_data = aws_data.rename(columns={'TA': 'temp', 'WD': 'windDir', 'WS': 'windSpd', 'RN_HR1': 'precip'})
        aws_data = aws_data.astype({'temp' : float, 'windDir' : float, 'windSpd' : float, 'precip': float})
        aws_data = aws_data[aws_data['STN'].isin(aws.code_needed)]
        aws_data = aws_data.reset_index()
        aws_data.drop(['index', 'YYMMDDHHMI', 'RN_DAY', 'HM', 'PA', 'PS'], axis = 1, inplace = True)
        aws_data['time'] = aws_data['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
        print(aws_data)
        
        return aws_data
    
    def data_merge(self):
        df = self.aws_data
        for code in self.code_needed:
            df_temp = df[df['STN'] == code]
            df_temp = df_temp.reset_index()
            df_temp.drop(['STN', 'index'], inplace = True, axis = 1)
            df_temp = df_temp[['time', 'temp', 'windDir', 'windSpd', 'precip']]
            self.data_dict[code] = pd.concat([self.data_dict[code], df_temp], axis = 0)
            self.data_dict[code] = self.data_dict[code].reset_index()
            self.data_dict[code].drop(['index'], axis = 1, inplace = True)
            
    def save_data(self):
        for code in self.code_needed:
            self.data_dict[code].to_csv(f"./dataset/climate_data_{code}.csv", index = False)
    
    @classmethod
    def save_EDAdata(cls):
        for code in cls.code_needed:
            cls.data_dict[code].to_csv(f"./dataset_cleaning/climate_data_{code}.csv", index = False)
        print("Successfully saved")   
        
    def wind_dir_triangulation(self, name, theta):
        theta = math.radians(theta)
        code = self.code_dict[name]
        self.data_dict[code]["vertical_dir"] = -(self.data_dict[code]["windDir"]+theta).map(math.radians).map(math.cos)
        self.data_dict[code]["horizon_dir"] = (self.data_dict[code]["windDir"]+theta).map(math.radians).map(math.sin)
        
    def wind_dir_triangulation(self, code, theta):
        theta = math.radians(theta)
        self.data_dict[code]["vertical_dir"] = -(self.data_dict[code]["windDir"]+theta).map(math.radians).map(math.cos)
        self.data_dict[code]["horizon_dir"] = (self.data_dict[code]["windDir"]+theta).map(math.radians).map(math.sin)
        
    def apply_windSpd_windDif(self):
        for aws_code in AWS.code_needed:
            AWS.data_dict[aws_code]['vertical_dir'] = AWS.data_dict[aws_code]['vertical_dir'] * AWS.data_dict[aws_code]['windSpd']
            AWS.data_dict[aws_code]['horizon_dir'] = AWS.data_dict[aws_code]['horizon_dir'] * AWS.data_dict[aws_code]['windSpd']
            AWS.data_dict[aws_code].drop(['windDir', 'windSpd'], axis = 1, inplace = True)
          
    def missingInput(self):
        for code in self.code_needed:
            print("\n")
            print("code " + str(code) + " proceeding...")
            i = 0
            while self.data_dict[code][['precip', 'horizon_dir', 'vertical_dir', 'temp', 'windSpd']].isnull().values.any():
                print(i, end = "\r")
                i += 1
                self.data_dict[code]['precip'] = self.data_dict[code]['precip'].rolling(window = 3, min_periods = 1).mean()
                self.data_dict[code]['horizon_dir'] = self.data_dict[code]['horizon_dir'].rolling(window = 3, min_periods = 1).mean()
                self.data_dict[code]['vertical_dir'] = self.data_dict[code]['vertical_dir'].rolling(window = 3, min_periods = 1).mean()
                self.data_dict[code]['temp'] = self.data_dict[code]['temp'].interpolate().values
                self.data_dict[code]['windSpd'] = self.data_dict[code]['windSpd'].interpolate().values
                
    @staticmethod         
    def adf_subtest(df, code, col):
        dftest = adfuller(df, autolag="AIC")
        dfoutput = pd.DataFrame({
            "code": code,
            "component": col,
            "Test Statistic": dftest[0],
            "p-value": dftest[1],
            "#Lags Used": dftest[2],
            "Number of Observations Used": dftest[3]
        }, index = [0])

        return dfoutput


    def adf_test(self):
        adf_dict = pd.DataFrame()
        for code in self.code_needed:
            print(str(code) + ": ")
            for i, col in enumerate(self.data_dict[code].columns):
                if i == 0: continue
                print(str(col) + "                    ", end = "\r")
                adf_dict = pd.concat([adf_dict, self.adf_subtest(self.data_dict[code][col], code, col)])
            print("")    
        adf_dict.reset_index()
        return adf_dict
        # new version of aws class

In [35]:
class lvl():
    code_needed = ["1018640", "1018662", "1018680", "1018683"]
    angle_dict = {"1018640" : 160, "1018662" : 70, "1018680" : 110, "1018683" : 62}
    matching_dict = {"1018640" : ["402", "403", "413"], "1018662": ["403", "413", "421"], "1018680": ["415", "510", "889"], "1018683": ["415", "510", "889"]}
    colnames_lvl = ["lvl", "time"]
    data_dict = dict()
    
    def __init__(self):
        
        return
        
    def update_data(self):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        self.get_past_data()
        
        tm2 = datetime.now()
        tm2 -= timedelta(days = 1)
        tm2 = tm2.replace(hour = 0, minute = 0, second = 0)
        tm1 = self.data_dict[self.code_needed[0]]['time'].iloc[len(self.data_dict[self.code_needed[0]])-1]
        print(tm1)
        tm1 = tm1[:10]
        tm1 = datetime.strptime(tm1, '%Y-%m-%d')
        tm1 += timedelta(hours = 1)
        
        hour_list = self.generate_date_ranges(tm1, tm2)
    
        for code in self.code_needed:
            print(code+"------------")
            for time_list in hour_list:
                print(time_list)
                if type(time_list[0]) != str:
                    time_list_sub = [0, 0]
                    time_list_sub[0] = time_list[0]
                    time_list_sub[1] = time_list[1]
                    time_list_sub[0] = time_list_sub[0].strftime("%Y%m%d")
                    time_list_sub[1] = time_list_sub[1].strftime("%Y%m%d")
                    print(time_list)
                    print("------"+time_list_sub[0]+"~"+time_list_sub[1]+"------")
                    self.data_dict[code] = pd.concat([self.data_dict[code], self.data_preprocessing(self.make_lvl(code, time_list_sub[0], time_list_sub[1]))], axis = 0).reset_index()[:-1]
        
        
        
        if tm1 <= tm2: # Additional data joined.
            self.save_data()
    
    @classmethod
    def get_past_data(cls):
        for code in cls.code_needed:
            path = "./dataset/lvl_data_"+code+".csv"
            try:
                cls.data_dict[code] = pd.read_csv(path, header = 0, names = cls.colnames_lvl) 
            except FileNotFoundError as e:
                print(f'{e}')
                pass 
            
    @staticmethod        
    def generate_date_ranges(startdt, enddt, interval_months=6):
        date_ranges = []
        curdt = startdt

        while curdt < enddt:
            nextdt = min(curdt + timedelta(days=interval_months * 30), enddt)
            date_ranges.append([curdt, nextdt])
            curdt = nextdt
        
        return date_ranges
    
    @staticmethod
    def make_lvl(code, startdt, enddt, num_of_seconds_to_wait = 3):
        url = f"http://www.wamis.go.kr:8080/wamis/openapi/wkw/wl_hrdata"
        
        params = {
            'obscd': f'{code}',
            'startdt': f'{startdt}',
            'enddt': f'{enddt}'
        }
        num_of_seconds_to_wait= 3
        random_num_of_seconds = 0
        
        try:
            response_csv = requests.get(url, params = params, verify = False)

            if response_csv.status_code not in (200, 204, 202):
                if random_num_of_seconds <= 50: # 50초가 넘어가면 멈춘다.
                    random_num_of_seconds = random.randint(num_of_seconds_to_wait, num_of_seconds_to_wait + 3)
                    sleep(random_num_of_seconds)
                    return make_lvl(code, startdt, enddt,
                                        num_of_seconds_to_wait=num_of_seconds_to_wait + 3)
                else:
                    raise Exception(f'Your request failed with the following error: {response_csv.status_code}')
            else:
                lvl_data = pd.read_csv(StringIO(response_csv.text))
                lvl_data = lvl_data.columns.tolist()
                return lvl_data
        except Exception as e:
            logging.warning(f'Http request failed with url={url}')
            logging.warning(e)
            raise e
        
       
    @staticmethod
    def data_preprocessing(wat_data):
        filtered_wat_data = pd.DataFrame([(wat_data[i], wat_data[i+1]) for i in range(len(wat_data)-1) if i % 2 == 1])
        filtered_wat_data.drop([0], axis = 0, inplace = True)
        filtered_wat_data = filtered_wat_data.rename(columns={0: 'ymdh', 1: 'lvl'})
        filtered_wat_data['ymdh'] = filtered_wat_data['ymdh'].str.extract(r'"ymdh":"(\w{10})"')
        filtered_wat_data['lvl'] = filtered_wat_data['lvl'].str.extract(r'wl:"(.*)"')

        filtered_wat_data['date'] = filtered_wat_data['ymdh'].str[:-2]
        filtered_wat_data['date'] = pd.to_datetime(filtered_wat_data['date'], format='%Y%m%d')
        filtered_wat_data['date'].loc[filtered_wat_data['ymdh'].str.endswith('24')] += timedelta(days = 1)
        filtered_wat_data['ymdh'].loc[filtered_wat_data['ymdh'].str.endswith('24')] = filtered_wat_data['date'].dt.strftime('%Y%m%d')+"00:00:00"

        filtered_wat_data['time'] = pd.to_datetime(filtered_wat_data['ymdh'], format = '%Y%m%d%H', errors='coerce')
        filtered_wat_data.reset_index(inplace = True)
        filtered_wat_data.drop(['index', 'ymdh', 'date'], axis = 1, inplace = True)
        filtered_wat_data = filtered_wat_data.astype({'lvl' : float})
        
        
        return filtered_wat_data
    
    def save_data(self):
        for code in self.code_needed:
            self.data_dict[code].to_csv(f"./dataset/lvl_data_{code}.csv", index = False)
            
    @classmethod
    def save_EDAdata(cls):
        for code in cls.code_needed:
            cls.data_dict[code].to_csv(f"./dataset_cleaning/lvl_data_{code}.csv", index = False)
        print("Successfully saved")
            
    @staticmethod         
    def adf_subtest(df, code, col):
        dftest = adfuller(df, autolag="AIC")
        dfoutput = pd.DataFrame({
            "code": code,
            "component": col,
            "Test Statistic": dftest[0],
            "p-value": dftest[1],
            "#Lags Used": dftest[2],
            "Number of Observations Used": dftest[3]
        }, index = [0])

        return dfoutput


    def adf_test(self):
        adf_dict = pd.DataFrame()
        for code in self.code_needed:
            print(str(code) + ": waterlevel")
            adf_dict = pd.concat([adf_dict, self.adf_subtest(self.data_dict[code]["lvl"], code, "lvl")])
            print("")    
        adf_dict.reset_index()
        return adf_dict
    
    def missingInput(self):
        for code in self.code_needed:
            print("\n")
            print("code " + str(code) + " proceeding...")
            i = 0
            while self.data_dict[code][['lvl']].isnull().values.any():
                print(i, end = "\r")
                i += 1
                self.data_dict[code]['lvl'] = self.data_dict[code]['lvl'].interpolate().values

In [32]:
AWS = aws()
LVL = lvl()

In [34]:
AWS.update_data()
LVL.update_data()

------202310180100------
200
   STN  temp  windDir  windSpd  precip                 time
0  402   9.9     34.4      1.3     0.0  2023-10-18 01:00:00
1  403  12.2      0.0      0.0     0.0  2023-10-18 01:00:00
2  413  11.5    107.4      0.9     0.0  2023-10-18 01:00:00
3  415  11.6    343.5      0.6     0.0  2023-10-18 01:00:00
4  421  12.7      0.0      0.0     0.0  2023-10-18 01:00:00
5  510  14.5    157.9      1.0     0.0  2023-10-18 01:00:00
6  889   8.9    216.1      0.6     0.0  2023-10-18 01:00:00
------202310180200------
200
   STN  temp  windDir  windSpd  precip                 time
0  402   9.7     50.1      0.7     0.0  2023-10-18 02:00:00
1  403  12.0     68.6      0.1     0.0  2023-10-18 02:00:00
2  413  10.9    138.1      0.7     0.0  2023-10-18 02:00:00
3  415  11.2     27.0      0.6     0.0  2023-10-18 02:00:00
4  421  12.0     39.7      1.0     0.0  2023-10-18 02:00:00
5  510  13.7    166.3      0.7     0.0  2023-10-18 02:00:00
6  889   8.5    217.6      0.4     0.0  20



200
   STN  temp  windDir  windSpd  precip                 time
0  402   8.1    302.9      1.0     0.0  2023-10-21 01:00:00
1  403   9.8    297.9      1.6     0.0  2023-10-21 01:00:00
2  413   9.6    331.1      1.6     0.0  2023-10-21 01:00:00
3  415   9.6    135.7      0.4     0.0  2023-10-21 01:00:00
4  421   8.8    258.9      1.9     0.0  2023-10-21 01:00:00
5  510   9.8    286.8      0.8     0.0  2023-10-21 01:00:00
6  889   5.8    224.3      0.4     0.0  2023-10-21 01:00:00
------202310210200------


ConnectionError: HTTPSConnectionPool(host='apihub.kma.go.kr', port=443): Max retries exceeded with url: /api/typ01/url/awsh.php?tm=202310210200&help=0&authKey=ri4L36zbQcuuC9-s27HL3A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb760a3c9d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [6]:
AWS.data_dict["413"]

Unnamed: 0,time,temp,windDir,windSpd,precip
0,2018-01-01 01:00:00,-0.8,318.9,1.6,0.0
1,2018-01-01 02:00:00,-2.0,299.3,0.5,0.0
2,2018-01-01 03:00:00,-1.1,281.0,1.6,0.0
3,2018-01-01 04:00:00,-1.6,316.1,0.9,0.0
4,2018-01-01 05:00:00,-2.7,0.0,0.4,0.0
...,...,...,...,...,...
50881,2023-10-17 20:00:00,16.0,330.4,0.6,0.0
50882,2023-10-17 21:00:00,14.3,147.9,0.9,0.0
50883,2023-10-17 22:00:00,13.3,133.7,0.8,0.0
50884,2023-10-17 23:00:00,13.0,133.3,0.4,0.0


In [7]:
for lvl_code in LVL.code_needed:
    for aws_code in LVL.matching_dict[lvl_code]:
        AWS.wind_dir_triangulation(aws_code, LVL.angle_dict[lvl_code])

In [8]:
AWS.missingInput()
LVL.missingInput()



code 402 proceeding...


KeyError: "['horizon_dir', 'vertical_dir'] not in index"

In [9]:
AWS.apply_windSpd_windDif()

In [10]:
AWS.data_dict["403"]

Unnamed: 0,time,temp,precip,vertical_dir,horizon_dir
0,2018-01-01 01:00:00,-0.5,0.000000e+00,-1.482151,-1.758757
1,2018-01-01 02:00:00,-0.7,0.000000e+00,-0.837738,-0.994080
2,2018-01-01 03:00:00,-0.6,0.000000e+00,-0.966621,-1.147016
3,2018-01-01 04:00:00,-0.8,0.000000e+00,-1.095503,-1.299951
4,2018-01-01 05:00:00,-1.3,0.000000e+00,-0.515531,-0.611742
...,...,...,...,...,...
50881,2023-10-17 20:00:00,17.0,7.091923e-14,-0.419961,-0.428524
50882,2023-10-17 21:00:00,16.0,7.085029e-14,-0.139987,-0.142841
50883,2023-10-17 22:00:00,15.0,7.078403e-14,-0.349967,-0.357104
50884,2023-10-17 23:00:00,14.1,7.072037e-14,-0.769928,-0.785628


In [11]:
AWS.save_EDAdata()
LVL.save_EDAdata()

Successfully saved
Successfully saved


In [None]:
AWS.adf_test()
LVL.adf_test()