In [1]:
# Split the data month-wise.
# Prepare the observatory data.
# Given a date and a pollutant as input, get both the CMAQ data and the observatory data:
    # For the input date, find the location of the data in the .npy file and save it in a variable.
    # For the input date, get the thirteen hours of observatory station data starting from that date.
    # Combine the CMAQ data and the observatory data.

### Split the data based on month (12 files)

In [2]:
from datetime import datetime, timedelta
# from sklearn.metrics import mean_squared_error
from glob import glob
from math import sqrt
from dateutil import tz
import pandas as pd
import numpy as np
import json
import os

# Timestamps of the full dataset, loaded when this cell runs. save_timedata()
# passes every element to UTC2HKT(), which interprets it as UTC epoch seconds.
time = np.load('data/time_full.npy')

def get_pollutant_data(dir):
    """Load the array stored in the file named `dir` + '_full.npy'.

    `dir` is used as a path prefix, not as a directory to look inside:
    for 'data/cache/CMAQ/PM10' the file read is 'data/cache/CMAQ/PM10_full.npy'.
    """
    full_array_path = dir + "_full.npy"
    return np.load(full_array_path)
# Timestamp conversion (UTC epoch seconds -> Hong Kong local time string)
def UTC2HKT(timestamp):
    """Convert a UTC epoch timestamp to a Hong Kong local-time string.

    Parameters
    ----------
    timestamp : number or numeric string
        Seconds since the Unix epoch, interpreted as UTC. Passed through
        float(), so fractional seconds are accepted (and dropped by the
        output format).

    Returns
    -------
    str
        The same instant in the 'Asia/Hong_Kong' zone, formatted as
        '%Y-%m-%d %H:%M:%S'.
    """
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('Asia/Hong_Kong')
    fmt = '%Y-%m-%d %H:%M:%S'

    # Build the timezone-aware datetime directly. The previous
    # str() -> strptime() round-trip failed with ValueError whenever the
    # timestamp carried fractional seconds, because str() then appends
    # '.ffffff', which the seconds-only format cannot parse.
    utc_moment = datetime.fromtimestamp(float(timestamp), tz=from_zone)
    HK_T = utc_moment.astimezone(to_zone).strftime(fmt)
    return HK_T

# Convert UTC time to HK time and save as .npy file
def save_timedata():
    """Convert every entry of the module-level `time` array to a Hong Kong
    time string and store the result in 'data/cache/time_data.npy'.

    This is the file save_data() reads back. The cache directory is created
    when missing, because np.save does not create parent directories.

    Returns
    -------
    None
    """
    time_data = [UTC2HKT(x) for x in time]
    os.makedirs("data/cache", exist_ok=True)
    # The path previously contained the typo 'cacahe', so the file never
    # landed where save_data() looks for it.
    np.save("data/cache/time_data.npy", time_data)
    return None

# Save the monthwise data
def save_data(pollutant_data, direc):
    """Split `pollutant_data` by calendar month and write one .npy per month.

    The timestamps in 'data/cache/time_data.npy' are paired positionally with
    the entries of `pollutant_data`. All entries whose timestamp falls in the
    same month number are written to a single file in `direc`, named after
    the first timestamp of that month ('<YYYY-mm-dd HH:MM:SS>.npy').

    Parameters
    ----------
    pollutant_data : sequence
        One entry per timestamp, in the same order as the cached time data.
    direc : str
        Existing directory that receives the monthly files.

    Returns
    -------
    None

    Notes
    -----
    Months without any entry are skipped; previously a missing month
    aborted the whole run with StopIteration.
    """
    time_data = np.load("data/cache/time_data.npy")

    # Single pass: parse each timestamp once and bucket the rows by month.
    first_stamp_of_month = {}
    rows_of_month = {}
    for stamp, row in zip(time_data, pollutant_data):
        stamp = str(stamp)
        month = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').month
        if month not in rows_of_month:
            first_stamp_of_month[month] = stamp
            rows_of_month[month] = []
        rows_of_month[month].append(row)

    for month in range(1, 13):
        if month not in rows_of_month:
            continue
        target = os.path.join(direc, first_stamp_of_month[month] + ".npy")
        np.save(target, rows_of_month[month])
    return None

# Split the data into monthwise files
def splitData_monthwise(method):
    """Write month-wise .npy files for PM10, SO2 and NO2 of one method.

    For each pollutant the target directory is 'data/cache/<method>/<pollutant>'.
    The full array is loaded with get_pollutant_data(directory), i.e. from
    '<directory>_full.npy', split with save_data(), and the time spent in
    save_data() is printed.

    Parameters
    ----------
    method : str
        Sub-directory name under 'data/cache'.

    Returns
    -------
    None
    """
    for pollutant in ['PM10', 'SO2', 'NO2']:
        directory = os.path.join('data', 'cache', method, pollutant)
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError as err:
            # Still best-effort per pollutant, but no longer silent and no
            # longer swallowing unrelated exceptions such as KeyboardInterrupt.
            print("Skipping {}: cannot create {} ({})".format(pollutant, directory, err))
            continue
        pol_data = get_pollutant_data(directory)
        start_time = datetime.now()
        save_data(pol_data, directory)
        end_time = datetime.now()
        print(end_time - start_time)
    return None

def save_Obsdata():
    """Merge all observation CSVs under 'obs/' into one sorted CSV.

    Every 'obs/*.csv' file is read with its first column as the index,
    the frames are concatenated, the 'time' column is parsed to datetimes,
    and the rows are sorted by ('time', 'station_code'). The result is
    written to 'data/cache/obs/Obs_data.csv' (directory created on demand)
    without the index.

    Returns
    -------
    pandas.DataFrame
        The merged, sorted frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file is found under 'obs/'.
    """
    files = sorted(glob('obs/*.csv'))
    if not files:
        raise ValueError("no observation CSV files found under 'obs/'")
    df = pd.concat((pd.read_csv(file, index_col=0) for file in files), ignore_index=True)
    df['time'] = pd.to_datetime(df['time'])
    # sort_values returns a new frame; the result used to be thrown away.
    df = df.sort_values(by=['time', 'station_code']).reset_index(drop=True)
    # Create the directory that is actually written to. The old code checked
    # 'cache/obs', referenced an undefined name, and used `continue` outside a loop.
    os.makedirs('data/cache/obs', exist_ok=True)
    df.to_csv("data/cache/obs/Obs_data.csv", index=False)
    return df


In [3]:
def obs_data(Pollutant, Date_time):
    """Return observations for the 13 hourly timestamps starting at `Date_time`.

    Parameters
    ----------
    Pollutant : str
        Name of the pollutant column in 'obs/Obs_data.csv'.
    Date_time : str
        First timestamp, formatted as '%Y-%m-%d %H:%M:%S'.

    Returns
    -------
    str
        JSON object mapping each timestamp string found in the file to a
        list of {'Pollutant': ..., 'x': ..., 'y': ...} records.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    start = datetime.strptime(Date_time, stamp_format)
    # Offsets 0..12: the requested hour plus the following twelve.
    date_list = [(start + timedelta(hours=offset)).strftime(stamp_format) for offset in range(0, 13)]

    df = pd.read_csv('obs/Obs_data.csv', usecols=['time', Pollutant, 'x', 'y']).rename(columns={Pollutant: "Pollutant"})
    df = df[df['time'].isin(date_list)]

    d = {}
    # Group by the scalar label, not by ['time']: with a one-element list,
    # pandas >= 2 yields 1-tuple keys, which json.dumps rejects as dict keys.
    for hour, group in df.groupby('time'):
        d[hour] = [
            {'Pollutant': row.Pollutant, 'x': row.x, 'y': row.y}
            for row in group.itertuples(index=False)
        ]
    return json.dumps(d)

def month_data(month_list, direc):
    """Load and concatenate the monthly .npy files of the requested months.

    Files in `direc` are expected to be named '<YYYY-mm-dd HH:MM:SS>.npy'
    (the first timestamp they contain, as written by save_data). Files whose
    name does not parse that way are ignored.

    Parameters
    ----------
    month_list : container of int
        Month numbers (1-12) to load.
    direc : str
        Directory holding the monthly files.

    Returns
    -------
    (numpy.ndarray, str)
        The matched arrays concatenated in chronological order of their file
        names, and the name of the earliest matched file. Previously the
        second element was simply the last entry returned by os.listdir,
        matched or not, and the concatenation order was arbitrary.

    Raises
    ------
    ValueError
        If no file in `direc` matches `month_list`.
    """
    matched = []
    for file in os.listdir(direc):
        if not file.endswith('.npy'):
            continue
        try:
            first_stamp = datetime.strptime(file[:-4], '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # Not a month file (e.g. some other cached array); skip it.
            continue
        if first_stamp.month in month_list:
            matched.append((first_stamp, file))

    if not matched:
        raise ValueError("no monthly .npy file for months {} in {}".format(list(month_list), direc))

    matched.sort()
    final_data = [np.load(os.path.join(direc, name)) for _, name in matched]
    return np.concatenate(final_data), matched[0][1]
    
def get_data(date, Pollutant):
    """Return, as JSON, the entry for `date` from both methods' monthly data.

    For each of 'CMAQ' and 'Our_method', the month file(s) for the month of
    `date` are loaded from '<method>/<Pollutant>' via month_data(). The entry
    is selected by the whole-hour offset between `date` and the timestamp
    encoded in the file name month_data() returns.

    `date` must be formatted as '%Y-%m-%d %H:%M:%S'. The result is a JSON
    object keyed by method name.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    requested = datetime.strptime(date, stamp_format)
    wanted_months = [requested.month]

    per_method = {}
    for method in ('CMAQ', 'Our_method'):
        source_dir = os.path.join(method, Pollutant)
        data, filename = month_data(wanted_months, source_dir)
        file_start = datetime.strptime(filename[:-4], stamp_format)
        offset = requested - file_start
        hours = offset.days * 24 + offset.seconds // 3600
        per_method[method] = data[hours].tolist()
    return json.dumps(per_method)

In [4]:
def time_poldata(pol_data):
    """Pair the timestamps stored in 'time_data.npy' with `pol_data`.

    The pairing is purely positional (first timestamp with first entry, and
    so on) and stops at the shorter of the two sequences. Returns a dict
    keyed by timestamp.
    """
    stamps = np.load('time_data.npy')
    return dict(zip(stamps, pol_data))

# Agreement metric between two 'Pollutant' series
def metrics(df_1, df_2):
    """Return the index of agreement between two frames, rounded to 2 places.

    `df_1['Pollutant']` is treated as the actual series and
    `df_2['Pollutant']` as the predicted one. The value is

        1 - sum((act - pred)^2) / sum((|pred - mean(act)| + |act - mean(act)|)^2)

    An RMSE computation is not performed here.
    """
    actual = df_1['Pollutant'].values
    predicted = df_2['Pollutant'].values

    actual_mean = np.mean(actual)
    squared_error = np.sum((actual - predicted) ** 2)
    potential_error = np.sum((np.abs(predicted - actual_mean) + np.abs(actual - actual_mean)) ** 2)

    ioa = round(1 - squared_error / potential_error, 2)
    return ioa

# Station code with coordinates
def station_coord(key):
    """Return the two-element index list registered for station code `key`.

    line_chart_1 uses the pair as [first, second] index into the two trailing
    axes of a forecast entry. An unknown code raises KeyError. A fresh list
    is built on every call, so callers may mutate the result.
    """
    stations = (
        ('CB_R', 37, 13),
        ('CL_R', 34, 13),
        ('CW_A', 33, 14),
        ('EN_A', 40, 14),
        ('KC_A', 31, 22),
        ('KT_A', 41, 17),
        ('MB_A', 55, 34),
        ('MKaR', 35, 18),
        ('SP_A', 34, 19),
        ('ST_A', 37, 24),
        ('TC_A', 13, 14),
        ('TK_A', 45, 17),
        ('TM_A', 16, 25),
        ('TP_A', 35, 32),
        ('TW_A', 30, 23),
        ('YL_A', 21, 31),
    )
    lookup = {code: [first, second] for code, first, second in stations}
    return lookup[key]

In [5]:
def line_chart_1(dt):
    """Build the line-chart payload comparing CMAQ forecasts with station data.

    Parameters
    ----------
    dt : dict
        Request with the keys
        'st_date', 'en_date' : str, '%Y-%m-%d %H:%M:%S', first/last target hour
        'F_hour'    : forecast lead in hours (anything int() accepts)
        'pollutant' : pollutant name (directory name and CSV column)
        'St_code'   : station code understood by station_coord()

    Returns
    -------
    str
        JSON list with one object: {'RMSE': 0, 'IOA': <index of agreement>,
        'line_data': [records of both series]}. RMSE is a constant
        placeholder; it is not computed.
    """
    start_date = dt['st_date']
    end_date = dt['en_date']
    lead_hours = int(dt['F_hour'])
    Method = 'CMAQ'
    Pollutant = dt['pollutant']
    code = dt['St_code']

    # Shift the requested window back by the lead time before looking up entries.
    st_date = datetime.strptime(start_date, '%Y-%m-%d %H:%M:%S') - timedelta(hours=lead_hours)
    en_date = datetime.strptime(end_date, '%Y-%m-%d %H:%M:%S') - timedelta(hours=lead_hours)
    delta = en_date - st_date
    hours = delta.days * 24 + delta.seconds // 3600
    date_list = [str(st_date + timedelta(hours=i)) for i in range(hours + 1)]
    coords = station_coord(code)

    directory = os.path.join(Method, Pollutant)
    month = [st_date.month, en_date.month]
    final_data, filename = month_data(month, directory)
    # NOTE(review): time_poldata pairs the entries of 'time_data.npy' with
    # final_data positionally. That only lines up if the file starts at the
    # first timestamp of the loaded month(s) - confirm for non-January requests.
    whole_data = time_poldata(final_data)
    pol_lis = [whole_data[i][lead_hours][coords[0]][coords[1]] for i in date_list]
    method_data = pd.DataFrame({'time': date_list, 'Pollutant': pol_lis, 'data': 'CMAQ_data'})

    # The result used to be bound to `station_df` while the following lines
    # read `obs_df`, which raised NameError on a fresh kernel.
    obs_df = obs_onsite_data('cache/obs/Obs_data.csv', code, Pollutant, date_list)
    df = pd.concat([method_data, obs_df])
    # metrics() treats its first argument as the actual series and the second
    # as the prediction, so the observations go first.
    # NOTE(review): assumes obs_df has exactly one row per entry of date_list,
    # in the same order; otherwise the element-wise comparison is misaligned.
    IOA = metrics(obs_df, method_data)
    result = df.to_dict('records')
    return json.dumps([{'RMSE': 0, 'IOA': IOA, 'line_data': result}])

In [None]:
def obs_onsite_data(path, code, Pollutant, date_list):
    """Read one station's observations for the given timestamps.

    Loads the 'time', 'station_code' and `Pollutant` columns from the CSV at
    `path`, keeps the rows whose 'time' is in `date_list` and whose
    'station_code' equals `code`, and returns them with the pollutant column
    renamed to 'Pollutant', the station column removed and a constant
    'data' column set to 'station_data'. The original row index is kept.
    """
    wanted_columns = ['time', 'station_code', Pollutant]
    observations = pd.read_csv(path, usecols=wanted_columns)
    observations = observations.rename(columns={Pollutant: "Pollutant"})

    in_window = observations["time"].isin(date_list)
    at_station = observations["station_code"] == code
    selected = observations[in_window & at_station].drop(['station_code'], axis=1)

    selected['data'] = 'station_data'
    return selected

In [6]:
# Sample request for line_chart_1: start/end of the target window, forecast
# lead in hours ('F_hour'), station code and pollutant name.
dt = {'st_date' : '2017-01-03 21:00:00', 'en_date' : '2017-01-03 23:00:00', 'F_hour' : '5', 'St_code' : 'CB_R', 'pollutant': 'PM10'}

In [7]:
# Time a single call to line_chart_1 on the sample request and print the
# elapsed wall-clock time.
s = datetime.now()
l = line_chart_1(dt)  # JSON string returned by line_chart_1
e = datetime.now()
print(e-s)

0:00:00.471610
