In [28]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

In [299]:
# Load the fitted Gaussian-process regressor from disk.
# NOTE(review): unpickling executes code embedded in the file, so only load model files from a trusted source.
with open("sklearn_GPR_no2_monthly.sav", 'rb') as model_file:
    # The context manager closes the handle, which the bare open() call never did.
    model = pickle.load(model_file)

In [300]:
# Column-wise maxima of the model's training inputs, unpacked into three names
# (presumably latitude, longitude and month index, in that order; confirm against the training code).
lat_max, lon_max, month_max = np.max(model.X_train_, axis=0)
month_max

23.0

In [301]:
def load_data(pollutant, data_path="data/", timestep=None, subset=None):
    """
    Load one pollutant's readings from ``{data_path}{pollutant}.csv``.

    :param pollutant: {"CO", "NO2", "O3", "SO2", "PM10", "PM25"}
    :param data_path: path to data directory; it is concatenated with the file name as-is,
        so it must end with a path separator
    :param timestep: {"H", "D", "M", "Y"}; for "D", "M" and "Y" the readings are averaged
        per site code over that period, any other value leaves the rows as read
    :param subset: if provided, keep only rows dated strictly after this datetime
    :return: when resampled, a frame with "code" and "date" as ordinary columns;
        otherwise the frame as read, indexed by "date"
    """
    df = pd.read_csv(f"{data_path}{pollutant}.csv", parse_dates=["date"]).set_index("date")
    if timestep in {"D", "M", "Y"}:
        df = df.groupby(by=["code"]).resample(timestep).mean().dropna().reset_index()

    if subset is not None:
        # After resampling "date" is a column, but without resampling it is still the index;
        # filter on whichever one holds the dates so both paths support `subset`.
        dates = df["date"] if "date" in df.columns else df.index
        df = df.loc[dates > subset]
    return df

In [30]:
# Per-site monthly mean NO2 readings, keeping only rows dated after 1 January 2020.
monthly_NO2_df = load_data(
    "NO2",
    data_path="../../data/",
    timestep="M",
    subset=datetime(2020, 1, 1),
)

In [44]:
# Distinct site codes that appear in the monthly NO2 frame.
monthly_NO2_sites = monthly_NO2_df.loc[:, 'code'].unique()

In [45]:
# Monitoring-site metadata; the cells below read its SiteCode, Latitude and Longitude columns.
sites_info_df = pd.read_csv("../../data/monitoring_sites.csv")

In [302]:
# Coordinates of the sites that have monthly NO2 data.
# Rows and columns are selected in a single .loc call and copied, so S_df is an independent
# frame: adding columns to it later cannot raise SettingWithCopyWarning or touch sites_info_df.
S_df = sites_info_df.loc[
    sites_info_df['SiteCode'].isin(monthly_NO2_sites),
    ['SiteCode', 'Latitude', 'Longitude'],
].copy()

In [303]:
# Attach the time coordinate fed to the GP alongside latitude/longitude.
# NOTE(review): 20 is presumably a month index inside the training range; confirm and name the constant.
# assign() builds a new frame instead of inserting into a sliced one, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
S_df = S_df.assign(t=20)

In [304]:
S_df

Unnamed: 0,SiteCode,Latitude,Longitude,t
2,BG1,51.563752,0.177891,20
3,BG2,51.529389,0.132857,20
8,BX2,51.490610,0.158914,20
10,BQ7,51.494649,0.137279,20
15,BX1,51.465983,0.184877,20
...,...,...,...,...
230,WM0,51.494681,-0.131938,20
231,MY1,51.522540,-0.154590,20
233,WM6,51.513929,-0.152793,20
235,WMB,51.516066,-0.135164,20


In [289]:
# Set of the site codes present in S_df.
S = {code for code in S_df['SiteCode'].unique()}

In [315]:
"""
Optimize sensor placements using GP model and sets S and U
"""
def _conditional_variance(joint_cov):
    """Variance of the first point conditioned on all the others, given their joint covariance."""
    prior_var = joint_cov[0, 0]
    if joint_cov.shape[0] == 1:
        # Nothing to condition on: the conditional variance is just the prior variance.
        return prior_var
    cross_cov = joint_cov[0, 1:]
    rest_cov = joint_cov[1:, 1:]
    # Solve the linear system rather than forming an explicit inverse: same quantity,
    # but numerically better behaved when locations are strongly correlated.
    return prior_var - cross_cov @ np.linalg.solve(rest_cov, cross_cov)


def krause(GP, k, S_df, U_df):
    """
    Greedily select up to ``k`` site codes from ``S_df``.

    In each round every not-yet-selected site ``y`` is scored by the ratio
    ``var(y | selected sites) / var(y | all other unselected sites and the U_df locations)``,
    both variances taken from ``GP.predict(..., return_cov=True)``; the site with the
    largest ratio is added to the selection. (This is the criterion of greedy
    mutual-information sensor placement, presumably the origin of the function name.)

    :param GP: fitted model exposing ``predict(X, return_cov=True) -> (mean, cov)``
    :param k: number of sites to select
    :param S_df: candidate sites with columns SiteCode, Latitude, Longitude, t
    :param U_df: extra locations (same columns) that are never selected but are always
        part of the "not selected" conditioning set
    :return: list of selected site codes in selection order; shorter than ``k`` when
        ``S_df`` holds fewer than ``k`` sites
    """
    features = ['Latitude', 'Longitude', 't']
    A = []
    for _ in range(k):
        S_diff_A = [site for site in S_df['SiteCode'] if site not in A]
        if not S_diff_A:
            # Every site is already selected; stop instead of appending None placeholders.
            break

        A_df = S_df.loc[S_df['SiteCode'].isin(A)]
        y_star_code = None
        y_star = None

        for y in S_diff_A:
            y_df = S_df.loc[S_df['SiteCode'] == y]

            # Candidate first, then the already selected sites: row 0 of the covariance is y.
            yA_df = pd.concat([y_df, A_df])
            _, yA_cov = GP.predict(yA_df[features], return_cov=True)
            numerator = _conditional_variance(yA_cov)

            # Candidate first, then everything that is neither selected nor the candidate itself.
            S_diff_A_y = [site for site in S_diff_A if site != y]
            A_bar_df = pd.concat([S_df.loc[S_df['SiteCode'].isin(S_diff_A_y)], U_df])
            yA_bar_df = pd.concat([y_df, A_bar_df])
            _, yA_bar_cov = GP.predict(yA_bar_df[features], return_cov=True)
            denominator = _conditional_variance(yA_bar_cov)

            delta_y = numerator / denominator
            # Track the best score with an explicit None sentinel so a falsy site code
            # cannot be mistaken for "nothing chosen yet".
            if y_star is None or delta_y > y_star:
                y_star_code = y
                y_star = delta_y
        A.append(y_star_code)
    return A

In [316]:
# Two extra locations (not existing sites) passed to krause as U_df, at the same t as the sites.
U_df = pd.DataFrame(
    [
        {"SiteCode": "U1", "Latitude": 51.5, "Longitude": -0.13, "t": 20},
        {"SiteCode": "U2", "Latitude": 51.6, "Longitude": 0.14, "t": 20},
    ]
)

In [317]:
U_df

Unnamed: 0,SiteCode,Latitude,Longitude,t
0,U1,51.5,-0.13,20
1,U2,51.6,0.14,20


In [319]:
# Pick three of the existing sites with the greedy criterion, using the loaded GP model.
krause(GP=model, k=3, S_df=S_df, U_df=U_df)

























['EN7', 'EN1', 'HR1']