In [1]:
import os
import sys
seed = 42
import pytz
import numpy as np
import pandas as pd
from tqdm import tqdm
from geopy import distance
import datetime
from copy import deepcopy
import pickle as pkl
import skimage.measure
import math
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
os.environ['PYTHONWARNINGS']='ignore'
from joblib import Parallel, delayed
import random
random.seed(seed)
import scipy
from pykrige.ok import OrdinaryKriging
from pykrige.ok3d import OrdinaryKriging3D
from sklearn.model_selection import train_test_split
from scipy.interpolate import CubicSpline

In [7]:
# Sensor metadata, indexed by the sensor id rendered as a string; the kriging
# cells look coordinates up with `locs.loc[df.columns]`.
locs = (
    pd.read_csv('/scratch/ab9738/hidden_hotspots/data/purpleair_nyc_sensor_locations.csv')
    .assign(SensorIndex=lambda frame: frame['SensorIndex'].apply(str))
    .set_index('SensorIndex')
)

# Readings table: first CSV column becomes a date-parsed index, remaining
# columns hold the per-sensor values.
data = pd.read_csv(
    '/scratch/ab9738/hidden_hotspots/data/purple_air_2023_nyc.csv',
    index_col=0,
    parse_dates=True,
)

# Readings <= 1 or > 500 are treated as missing.
out_of_range = (data <= 1) | (data > 500)
data[out_of_range] = np.nan

# Independent working copy used by the cells below.
df = data.copy(deep=True)

Space Time Kriging

In [16]:
def process_row(idx, row, window_size=3, time_scale=0.01):
    """Score space-time (3D) ordinary kriging on one time step of ``df``.

    Sensors with a valid reading at ``idx`` are split 80/20 into train and
    test sets. The training set is then extended with every valid reading
    from up to ``window_size`` rows before and after ``idx``, and a 3D
    ordinary kriging model over (longitude, latitude, scaled row position)
    predicts the held-out sensors.

    Reads the notebook globals ``df``, ``locs`` and ``seed``.

    NOTE(review): the temporal window holds readings from *all* sensors,
    including the held-out ones at neighbouring time steps -- confirm this is
    the intended evaluation protocol.
    NOTE: a later cell in this notebook defines another ``process_row``;
    whichever definition was executed last is the one the kernel uses.

    Parameters
    ----------
    idx : index label
        Label of the row in ``df.index``; the first matching position is used.
    row : pandas.Series
        Readings of that row, one entry per column of ``df``.
    window_size : int, default 3
        Number of rows taken on each side of ``idx``. The window is clamped
        to the bounds of ``df`` so rows near either end stay usable.
    time_scale : float, default 0.01
        Factor applied to the integer row position to obtain the third
        kriging coordinate.

    Returns
    -------
    tuple
        ``(ape, se)``: per-held-out-sensor absolute percentage error
        (as a fraction) and squared error. ``(np.nan, np.nan)`` when fewer
        than 10 sensors have a valid reading at ``idx``.
    """
    i = np.flatnonzero(np.asarray(df.index) == idx)[0]

    # Bail out before building the window when the row is too sparse.
    vals = row.values
    valid = ~np.isnan(vals)
    if valid.sum() < 10:
        return (np.nan, np.nan)

    sensor_xy = locs.loc[df.columns]
    lon = sensor_xy['Longitude'].values
    lat = sensor_xy['Latitude'].values

    # Row positions of the temporal neighbours, clamped to the frame so that
    # the coordinate arrays always match the number of rows actually sliced.
    win_pos = np.concatenate([
        np.arange(max(i - window_size, 0), i),
        np.arange(i + 1, min(i + window_size + 1, len(df))),
    ])
    vals_win = df.iloc[win_pos].values.flatten()
    win_valid = ~np.isnan(vals_win)
    # Row-major flatten: sensors vary fastest, so tile the coordinates per row
    # and repeat each row's time coordinate once per sensor.
    x_win = np.tile(lon, len(win_pos))[win_valid]
    y_win = np.tile(lat, len(win_pos))[win_valid]
    z_win = np.repeat(win_pos * time_scale, len(df.columns))[win_valid]
    vals_win = vals_win[win_valid]

    # Valid readings at the evaluated time step.
    x = lon[valid]
    y = lat[valid]
    z = (np.ones_like(lon) * i * time_scale)[valid]
    vals = vals[valid]

    x_train, x_test, y_train, y_test, z_train, z_test, vals_train, vals_test = train_test_split(
        x, y, z, vals, test_size=0.2, random_state=seed
    )

    # Temporal neighbours are only ever used for training.
    x_train = np.concatenate([x_train, x_win])
    y_train = np.concatenate([y_train, y_win])
    z_train = np.concatenate([z_train, z_win])
    vals_train = np.concatenate([vals_train, vals_win])

    OK3D = OrdinaryKriging3D(
        x_train,
        y_train,
        z_train,
        vals_train,
        variogram_model="linear",
        verbose=False,
        enable_plotting=False,
    )

    vals_pred, _ = OK3D.execute("points", x_test, y_test, z_test)
    residual = vals_test - vals_pred
    ape_for_row = np.abs(residual) / vals_test
    se_for_row = np.square(residual)
    return (ape_for_row, se_for_row)

In [17]:
# Evaluate every time step except the first and last four rows, 12 workers.
result_list = Parallel(n_jobs=12)(
    delayed(process_row)(timestamp, readings)
    for timestamp, readings in df[4:-4].iterrows()
)

In [20]:
# process_row returns (nan, nan) scalars for time steps with too few valid
# sensors; keep only the entries that are per-sensor error arrays. A type
# check avoids comparing string representations of whole arrays.
ape = [row_ape for row_ape, _ in result_list if isinstance(row_ape, np.ndarray)]
se = [row_se for _, row_se in result_list if isinstance(row_se, np.ndarray)]
if not ape:
    raise ValueError("no time step produced kriging errors; nothing to aggregate")

# Pool the errors of all held-out sensors across all time steps.
ape_arr = np.concatenate(ape)
se_arr = np.concatenate(se)
mape = np.mean(ape_arr)  # a fraction, not a percentage
rmse = np.sqrt(np.mean(se_arr))
print(mape, rmse)

1.034253062205728 9.179337291424442


Ordinary Kriging

In [21]:
def process_row(idx, row):
    """Score 2D ordinary kriging on the sensor readings of one time step.

    Sensors with a valid reading in ``row`` are split 80/20; a Gaussian
    variogram ordinary kriging model is fitted on the training sensors'
    (longitude, latitude, value) triples and evaluated on the held-out ones.

    Reads the notebook globals ``df``, ``locs`` and ``seed``. ``idx`` is
    accepted for signature parity with the driver cell but is not used.

    NOTE: this reuses the name of the space-time scorer defined in an earlier
    cell, so running this cell replaces that function in the kernel.

    Returns
    -------
    tuple
        ``(ape, se)``: per-held-out-sensor absolute percentage error
        (as a fraction) and squared error. ``(np.nan, np.nan)`` when fewer
        than 10 sensors have a valid reading.
    """
    readings = row.values
    valid = ~np.isnan(readings)

    sensor_xy = locs.loc[df.columns]
    cols = np.array(df.columns)[valid]
    x = sensor_xy['Longitude'].values[valid]
    y = sensor_xy['Latitude'].values[valid]
    z = readings[valid]

    if len(cols) < 10:
        return (np.nan, np.nan)

    # The column labels take part in the split but their halves are not needed.
    x_train, x_test, y_train, y_test, z_train, z_test, _, _ = train_test_split(
        x, y, z, cols, test_size=0.2, random_state=seed
    )

    kriger = OrdinaryKriging(
        x_train,
        y_train,
        z_train,
        variogram_model="gaussian",
        verbose=False,
        enable_plotting=False,
    )

    z_pred, _ = kriger.execute("points", x_test, y_test)
    ape_for_row = np.abs(z_test - z_pred) / z_test
    se_for_row = np.square(z_test - z_pred)
    return (ape_for_row, se_for_row)

In [22]:
# Same row range as the space-time run above (first and last four rows are
# skipped), 12 workers.
result_list = Parallel(n_jobs=12)(
    delayed(process_row)(timestamp, readings)
    for timestamp, readings in df[4:-4].iterrows()
)

In [23]:
# process_row returns (nan, nan) scalars for time steps with too few valid
# sensors; keep only the entries that are per-sensor error arrays. A type
# check avoids comparing string representations of whole arrays.
ape = [row_ape for row_ape, _ in result_list if isinstance(row_ape, np.ndarray)]
se = [row_se for _, row_se in result_list if isinstance(row_se, np.ndarray)]
if not ape:
    raise ValueError("no time step produced kriging errors; nothing to aggregate")

# Pool the errors of all held-out sensors across all time steps.
ape_arr = np.concatenate(ape)
se_arr = np.concatenate(se)
mape = np.mean(ape_arr)  # a fraction, not a percentage
rmse = np.sqrt(np.mean(se_arr))
print(mape, rmse)

1.340497563513339 15.976162900793089


In [25]:
# Mean of the per-column mean readings, as a scale reference for the RMSE
# values printed above.
df.mean(axis=0).mean()

13.242716521885349