In [1]:
import os
import sys
# seed = int(sys.argv[1])
seed = 42
import pytz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from geopy import distance
import datetime
import tilemapbase
from copy import deepcopy
import pickle as pkl
from PIL import Image
import skimage.measure
import math
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
os.environ['PYTHONWARNINGS']='ignore'
import hyperopt
from joblib import Parallel, delayed
import random
random.seed(seed)
import scipy
import torch
from pykrige.ok import OrdinaryKriging
from pykrige.ok3d import OrdinaryKriging3D
from pykrige.uk import UniversalKriging
from sklearn.model_selection import train_test_split
from scipy.interpolate import CubicSpline

source = 'combined'
sensor = 'pm25'
res_time = '1H'
filepath_root = '/scratch/ab9738/pollution_with_sensors/'

filepath_data_kai = filepath_root+'data/kaiterra/kaiterra_fieldeggid_{}_current_panel.csv'.format(res_time)
filepath_data_gov = filepath_root+'data/govdata/govdata_{}_current.csv'.format(res_time)
filepath_locs_kai = filepath_root+'data/kaiterra/kaiterra_locations.csv'
filepath_locs_gov = filepath_root+'data/govdata/govdata_locations.csv'

locs_kai = pd.read_csv(filepath_locs_kai, index_col=[0])
locs_kai['Type'] = 'Kaiterra'
locs_gov = pd.read_csv(filepath_locs_gov, index_col=[0])
locs_gov['Type'] = 'Govt'
locs = pd.merge(locs_kai, locs_gov, how='outer',\
                on=['Monitor ID', 'Latitude', 'Longitude', 'Location', 'Type'], copy=False)
data_kai = pd.read_csv(filepath_data_kai, index_col=[0,1], parse_dates=True)[sensor]
data_gov = pd.read_csv(filepath_data_gov, index_col=[0,1], parse_dates=True)[sensor]
data = pd.concat([data_kai, data_gov], axis=0, copy=False)
data.replace(0,np.nan,inplace=True)

start_dt = data.index.levels[1][0]
end_dt = data.index.levels[1][-1]

if start_dt.tzname != 'IST':
        if start_dt.tzinfo is None:
            start_dt = start_dt.tz_localize('UTC')
        start_dt = start_dt.tz_convert(pytz.FixedOffset(330))
    
if end_dt.tzname != 'IST':
    if end_dt.tzinfo is None: 
        end_dt = end_dt.tz_localize('UTC')
    end_dt = end_dt.tz_convert(pytz.FixedOffset(330))

# now, filter through the start and end dates
data.sort_index(inplace=True)
data = data.loc[(slice(None), slice(start_dt, end_dt))]

if(source=='govdata'):
    df = data_gov.unstack(level=0)
elif(source=='kaiterra'):
    df = data_kai.unstack(level=0)
else:
    df = data.unstack(level=0)
distances = pd.read_csv('/scratch/ab9738/pollution_with_sensors/data/combined_distances.csv', index_col=[0])
distances = distances.loc[df.columns, df.columns]
distances[distances == 0] = np.nan
df = np.log(df)


# sens = np.log(data).to_frame().reset_index()

# sens['hour_of_day'] = sens['timestamp_round'].apply(lambda x: x.hour)

# spline = sens.groupby(['field_egg_id', 'hour_of_day']).mean()['pm25'].reset_index()
# spline_avg = sens.groupby(['hour_of_day']).mean()['pm25'].reset_index()

# fields = []
# times = []
# pm25 = []
# for i in np.unique(spline['field_egg_id']):
#     s_i = spline[spline['field_egg_id']==i]
#     x = s_i['hour_of_day'].values
#     y = [t for t in s_i['pm25'].values]
#     c1 = CubicSpline(x[:8],y[:8])
#     c2 = CubicSpline(x[8:16],y[8:16])
#     c3 = CubicSpline(x[16:24],y[16:24])
#     ix = [k/100.0 for k in range(2400)]
#     iy = list(np.concatenate((c1(ix[:800]),c2(ix[800:1600]),c3(ix[1600:2400]))))
#     fields += [i]*2400
#     times += ix
#     pm25 += iy

# spline_df = pd.DataFrame((fields, times, pm25)).transpose()

# spline_df.columns = ['field_egg_id', 'time', 'pm25']

# hours_in_day = np.arange(24).astype(float)

# spline_df = spline_df[spline_df['time'].isin(hours_in_day)]

# spline_mat = np.transpose(spline_df['pm25'].to_numpy().reshape((60,24))).astype(float)

# spline_df = pd.DataFrame(spline_mat,columns=df.columns)
# spline_df = spline_df.drop(['Pusa_IMD'], axis=1)
# df = df.drop(['Pusa_IMD'], axis=1)
# spline_df = spline_df.mean(axis=1)
# df_full = deepcopy(df)
# for idx,row in df.iterrows():
#     df.loc[idx] = row-spline_df.loc[idx.hour]
# df_spline = df_full-df

## Kriging Model

In [387]:
# snapshot = df[4:-4].mean()
# idx = np.array(df.sample().index)[0]
# window_size = 3
# i = np.where(np.array(df.index) == idx)[0][0]
# df_slice = pd.concat([df[i-window_size:i],df[i+1:i+window_size+1]])
# x_win = locs.loc[df.columns]['Longitude'].values
# x_win = np.tile(x_win,df_slice.shape[0])
# y_win = locs.loc[df.columns]['Latitude'].values
# y_win = np.tile(y_win,df_slice.shape[0])    
# z_win = np.concatenate([np.arange(i-window_size,i),np.arange(i+1,i+window_size+1)])*0.01
# z_win = np.repeat(z_win,len(df.columns))
# vals_win = df_slice.values.flatten()
# # spl_win = pd.concat([df_spline[i-window_size:i],df_spline[i+1:i+window_size+1]]).values.flatten()

# # spl_win = spl_win[~np.isnan(vals_win)]
# x_win = x_win[~np.isnan(vals_win)]
# y_win = y_win[~np.isnan(vals_win)]
# z_win = z_win[~np.isnan(vals_win)]
# vals_win = vals_win[~np.isnan(vals_win)]

# x = locs.loc[df.columns]['Longitude'].values
# y = locs.loc[df.columns]['Latitude'].values
# z = np.ones_like(x)*i*0.01
# vals = snapshot.values[0]
# cols = np.array(df.columns)[~np.isnan(vals)]
# x = x[~np.isnan(vals)]
# y = y[~np.isnan(vals)]
# z = z[~np.isnan(vals)]
# vals = vals[~np.isnan(vals)]

# x_train, x_test, y_train, y_test, z_train, z_test, vals_train, vals_test, cols_train, cols_test = train_test_split(
#     x, y, z, vals, cols, test_size=0.2, random_state=42
# )

# x_train = np.concatenate([x_train,x_win,x_test])
# y_train = np.concatenate([y_train,y_win,y_test])
# z_train = np.concatenate([z_train,z_win,z_test])
# vals_train = np.concatenate([vals_train,vals_win,vals_test])


# OK3D = OrdinaryKriging3D(
#     x_train,
#     y_train,
#     z_train,
#     vals_train,
#     variogram_model="linear",
#     verbose=False,
#     enable_plotting=False,
#     exact_values=True,
# )

# # vals_pred, ss_pred = OK3D.execute("points", x_test, y_test, z_test)
# # spl_test = spline_df.loc[idx.hour][cols_test].values
# # ape.append(np.abs((np.exp(vals_test+spl_test)-np.exp(vals_pred+spl_test))/np.exp(vals_test+spl_test)))
# # ape_arr = np.concatenate(ape)
# # mape = np.mean(ape_arr)
# # print(mape)

## Average Kriging Map

In [50]:
# avg = df.iloc[105]
avg = df.sample()
# avg = df.mean()

x = locs.loc[df.columns]['Longitude'].values
y = locs.loc[df.columns]['Latitude'].values
z = avg.values[0]
print(z)

x = x[~np.isnan(z)]
y = y[~np.isnan(z)]
z = z[~np.isnan(z)]

OK = OrdinaryKriging(
    x,
    y,
    z,
    variogram_model="linear",
    verbose=False,
    enable_plotting=False,
)

[       nan        nan        nan        nan        nan        nan
 5.39261693 5.23066288        nan        nan 5.26182626        nan
        nan        nan 5.44313895 5.22485021 5.59842196 5.32056798
 4.99226764        nan 5.31689375        nan 5.23116209        nan
        nan 5.41902944        nan        nan 5.29831737 5.28502947
 4.93627088        nan        nan 5.40530156        nan 5.10392323
 4.77567217 5.36970736 5.39021263 5.37989735 5.30578938 5.26801279
 5.26785816 5.30330491 5.10594547 5.02657426 5.3506726  5.32056798
 5.30578938 5.17190344 5.30081425 5.31222027 5.1590553  5.0937502
 5.05821838 5.37296091 5.19434478 5.0767351  5.5022781  5.31073989]


In [51]:
np.exp(avg)

field_egg_id,113E,1FD7,20CA,2E9C,3ACF,498F,4BE7,56C3,5D7A,603A,...,Pusa_DPCC,Pusa_IMD,RKPuram_DPCC,Rohini_DPCC,Shadipur_CPCB,Sirifort_CPCB,SoniaVihar_DPCC,SriAurobindoMarg_DPCC,VivekVihar_DPCC,Wazirpur_DPCC
timestamp_round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-01 23:00:00+05:30,,,,,,,219.777778,186.916667,,,...,200.5,202.8,174.0,163.0,157.31,215.5,180.25,160.25,245.25,202.5


## Heatmap

In [57]:
gridx = np.arange(77.01, 77.40, 0.001)
gridy = np.arange(28.39, 28.78, 0.001)
# gridz = i*0.01

z_grid, ss_grid = OK.execute("grid", gridx, gridy[::-1])#, gridz)
fig, ax = plt.subplots(figsize=(12,12), dpi=200)
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
# plt.figure(figsize=(4,4))
map_array = np.exp(np.squeeze(z_grid.data))
plt.imshow(map_array, cmap='viridis', interpolation='nearest')
plt.axis('off')
# plt.colorbar()
# plt.show()
plt.savefig('heatmap.png',bbox_inches='tight')
plt.close(fig)

## Hotspot

In [10]:
comp_vals = map_array.flatten()

kde = scipy.stats.gaussian_kde(comp_vals)

def val_at_prob(val,kde=kde,prob=0.8):
    return (abs(prob-kde.integrate_box_1d(-np.inf,val)))

res = scipy.optimize.minimize_scalar(val_at_prob,bounds=(min(comp_vals),max(comp_vals)),method="bounded")

mask_arr = np.zeros_like(map_array)
mask_arr[map_array>res.x] = 1
fig, ax = plt.subplots(figsize=(12,12), dpi=200)
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
plt.imshow(mask_arr, cmap='viridis', interpolation='nearest')
plt.axis('off')
plt.savefig('binmap.png',bbox_inches='tight')
plt.close(fig)

## Geographical Map

In [390]:
# region_lat_b, region_lat_t, region_long_l, region_long_r = 28.39, 28.78, 77.01, 77.40
# # region_lat_b, region_lat_t, region_long_l, region_long_r = 28.20, 29.00, 76.85, 77.65
# subset_locs = locs[(locs['Latitude']<region_lat_t) & (locs['Latitude']>region_lat_b) &\
#                    (locs['Longitude']<region_long_r) & (locs['Longitude']>region_long_l)]

# lat_lims = region_lat_b, region_lat_t
# lon_lims = region_long_l, region_long_r
# lat_center, lon_center = (region_lat_b + region_lat_t)/2, (region_long_l + region_long_r)/2

# extent = tilemapbase.Extent.from_lonlat(region_long_l, region_long_r, region_lat_b, region_lat_t)
# extent_proj = extent.to_project_3857

# # use openstreetmap (OSM)
# t = tilemapbase.tiles.Stamen_Toner_Background

# colordict = {'Kaiterra':'r', 'Govt':'b'}

# plt.rc('font', size=20)

# fig, ax = plt.subplots(figsize=(12,12), dpi=200)
# ax.xaxis.set_visible(False)
# ax.yaxis.set_visible(False)

# plotter = tilemapbase.Plotter(extent, t, width=600)
# plotter.plot(ax, t)

# for row in subset_locs.itertuples():
#     x, y = tilemapbase.project(row.Longitude, row.Latitude)
#     if row.Type == 'Kaiterra':
#         obj1 = ax.scatter(x, y, marker='.', color='r', s=10, label='Our sensors')
#     else:
#         obj2 = ax.scatter(x, y, marker='.', color='b', s=10, label='CPCB/DPCC/IMD')
#     ax.text(x, y, row.Index, fontsize=6)

# # ax.legend((obj1, obj2), (obj1.get_label(), obj2.get_label()), loc='lower right', ncol=2)
# fig.savefig('base_map.png',bbox_inches='tight')
# # plt.show()
# plt.close(fig)

## Overlay

In [55]:
base_map = Image.open('./base_map.png')
heatmap = Image.open('./heatmap.png')
base_map = Image.fromarray(np.asarray(base_map)[:-10,12:-3,:])
heatmap = Image.fromarray(np.asarray(heatmap))

binary_map = Image.open('./binmap.png')
binary_map = Image.fromarray(np.asarray(binary_map))
binary_map = binary_map.resize(heatmap.size)

base_map = base_map.resize(heatmap.size)
new_map = Image.blend(base_map, heatmap, alpha=0.7)
# new_map = Image.blend(new_map, binary_map, alpha=0.1)

new_map.save('./final_map.png')

## Measurement snapshot

In [392]:
np.exp(snapshot.squeeze())

field_egg_id
113E                           NaN
1FD7                           NaN
20CA                           NaN
2E9C                           NaN
3ACF                           NaN
498F                           NaN
4BE7                           NaN
56C3                           NaN
5D7A                           NaN
603A                           NaN
72CA                           NaN
8E2A                           NaN
91B8                           NaN
97D7                           NaN
A838                           NaN
A9BE                     56.916667
AnandVihar_DPCC          39.500000
AshokVihar_DPCC          40.500000
AyaNagar_IMD             48.990000
BB4A                           NaN
BC46                           NaN
BFDC                           NaN
BurariCrossing_IMD             NaN
C0A7                           NaN
CBC7                           NaN
CRRIMathuraRoad_IMD      30.790000
D804                           NaN
DF07                           NaN
DKSSR_D