In [2]:
%load_ext autoreload
%autoreload 2
import os
import sys
import pytz
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tilemapbase
tilemapbase.init()

from tqdm import tqdm
from geopy import distance

In [3]:
# Temporal hotspot
def is_win_thigh(win, ff):
    """Flag the centre of a window as a temporal "high" hotspot.

    The centre element (index ``(len(win) - 1) // 2``) is compared with the
    largest of the remaining elements of the window; it is flagged when it
    is at least ``(1 + ff)`` times that maximum.

    Parameters
    ----------
    win : numpy.ndarray
        One-dimensional window of values.
    ff : float
        Threshold fraction by which the centre must exceed its neighbours.

    Returns
    -------
    bool
        True when the centre is a high hotspot.  False when the window has
        fewer than two elements (there is nothing to compare against) or
        when the comparison involves NaN.
    """
    win = np.asarray(win)
    if win.size < 2:
        # no neighbours at all: calling .max() on an empty slice would raise
        return False
    c_ii = (len(win) - 1) // 2
    c = win[c_ii]
    # pool both sides so that a one-sided window (length 2) is handled too
    neighbours = np.concatenate((win[:c_ii], win[c_ii + 1:]))
    return c >= (1 + ff) * neighbours.max()


def is_win_tlow(win, ff):
    """Flag the centre of a window as a temporal "low" hotspot.

    The centre element (index ``(len(win) - 1) // 2``) is compared with the
    smallest of the remaining elements of the window; it is flagged when it
    is at most ``ff`` times that minimum.

    Parameters
    ----------
    win : numpy.ndarray
        One-dimensional window of values.
    ff : float
        Threshold fraction applied to the neighbours' minimum.

    Returns
    -------
    bool
        True when the centre is a low hotspot.  False when the window has
        fewer than two elements (there is nothing to compare against) or
        when the comparison involves NaN.
    """
    win = np.asarray(win)
    if win.size < 2:
        # no neighbours at all: calling .min() on an empty slice would raise
        return False
    c_ii = (len(win) - 1) // 2
    c = win[c_ii]
    # pool both sides so that a one-sided window (length 2) is handled too
    neighbours = np.concatenate((win[:c_ii], win[c_ii + 1:]))
    return c <= ff * neighbours.min()


In [26]:
def _add_hotspot_code(res, flags, code):
    """Add ``code`` to every cell of ``res`` where ``flags`` equals 1.

    Cells that already hold a code are incremented by ``code``; cells that
    are still NaN are set to ``code``.  ``res`` is modified in place.
    """
    hit = (flags == 1)
    res[hit & res.notna()] += code
    res[hit & res.isna()] = code


def get_hotspots(data, sensor, params, locs, savedir, distances_path=None):
    """Detect temporal, spatial and jump hotspots and save them as a table.

    Every (timestamp, location) cell of the result is either NaN (no
    hotspot) or a number whose decimal digits ``abc`` encode the hotspot
    types found there (a digit is 0 when that type was not flagged):

    * ``a`` (hundreds): temporal window hotspot, 9 = high, 1 = low
    * ``b`` (tens):     spatial window hotspot,  9 = high, 1 = low
    * ``c`` (units):    jump hotspot,            9 = high, 1 = low

    Parameters
    ----------
    data : pandas.Series
        Values of a single pollutant with a two-level index; level 0 is
        unstacked into columns (one column per location).
    sensor : str
        Name of the measured quantity; used as a sub-directory of
        ``savedir``.
    params : dict
        Detection parameters with the keys ``'wtr'``, ``'wttf'`` (temporal
        window radius / threshold fraction), ``'wsr'``, ``'wstf'`` (spatial
        window radius / threshold fraction), ``'jtr'``, ``'jtv'``, ``'jsr'``
        (jump smoothing radius / threshold value / spatial radius).
        ``'wsr'`` and ``'jsr'`` are multiplied by 1000 before being compared
        with the distance table (presumably km vs. metres -- confirm
        against how the distance table was built).
    locs : pandas.DataFrame
        Not used by the current implementation; kept so existing callers
        keep working.  It was only referenced by per-hotspot plotting code
        that had been disabled.
    savedir : str
        Root output directory.  The table is written to
        ``<savedir>/<sensor>/<parameter suffix>/table.csv``.
    distances_path : str, optional
        CSV file holding the pairwise distance matrix between locations
        (first column is the index).  Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Hotspot codes, indexed like the unstacked data.
    """
    if distances_path is None:
        distances_path = '/scratch/ab9738/epod-nyu-delhi-pollution/data/combined_distances.csv'

    # get data and distances; data is expected to be a pandas.Series,
    # not pandas.DataFrame, containing only one column (either pm25 or
    # pm10)
    df = data.unstack(level=0)
    distances = pd.read_csv(distances_path, index_col=[0])

    # select only the locations that are in the data
    distances = distances.loc[df.columns, df.columns]

    # invalidate diagonal entries so that sensor M does not get counted
    # in M's own radius (note: this masks *every* zero distance, so any
    # off-diagonal pair at distance 0 is excluded as well)
    distances[distances == 0] = np.nan

    res = pd.DataFrame(index=df.index, columns=df.columns)

    # (1) WINDOW HOTSPOTS

    # **Temporal Window Hotspot**: a timestamp is marked as a temporal
    # window hotspot if the value at that time is greater/lesser than
    # a window (radius "wtr") around it by a threshold fraction "wttf"
    wts = 2*params['wtr'] + 1
    rolling_wt = df.rolling(wts, min_periods=wts, center=True)
    res_win_thigh = rolling_wt.apply(is_win_thigh, raw=True, args=(params['wttf'],))
    res_win_tlow = rolling_wt.apply(is_win_tlow, raw=True, args=(params['wttf'],))
    res[res_win_thigh == 1] = 900
    res[res_win_tlow == 1] = 100

    # **Spatial Window Hotspot**: a location is marked as a spatial
    # window hotspot if, at a given time, the value at that location is
    # greater/lesser than the max/min of the values in a radius ("wsr")
    # around it by a threshold fraction "wstf"
    res_win_shigh = pd.DataFrame(index=df.index, columns=df.columns)
    res_win_slow = pd.DataFrame(index=df.index, columns=df.columns)
    for mid in df.columns:
        neighborhood = (distances.loc[mid] <= params['wsr'] * 1000)
        neighbors = df.loc[:, neighborhood]
        neighborhood_max = neighbors.max(axis=1)
        neighborhood_min = neighbors.min(axis=1)
        res_win_shigh.loc[:, mid] = (df[mid] > ((1 + params['wstf']) * neighborhood_max))
        # no verdict where either side of the comparison is missing
        res_win_shigh.loc[neighborhood_max.isna() | df[mid].isna(), mid] = np.nan
        res_win_slow.loc[:, mid] = (df[mid] < (params['wstf'] * neighborhood_min))
        res_win_slow.loc[neighborhood_min.isna() | df[mid].isna(), mid] = np.nan
    _add_hotspot_code(res, res_win_shigh, 90)
    _add_hotspot_code(res, res_win_slow, 10)

    # (2) JUMP HOTSPOTS

    # for jumps: first the data is smoothened using a rolling window
    # of radius "jtr", then every timestamp where the change from the
    # previous timestamp is greater/lesser than the threshold "jtv" is
    # marked, and finally a timestamp+location is marked as a hotspot
    # if the change in the values at that location is the
    # highest/lowest in a radius "jsr"
    rolling_j = df.rolling(2*params['jtr'] + 1, min_periods=1, center=True).mean().diff()
    res_jump_high = pd.DataFrame(index=df.index, columns=df.columns)
    res_jump_low = pd.DataFrame(index=df.index, columns=df.columns)
    for mid in df.columns:
        neighborhood = (distances.loc[mid] <= params['jsr'] * 1000)
        neighbors = rolling_j.loc[:, neighborhood]
        neighborhood_max = neighbors.max(axis=1)
        neighborhood_min = neighbors.min(axis=1)
        change = rolling_j[mid]
        res_jump_high.loc[:, mid] = ((change > params['jtv']).to_numpy()
                                     & (change > neighborhood_max).to_numpy())
        res_jump_high.loc[change.isna() | neighborhood_max.isna(), mid] = np.nan
        res_jump_low.loc[:, mid] = ((change < -params['jtv']).to_numpy()
                                    & (change < neighborhood_min).to_numpy())
        res_jump_low.loc[change.isna() | neighborhood_min.isna(), mid] = np.nan
    _add_hotspot_code(res, res_jump_high, 9)
    _add_hotspot_code(res, res_jump_low, 1)

    # the output directory name records every parameter of the run
    suffix = 'wtr{}_wttf{:02.0f}_wsr{}_wstf{:02.0f}_jtr{}_jtv{:03.0f}_jsr{}'.format(
        params['wtr'], params['wttf']*100, params['wsr'], params['wstf']*100,
        params['jtr'], params['jtv'], params['jsr'])
    finaldir = os.path.join(savedir, sensor, suffix)
    os.makedirs(finaldir, exist_ok=True)

    res.to_csv(os.path.join(finaldir, 'table.csv'), float_format='%.0f')

    return res

# Main

In [31]:
# which data to analyse
source, sensor = 'combined', 'pm25'
# temporal / spatial resolution tags (used in file and directory names)
res_time, res_space = '3H', '0'
# temporal window hotspot: window radius and threshold fraction
wtr, wttf = 1, 0.5
# spatial window hotspot: neighbourhood radius and threshold fraction
wsr, wstf = 5.0, 0.5
# jump hotspot: smoothing radius and neighbourhood radius
jtr, jsr = 1, 5.0

In [32]:
# every input file is addressed relative to this root (note the trailing
# slash: the relative parts below are appended to it as they are)
filepath_root = '/scratch/ab9738/epod-nyu-delhi-pollution/'

# measurement panels, one file per temporal resolution
filepath_data_kai = os.path.join(
    filepath_root, 'data/kaiterra/kaiterra_fieldeggid_{}_current_panel.csv'.format(res_time))
filepath_data_gov = os.path.join(
    filepath_root, 'data/govdata/govdata_{}_current.csv'.format(res_time))

# monitor location tables
filepath_locs_kai = os.path.join(filepath_root, 'data/kaiterra/kaiterra_locations.csv')
filepath_locs_gov = os.path.join(filepath_root, 'data/govdata/govdata_locations.csv')

In [33]:
# monitor locations of both networks, each tagged with its origin
_merge_keys = ['Monitor ID', 'Latitude', 'Longitude', 'Location', 'Type']
locs_kai = pd.read_csv(filepath_locs_kai, index_col=[0]).assign(Type='Kaiterra')
locs_gov = pd.read_csv(filepath_locs_gov, index_col=[0]).assign(Type='Govt')
locs = locs_kai.merge(locs_gov, how='outer', on=_merge_keys, copy=False)

# readings of the selected sensor from both networks, stacked into one
# series (the first two CSV columns form the index)
data_kai, data_gov = (
    pd.read_csv(path, index_col=[0, 1], parse_dates=True)[sensor]
    for path in (filepath_data_kai, filepath_data_gov)
)
data = pd.concat([data_kai, data_gov], axis=0, copy=False)

In [34]:
# first and last entry of the second index level of the data
_level_values = data.index.levels[1]
start_dt, end_dt = _level_values[0], _level_values[-1]

In [35]:
def _to_ist(ts):
    """Return ``ts`` at a fixed UTC+05:30 offset.

    A timestamp whose zone name already is 'IST' is returned unchanged;
    a timezone-naive timestamp is interpreted as UTC before conversion.
    """
    # tzname is a method: comparing the bound method itself with a string
    # (as the code did before) is always "not equal"
    if ts.tzname() == 'IST':
        return ts
    if ts.tzinfo is None:
        ts = ts.tz_localize('UTC')
    return ts.tz_convert(pytz.FixedOffset(330))


start_dt = _to_ist(start_dt)
end_dt = _to_ist(end_dt)

# now, filter through the start and end dates
data = data.sort_index()
data = data.loc[(slice(None), slice(start_dt, end_dt))]

In [36]:
# jump threshold value: 100 for pm25, 200 for any other sensor
if sensor == 'pm25':
    jtv = 100
else:
    jtv = 200

In [37]:
# bundle the detection parameters under the keys get_hotspots reads
hotspots_params = dict(
    wtr=wtr, wttf=wttf,
    wsr=wsr, wstf=wstf,
    jtr=jtr, jtv=jtv, jsr=jsr,
)

In [38]:
# output/hotspots_revised/<source>/<start>_<end>/tres<res_time>_sres<res_space>
savedir = os.path.join(
    'output', 'hotspots_revised', source,
    '{}_{}'.format(start_dt.strftime('%Y%m%d'), end_dt.strftime('%Y%m%d')),
    'tres{}_sres{}'.format(res_time, res_space),
)
# exist_ok replaces the check-then-create pattern, so re-running the cell
# (or a concurrent run) cannot fail on an already existing directory
os.makedirs(savedir, exist_ok=True)

# compute and save the hotspots

In [39]:
# detect the hotspots; the table is also written below `savedir`
hotspots = get_hotspots(
    data=data,
    sensor=sensor,
    params=hotspots_params,
    locs=locs,
    savedir=savedir,
)

In [40]:
# Each cell is NaN (no hotspot) or a numeric code whose hundreds / tens /
# units digits flag temporal / spatial / jump hotspots (9 = high, 1 = low).
hotspots

field_egg_id,113E,1FD7,20CA,2E9C,3ACF,498F,4BE7,56C3,5D7A,603A,...,Pusa_DPCC,Pusa_IMD,RKPuram_DPCC,Rohini_DPCC,Shadipur_CPCB,Sirifort_CPCB,SoniaVihar_DPCC,SriAurobindoMarg_DPCC,VivekVihar_DPCC,Wazirpur_DPCC
timestamp_round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-05-01 00:00:00+05:30,,,,,,,,,,,...,,,,,,,,,,
2018-05-01 03:00:00+05:30,,,,,,,,,,,...,,,,,,,,,,
2018-05-01 06:00:00+05:30,,,,,,,,,90,,...,,,,,,,,,,
2018-05-01 09:00:00+05:30,,,,,,,,,90,,...,,,900,,,,,,,
2018-05-01 12:00:00+05:30,,,,,,,,,90,,...,,90,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-31 09:00:00+05:30,,,,,,,,,,,...,,,,1,,,,1,1,
2020-10-31 12:00:00+05:30,,,,,,,,,,,...,1,,,1,,,,1,1,
2020-10-31 15:00:00+05:30,,,,,,,,,,,...,,,,,,,,,,
2020-10-31 18:00:00+05:30,,,,,,,,,,,...,,,,,,,,,,


In [41]:
# number of (timestamp, location) cells whose code is exactly 10, i.e.
# cells flagged only as a spatial-window "low"
hotspots.eq(10).sum().sum()

4645