# Import Statements

In [27]:
%load_ext autoreload
%autoreload 2
import os
import sys
import pytz
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from geopy import distance
import datetime
import tilemapbase
from copy import deepcopy
import pickle as pkl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Relative Spikes

In [1]:
# Temporal hotspot
def is_win_thigh(win, ff):
    # "win" is of type numpy.ndarray
    c_ii = (len(win)-1)//2
    c = win[c_ii]
    m = np.maximum(win[:c_ii].max(), win[c_ii+1:].max())
    r = c >= (1 + ff)*m
    return r


def is_win_tlow(win, ff):
    # "win" is of type numpy.ndarray
    c_ii = (len(win)-1)//2
    c = win[c_ii]
    m = np.minimum(win[:c_ii].min(), win[c_ii+1:].min())
    r = c <= ff*m
    return r


def get_spikes(data, sensor, params, locs):

    # get data and distances; data is expected to be a pandas.Series,
    # not pandas.DataFrame, containing only one column (either pm25 or
    # pm10)
    df = data.unstack(level=0)
    distances = pd.read_csv('/scratch/ab9738/epod-nyu-delhi-pollution/data/combined_distances.csv', index_col=[0])

    # select only the locations that are in the data
    distances = distances.loc[df.columns, df.columns]

    # invalidate diagonal entries so that sensor M does not get
    # counted in the M's radius
    distances[distances == 0] = np.nan

    # res: three digit entries 'abc' or NaN, where a/b/c = 1 or 9
    #
    # a == 9 => thigh, a == 1 => tlow
    # b == 9 => shigh, b == 1 => slow
    # c == 9 => jhigh, c == 1 => jlow
    res = pd.DataFrame(index=df.index, columns=df.columns)

    # (1) WINDOW HOTSPOTS

    # **Temporal Window Hotspot**: a timestamp is marked as a temporal
    # window hotspot if the value at that time is greater/lesser than
    # a window (radius "wtr") around it by a threshold fraction "wttf"
    wts = 2*params['wtr'] + 1
    rolling_wt = df.rolling(wts, min_periods=wts, center=True)
    res_win_thigh = rolling_wt.apply(is_win_thigh, raw=True, args=(params['wttf'],))
    res_win_tlow = rolling_wt.apply(is_win_tlow, raw=True, args=(params['wttf'],))
    res[res_win_thigh == 1] = 900
    res[res_win_tlow == 1] = 100

    # **Spatial Window Hotspot**: A location is marked as a
    # spatial window hotspot if, at a given time, the value at
    # that location is greater/lesser than the max of values in a
    # radius ("wsr") around it by a threshold frac "wstf"
    res_win_shigh = pd.DataFrame(index=df.index, columns=df.columns)
    res_win_slow = pd.DataFrame(index=df.index, columns=df.columns)
    for mid in df.columns:
        neighborhood = (distances.loc[mid] <= params['wsr'] * 1000)
        neighborhood_max = df.loc[:,neighborhood].max(axis=1)
        neighborhood_min = df.loc[:,neighborhood].min(axis=1)
        res_win_shigh.loc[:, mid] = (df[mid] > ((1 + params['wstf']) * neighborhood_max))
        res_win_shigh.loc[neighborhood_max.isna() | df[mid].isna(), mid] = np.nan
        res_win_slow.loc[:, mid]  = (df[mid] < (params['wstf'] * neighborhood_min))
        res_win_slow.loc[neighborhood_min.isna() | df[mid].isna(), mid] = np.nan
    res[(res_win_shigh == 1) & res.notna()] += 90
    res[(res_win_shigh == 1) & res.isna()] = 90
    res[(res_win_slow == 1) & res.notna()] += 10
    res[(res_win_slow == 1) & res.isna()] = 10

    # (2) JUMP HOTSPOTS

    # for jumps: first the data is smoothened using a rolling window
    # of radius "jtr", then every timestamp where the change from the
    # previous timestamp is greater/lesser than the threshold "jtv" is
    # marked, and finally a timestamp+location is marked as a hotspot
    # if the change in the values at that location is the
    # highest/lowest in a radius "jsr"
    rolling_j = df.rolling(2*params['jtr'] + 1, min_periods=1, center=True).mean().diff()
    res_jump_high = pd.DataFrame(index=df.index, columns=df.columns)
    res_jump_low = pd.DataFrame(index=df.index, columns=df.columns)
    for mid in df.columns:
        neighborhood = (distances.loc[mid] <= params['jsr'] * 1000)
        neighborhood_max = rolling_j.loc[:,neighborhood].max(axis=1)
        neighborhood_min = rolling_j.loc[:,neighborhood].min(axis=1)
        res_jump_high.loc[:, mid] = (rolling_j[mid] > params['jtv']).to_numpy() &\
        (rolling_j[mid] > neighborhood_max).to_numpy()
        res_jump_high.loc[rolling_j[mid].isna() | neighborhood_max.isna(), mid] = np.nan
        res_jump_low.loc[:, mid] = (rolling_j[mid] < -params['jtv']).to_numpy() &\
        (rolling_j[mid] < neighborhood_min).to_numpy()
        res_jump_low.loc[rolling_j[mid].isna() | neighborhood_min.isna(), mid] = np.nan
    res[(res_jump_high == 1) & res.notna()] += 9
    res[(res_jump_high == 1) & res.isna()] = 9
    res[(res_jump_low == 1) & res.notna()] += 1
    res[(res_jump_low == 1) & res.isna()] = 1
    
    return res

# Parameters for Data and Spikes

In [2]:
source = 'combined'
sensor = 'pm25'
res_time = '3H'
res_space = '0'
wtr = 1
wttf = 0.5
wsr = 5.0
wstf = 0.5
jtr = 1
jsr = 5.0
jtv = 100 if sensor == 'pm25' else 200

# Data Loading

In [19]:
filepath_root = '/scratch/ab9738/epod-nyu-delhi-pollution/'
filepath_data_kai = filepath_root+'data/kaiterra/kaiterra_fieldeggid_{}_current_panel.csv'.format(res_time)
filepath_data_gov = filepath_root+'data/govdata/govdata_{}_current.csv'.format(res_time)
filepath_locs_kai = filepath_root+'data/kaiterra/kaiterra_locations.csv'
filepath_locs_gov = filepath_root+'data/govdata/govdata_locations.csv'

locs_kai = pd.read_csv(filepath_locs_kai, index_col=[0])
locs_kai['Type'] = 'Kaiterra'
locs_gov = pd.read_csv(filepath_locs_gov, index_col=[0])
locs_gov['Type'] = 'Govt'
locs = pd.merge(locs_kai, locs_gov, how='outer',\
                on=['Monitor ID', 'Latitude', 'Longitude', 'Location', 'Type'], copy=False)
data_kai = pd.read_csv(filepath_data_kai, index_col=[0,1], parse_dates=True)[sensor]
data_gov = pd.read_csv(filepath_data_gov, index_col=[0,1], parse_dates=True)[sensor]
data = pd.concat([data_kai, data_gov], axis=0, copy=False)

start_dt = data.index.levels[1][0]
end_dt = data.index.levels[1][-1]

if start_dt.tzname != 'IST':
        if start_dt.tzinfo is None:
            start_dt = start_dt.tz_localize('UTC')
        start_dt = start_dt.tz_convert(pytz.FixedOffset(330))
    
if end_dt.tzname != 'IST':
    if end_dt.tzinfo is None: 
        end_dt = end_dt.tz_localize('UTC')
    end_dt = end_dt.tz_convert(pytz.FixedOffset(330))

# now, filter through the start and end dates
data.sort_index(inplace=True)
data = data.loc[(slice(None), slice(start_dt, end_dt))]

if(source=='govdata'):
    df = data_gov.unstack(level=0)
elif(source=='kaiterra'):
    df = data_kai.unstack(level=0)
else:
    df = data.unstack(level=0)
distances = pd.read_csv('/scratch/ab9738/epod-nyu-delhi-pollution/data/combined_distances.csv', index_col=[0])
distances = distances.loc[df.columns, df.columns]
distances[distances == 0] = np.nan

# Generate Spikes

In [20]:
spikes_params = {'wtr':wtr, 'wttf':wttf, 'wsr':wsr, 'wstf':wstf, 'jtr':jtr, 'jtv':jtv, 'jsr':jsr}
spikes = get_spikes(data, sensor, spikes_params, locs)

In [22]:
spikes.to_csv('spikes.csv')

# Temporal Hotspots

In [45]:
th = 150
base_win_factor = 5

In [61]:
def find_temporal_high(arr, th):
    peak_indices = []
    hotspot_indices = []
    hotspot_windows = []
    plateau = 0
    for i in range(1,len(arr)-1):
        if(arr[i]>arr[i-1] and arr[i]>arr[i+1]):
            peak_indices.append(i)
    for ind in peak_indices:
        left_ind = ind-1
        right_ind = ind+1
        while(left_ind!=-1):
            if(arr[left_ind]<arr[left_ind+1]):
                left_ind -= 1
            else:
                break
        while(right_ind!=len(arr)):
            if(arr[right_ind]<arr[right_ind-1]):
                right_ind += 1
            else:
                break
        right_ind -= 1
        left_ind += 1
        if(left_ind==-1):
            left_ind = 0
        if(right_ind==len(arr)):
            right_ind = len(arr)-1
        val_left = arr[left_ind]
        val_right = arr[right_ind]
        val_peak = arr[ind]
        win_size = right_ind-left_ind+1
        base_window_left = arr[max(left_ind-base_win_factor*win_size,0):left_ind]
        base_window_right = arr[right_ind+1:max(right_ind+base_win_factor*win_size,len(arr)-1)]
        base_window = np.concatenate((base_window_left,base_window_right))
        base_height = np.mean(base_window)
#         if(val_peak-val_right>th or val_peak-val_left>th):
#             hotspot_indices.append(ind)
#             hotspot_windows.append((left_ind,right_ind))
        if(val_peak-base_height>th):
            hotspot_indices.append(ind)
            hotspot_windows.append((left_ind,right_ind))
        if(abs(val_right-val_left)>th):
            plateau+=1
    return hotspot_indices, hotspot_windows, plateau

def find_temporal_low(arr, th):
    peak_indices = []
    hotspot_indices = []
    hotspot_windows = []
    plateau = 0
    for i in range(1,len(arr)-1):
        if(arr[i]<arr[i-1] and arr[i]<arr[i+1]):
            peak_indices.append(i)
    for ind in peak_indices:
        left_ind = ind-1
        right_ind = ind+1
        while(left_ind!=-1):
            if(arr[left_ind]>arr[left_ind+1]):
                left_ind -= 1
            else:
                break
        while(right_ind!=len(arr)):
            if(arr[right_ind]>arr[right_ind-1]):
                right_ind += 1
            else:
                break
        right_ind -= 1
        left_ind += 1
        if(left_ind==-1):
            left_ind = 0
        if(right_ind==len(arr)):
            right_ind = len(arr)-1
        val_left = arr[left_ind]
        val_right = arr[right_ind]
        val_peak = arr[ind] 
        win_size = right_ind-left_ind+1
        base_window_left = arr[max(left_ind-base_win_factor*win_size,0):left_ind]
        base_window_right = arr[right_ind+1:max(right_ind+base_win_factor*win_size,len(arr)-1)]
        base_window = np.concatenate((base_window_left,base_window_right))
        base_height = np.mean(base_window)
        if(abs(val_peak-base_height)>th):
            hotspot_indices.append(ind)
            hotspot_windows.append((left_ind,right_ind))
#         if(abs(val_peak-val_right)>th or abs(val_peak-val_left)>th):
#             hotspot_indices.append(ind)
#             hotspot_windows.append((left_ind,right_ind))
        if(abs(val_right-val_left)>th):
            plateau+=1
    return hotspot_indices, hotspot_windows, plateau

In [62]:
num_hsps, num_plts = 0,0
indices, windows = [], []
thsp_high = {}
for col in list(df.columns):
    thsp_high[col] = []
    arr = df[col].dropna().to_numpy()
    ts = df[col].dropna().index.to_numpy()
    hsp_ind, hsp_win, plat = find_temporal_high(arr, th)
    for i in range(len(hsp_ind)):
        index = ts[hsp_ind[i]]
        value = df[col].dropna().loc[index]
        hsp_indices = ts[hsp_win[i][0]:hsp_win[i][1]+1]
        thsp_high[col].append([index, value, hsp_indices])
        
    indices += hsp_ind
    num_hsps += len(hsp_ind)
    num_plts += plat
print(num_hsps, num_plts)

8906 2284


In [63]:
num_hsps, num_plts = 0,0
indices, windows = [], []
thsp_low = {}
for col in list(df.columns):
    thsp_low[col] = []
    arr = df[col].dropna().to_numpy()
    ts = df[col].dropna().index.to_numpy()
    hsp_ind, hsp_win, plat = find_temporal_low(arr, th)
    for i in range(len(hsp_ind)):
        index = ts[hsp_ind[i]]
        value = df[col].dropna().loc[index]
        hsp_indices = ts[hsp_win[i][0]:hsp_win[i][1]+1]
        thsp_low[col].append([index, value, hsp_indices])
        
    indices += hsp_ind
    num_hsps += len(hsp_ind)
    num_plts += plat
print(num_hsps, num_plts)

2394 3158


In [64]:
with open('time_high_3H.pkl','wb') as file:
    pkl.dump(thsp_high, file)
with open('time_low_3H.pkl','wb') as file:
    pkl.dump(thsp_low, file)

# Spatial hotspots

In [32]:
nn_dict = {}
for col in df.columns:
    dist_df = distances.loc[col].sort_values().dropna()
    nn_dict[col] = dist_df

In [57]:
th = 150
rad_th = 10
base_rad_factor = 2

In [67]:
def area(x1, y1, x2, y2, x3, y3):
 
    return abs((x1 * (y2 - y3) + x2 * (y3 - y1)
                + x3 * (y1 - y2)) / 2.0)

def isInside(x1, y1, x2, y2, x3, y3, x, y):
 
    # Calculate area of triangle ABC
    A = area (x1, y1, x2, y2, x3, y3)
 
    # Calculate area of triangle PBC
    A1 = area (x, y, x2, y2, x3, y3)
     
    # Calculate area of triangle PAC
    A2 = area (x1, y1, x, y, x3, y3)
     
    # Calculate area of triangle PAB
    A3 = area (x1, y1, x2, y2, x, y)
     
    # Check if sum of A1, A2 and A3
    # is same as A
    if(A == A1 + A2 + A3):
        return True
    else:
        return False

def find_interior_points(hsp, pt, nnk, rem_pts):
    hsp_cord = (locs.loc[hsp]['Latitude'],locs.loc[hsp]['Longitude'])
    pt_cord = (locs.loc[pt]['Latitude'],locs.loc[pt]['Longitude'])
    nnk_cord = (locs.loc[nnk]['Latitude'],locs.loc[nnk]['Longitude'])
    interior_pts = []
    for point in rem_pts:
        cord = (locs.loc[point]['Latitude'],locs.loc[point]['Longitude'])
        if(isInside(hsp_cord[0],hsp_cord[1],pt_cord[0],pt_cord[1],nnk_cord[0],nnk_cord[1],cord[0],cord[1])):
            interior_pts.append(point)
    return interior_pts

def check_interior_condition_high(hsp, hsp_set, nnk, snap):
#     if(snap[hsp]-snap[nnk]<th):
#         return False
    if(snap[hsp]-snap[nnk]<0):
        return False
    if(len(hsp_set)<=1):
        return True
    
    for pt in hsp_set:
        int_pts = find_interior_points(hsp, pt, nnk, [x for x in hsp_set if x != pt])
        for intpt in int_pts:
            if(snap[intpt]<snap[nnk] or snap[intpt]<snap[pt]):
                return False
    return True

def check_interior_condition_low(hsp, hsp_set, nnk, snap):
#     if(snap[hsp]-snap[nnk]>-th):
#         return False
    if(snap[hsp]-snap[nnk]>0):
        return False
    if(len(hsp_set)<=1):
        return True
    
    for pt in hsp_set:
        int_pts = find_interior_points(hsp, pt, nnk, [x for x in hsp_set if x != pt])
        for intpt in int_pts:
            if(snap[intpt]>snap[nnk] or snap[intpt]>snap[pt]):
                return False
    return True

In [68]:
shsp_high = []
for ts in tqdm(list(df.index)):
    df_snap = df.loc[ts].dropna()
    loc_list = list(df_snap.index)
    for location in loc_list:
        available_locs = [x for x in loc_list if x != location]
        nn_locs = list(nn_dict[location][nn_dict[location]<rad_th*1000].index)
        avg_locs = list(nn_dict[location][nn_dict[location]<base_rad_factor*rad_th*1000].index)
        nn_list = [x for x in nn_locs if x in available_locs]
        avg_list = [x for x in avg_locs if x in available_locs]
        avg_reading = df_snap[avg_list].mean()
        if(df_snap[location]-avg_reading>th):
            hsp = location
            hsp_set = []
            for nnk in nn_list:
                if(check_interior_condition_high(hsp,hsp_set,nnk,df_snap)):
                    hsp_set.append(nnk)
            if(len(hsp_set)):
                shsp_high.append([ts,hsp,hsp_set])

100%|██████████| 7320/7320 [07:10<00:00, 16.99it/s]  


In [69]:
shsp_low = []
for ts in tqdm(list(df.index)):
    df_snap = df.loc[ts].dropna()
    loc_list = list(df_snap.index)
    for location in loc_list:
        available_locs = [x for x in loc_list if x != location]
        nn_locs = list(nn_dict[location][nn_dict[location]<rad_th*1000].index)
        avg_locs = list(nn_dict[location][nn_dict[location]<base_rad_factor*rad_th*1000].index)
        nn_list = [x for x in nn_locs if x in available_locs]
        avg_list = [x for x in avg_locs if x in available_locs]
        avg_reading = df_snap[avg_list].mean()
        if(df_snap[location]-avg_reading>th):
            hsp = location
            hsp_set = []
            for nnk in nn_list:
                if(check_interior_condition_low(hsp,hsp_set,nnk,df_snap)):
                    hsp_set.append(nnk)
            if(len(hsp_set)):
                shsp_low.append([ts,hsp,hsp_set])

100%|██████████| 7320/7320 [04:07<00:00, 29.58it/s]


In [72]:
len(shsp_high)

2198

In [73]:
len(shsp_low)

644

In [74]:
with open('space_high_3H.pkl','wb') as file:
    pkl.dump(shsp_high, file)
with open('space_low_3H.pkl','wb') as file:
    pkl.dump(shsp_low, file)