In [1]:
import os
import sys
import pytz
import argparse
# import jax.numpy as jnp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from geopy import distance
import datetime
import tilemapbase
from copy import deepcopy
import pickle as pkl
from PIL import Image
import skimage.measure
import math
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
os.environ['PYTHONWARNINGS']='ignore'
import hyperopt
from joblib import Parallel, delayed
import random
random.seed(42)
import scipy
import torch
from pykrige.ok import OrdinaryKriging
from pykrige.ok3d import OrdinaryKriging3D
from pykrige.uk import UniversalKriging
from sklearn.model_selection import train_test_split
from scipy.interpolate import CubicSpline

In [2]:
# ---- Configuration ---------------------------------------------------------
# `source` selects which table becomes `df` at the end of this cell:
# 'govdata' or 'kaiterra' pick a single network, any other value uses both.
source = 'combined'
sensor = 'pm25'        # column read from the measurement CSVs
res_time = '1H'        # resolution tag embedded in the data file names
filepath_root = '/scratch/ab9738/pollution_with_sensors/'

filepath_data_kai = filepath_root+'data/kaiterra/kaiterra_fieldeggid_{}_current_panel.csv'.format(res_time)
filepath_data_gov = filepath_root+'data/govdata/govdata_{}_current.csv'.format(res_time)
filepath_locs_kai = filepath_root+'data/kaiterra/kaiterra_locations.csv'
filepath_locs_gov = filepath_root+'data/govdata/govdata_locations.csv'

# ---- Monitor locations: tag each row with its network, then combine --------
locs_kai = pd.read_csv(filepath_locs_kai, index_col=[0])
locs_kai['Type'] = 'Kaiterra'
locs_gov = pd.read_csv(filepath_locs_gov, index_col=[0])
locs_gov['Type'] = 'Govt'
locs = pd.merge(locs_kai, locs_gov, how='outer',\
                on=['Monitor ID', 'Latitude', 'Longitude', 'Location', 'Type'], copy=False)

# ---- Measurements: one Series per network, indexed by the first two columns -
data_kai = pd.read_csv(filepath_data_kai, index_col=[0,1], parse_dates=True)[sensor]
data_gov = pd.read_csv(filepath_data_gov, index_col=[0,1], parse_dates=True)[sensor]
data = pd.concat([data_kai, data_gov], axis=0, copy=False)
# Exact zeros are turned into NaN (presumably they mark missing readings -- TODO confirm).
data.replace(0,np.nan,inplace=True)


def _to_ist(ts):
    """Return `ts` at a fixed UTC+05:30 offset; naive timestamps are treated as UTC.

    A timestamp whose zone name is already 'IST' is returned unchanged.
    """
    # `tzname` is a method and has to be called: comparing the bound method
    # itself with 'IST' is always unequal, which made this guard a no-op.
    if ts.tzname() != 'IST':
        if ts.tzinfo is None:
            ts = ts.tz_localize('UTC')
        ts = ts.tz_convert(pytz.FixedOffset(330))  # 330 minutes = +05:30
    return ts


# First and last entries of the second index level bound the time slice below.
start_dt = _to_ist(data.index.levels[1][0])
end_dt = _to_ist(data.index.levels[1][-1])

# now, filter through the start and end dates
data.sort_index(inplace=True)
data = data.loc[(slice(None), slice(start_dt, end_dt))]

# Pivot index level 0 into columns. Note that the single-network branches use
# the per-network Series as read from disk, i.e. without the zero replacement
# and time slicing applied to `data` above.
if(source=='govdata'):
    df = data_gov.unstack(level=0)
elif(source=='kaiterra'):
    df = data_kai.unstack(level=0)
else:
    df = data.unstack(level=0)

# Distance table restricted to the columns of `df`; zero entries are masked as NaN.
distances = pd.read_csv(filepath_root+'data/combined_distances.csv', index_col=[0])
distances = distances.loc[df.columns, df.columns]
distances[distances == 0] = np.nan

In [3]:
# Remove the 'Pusa_IMD' column before any hotspot analysis.
df = df.drop(columns=['Pusa_IMD'])

In [4]:
# "missing_sensors_<fraction>.csv" tables, one per fraction 0.1 .. 0.5
# (presumably the fraction of sensors withheld -- TODO confirm).
# Each is read with its first column as a date-parsed index.
_missing_read_opts = dict(index_col=[0], parse_dates=True)
df_m10 = pd.read_csv("missing_sensors_0.1.csv", **_missing_read_opts)
df_m20 = pd.read_csv("missing_sensors_0.2.csv", **_missing_read_opts)
df_m30 = pd.read_csv("missing_sensors_0.3.csv", **_missing_read_opts)
df_m40 = pd.read_csv("missing_sensors_0.4.csv", **_missing_read_opts)
df_m50 = pd.read_csv("missing_sensors_0.5.csv", **_missing_read_opts)

In [5]:
# Map each percentage key (10 .. 50) to its "missing sensors" frame.
df_missing = {
    10: df_m10,
    20: df_m20,
    30: df_m30,
    40: df_m40,
    50: df_m50,
}

In [6]:
# "missing_sensors_interpolated_<fraction>.csv" tables, one per fraction 0.1 .. 0.5.
# Each is read with its first column as a date-parsed index.
_interp_read_opts = dict(index_col=[0], parse_dates=True)
df_i10 = pd.read_csv("missing_sensors_interpolated_0.1.csv", **_interp_read_opts)
df_i20 = pd.read_csv("missing_sensors_interpolated_0.2.csv", **_interp_read_opts)
df_i30 = pd.read_csv("missing_sensors_interpolated_0.3.csv", **_interp_read_opts)
df_i40 = pd.read_csv("missing_sensors_interpolated_0.4.csv", **_interp_read_opts)
df_i50 = pd.read_csv("missing_sensors_interpolated_0.5.csv", **_interp_read_opts)

In [7]:
# Map each percentage key (10 .. 50) to its interpolated frame.
df_interpolated = {
    10: df_i10,
    20: df_i20,
    30: df_i30,
    40: df_i40,
    50: df_i50,
}

## APH Paper Hotspots

In [8]:
def _to_daily_mean(frame):
    """Average all rows that share a calendar date and return the result
    re-indexed by a DatetimeIndex of those dates."""
    daily = frame.groupby(frame.index.date).mean()
    daily.index = pd.to_datetime(daily.index)
    return daily


# Collapse the reference frame and every missing/interpolated variant to daily means.
df = _to_daily_mean(df)
for key in [10,20,30,40,50]:
    df_missing[key] = _to_daily_mean(df_missing[key])
    df_interpolated[key] = _to_daily_mean(df_interpolated[key])

In [9]:
def process_month(df_month, threshold=60, scale_threshold=90,
                  freq_fraction=0.6, max_missing=10, min_run=3):
    """Classify the columns of one month of data by three exceedance criteria.

    Parameters
    ----------
    df_month : pandas.DataFrame
        One column per sensor, one row per observation.
    threshold : float, default 60
        Level a reading must exceed to count as an exceedance.
    scale_threshold : float, default 90
        Level the column mean must exceed for the "scale" criterion.
    freq_fraction : float, default 0.6
        Fraction of non-NaN readings that must be exceedances for "freq".
    max_missing : int, default 10
        Columns with more NaN entries than this are skipped entirely.
    min_run : int, default 3
        Minimum length of an unbroken run of exceedances for "cons".

    Returns
    -------
    tuple of (list, list, list)
        ``(freq, scale, cons)``: column labels meeting each criterion, in
        column order. A column may appear in several lists.

    Notes
    -----
    Runs are measured after NaN rows are dropped, so two exceedances separated
    only by missing rows count as adjacent.
    """
    freq, scale, cons = [], [], []
    for sensor in df_month.columns:
        series = df_month[sensor]
        if series.isna().sum() > max_missing:
            continue

        observed = series.dropna()
        exceeds = observed > threshold

        if exceeds.sum() > freq_fraction * len(observed):
            freq.append(sensor)
        if series.mean() > scale_threshold:
            scale.append(sensor)

        # Position of each exceedance inside its run (1, 2, 3, ...); 0 elsewhere.
        flags = exceeds.astype(int)
        run_id = (flags != flags.shift()).cumsum()
        run_pos = flags * (flags.groupby(run_id).cumcount() + 1)
        # The builtin max() raises on an empty sequence; a column with no
        # usable readings simply has no run.
        if not run_pos.empty and run_pos.max() >= min_run:
            cons.append(sensor)
    return (freq, scale, cons)

In [10]:
def count_hsps(dataframe):
    """Run `process_month` on every month of the fixed study window.

    The window is May-December 2018, all of 2019 and January-September 2020.
    `dataframe` must have an index exposing `.year` and `.month`.

    Returns a nested dict ``{year: {month: (freq, scale, cons)}}`` where each
    tuple is the result of `process_month` for the rows of that year and month.
    """
    month_spans = {
        2018: range(5, 13),
        2019: range(1, 13),
        2020: range(1, 10),
    }
    hsps = {}
    for year, months in month_spans.items():
        in_year = dataframe[dataframe.index.year == year]
        hsps[year] = {
            month: process_month(in_year[in_year.index.month == month])
            for month in months
        }
    return hsps

In [11]:
def find_tp_fp_fn(gt, pred):
    """Compare reference and predicted label lists, criterion by criterion.

    `gt` and `pred` are indexable with 0, 1 and 2, each position holding a
    collection of labels. For every position, labels present in both count as
    true positives, labels only in `pred` as false positives and labels only
    in `gt` as false negatives. The results of the three positions are pooled.

    Returns ``(tp_list, fp_list, fn_list)``.
    """
    tp_list = []
    fp_list = []
    fn_list = []
    for criterion in range(3):
        reference = gt[criterion]
        predicted = pred[criterion]
        for label in predicted:
            if label in reference:
                tp_list.append(label)
        for label in predicted:
            if label not in reference:
                fp_list.append(label)
        for label in reference:
            if label not in predicted:
                fn_list.append(label)
    return (tp_list, fp_list, fn_list)

In [12]:
# Reference labels: per-year, per-month results of count_hsps on the full frame `df`.
# The cells below score the missing/interpolated variants against this.
gt = count_hsps(df)

In [13]:
# Score the "missing sensors" frames against the reference labels `gt`:
# one "precision recall" line is printed per percentage key.
for key in [10,20,30,40,50]:
    pred = count_hsps(df_missing[key])

    # Pool matches over every year and month before computing the ratios.
    tp_list, fp_list, fn_list = [], [], []
    for year, gt_by_month in gt.items():
        for month, gt_month in gt_by_month.items():
            tp, fp, fn = find_tp_fp_fn(gt_month, pred[year][month])
            tp_list.extend(tp)
            fp_list.extend(fp)
            fn_list.extend(fn)

    n_tp = len(tp_list)
    precision = n_tp/(n_tp+len(fp_list))
    recall = n_tp/(n_tp+len(fn_list))
    print(precision,recall)

1.0 0.9166224110462029
1.0 0.8093467870419543
1.0 0.7424322889006904
1.0 0.6165693043016464
1.0 0.5315985130111525


In [14]:
# Score the interpolated frames against the reference labels `gt`:
# one "precision recall" line is printed per percentage key.
for key in [10,20,30,40,50]:
    pred = count_hsps(df_interpolated[key])

    # Pool matches over every year and month before computing the ratios.
    tp_list, fp_list, fn_list = [], [], []
    for year, gt_by_month in gt.items():
        for month, gt_month in gt_by_month.items():
            tp, fp, fn = find_tp_fp_fn(gt_month, pred[year][month])
            tp_list.extend(tp)
            fp_list.extend(fp)
            fn_list.extend(fn)

    n_tp = len(tp_list)
    precision = n_tp/(n_tp+len(fp_list))
    recall = n_tp/(n_tp+len(fn_list))
    print(precision,recall)

0.9978529253891573 0.987254381306426
0.9929805615550756 0.9766330323951142
0.9918610960390667 0.9707912904938927
0.983013698630137 0.9527349973446628
0.9791094007696537 0.9458311205523101
