# Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import pytz
import argparse
# import jax.numpy as jnp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from geopy import distance
import datetime
import tilemapbase
from copy import deepcopy
import pickle as pkl
from PIL import Image
import skimage.measure
import math
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
os.environ['PYTHONWARNINGS']='ignore'
import hyperopt
from joblib import Parallel, delayed
import random
random.seed(42)
import scipy
import torch
from pykrige.ok import OrdinaryKriging
from pykrige.ok3d import OrdinaryKriging3D
from pykrige.uk import UniversalKriging
from sklearn.model_selection import train_test_split
from scipy.interpolate import CubicSpline

# Data Loading

In [2]:
# --- Notebook configuration -------------------------------------------------
# source   : which network feeds the analysis frame; the loader below branches
#            on 'govdata' / 'kaiterra' and treats anything else as the union.
# sensor   : name of the measurement column read from the CSV panels.
# res_time : resolution tag that is formatted into the data file names.
source = 'combined'
sensor = 'pm25'
res_time = '1H'

# Project root. Defaults to the original cluster location but can be pointed
# elsewhere by exporting HIDDEN_HOTSPOTS_ROOT before starting the kernel.
# Joining with '' guarantees a trailing separator, which the plain string
# concatenation used to build file paths further down relies on.
filepath_root = os.path.join(
    os.environ.get('HIDDEN_HOTSPOTS_ROOT', '/scratch/ab9738/hidden_hotspots/'),
    '',
)

In [20]:
# Load the Kaiterra and government monitor panels, merge their location tables,
# and build the wide (timestamp x monitor) frame `df` plus the matching
# monitor-to-monitor distance matrix.

def _to_ist(ts):
    """Return `ts` expressed at a fixed +05:30 offset unless it already reports 'IST'.

    Naive timestamps are first localized as UTC. `tzname` is a method, so it
    has to be called; comparing the bound method itself against 'IST' is
    always unequal.
    """
    if ts.tzinfo is None:
        ts = ts.tz_localize('UTC')
    if ts.tzname() != 'IST':
        ts = ts.tz_convert(pytz.FixedOffset(330))
    return ts


filepath_data_kai = filepath_root+'data/kaiterra/kaiterra_fieldeggid_{}_current_panel.csv'.format(res_time)
filepath_data_gov = filepath_root+'data/govdata/govdata_{}_current.csv'.format(res_time)
filepath_locs_kai = filepath_root+'data/kaiterra/kaiterra_locations.csv'
filepath_locs_gov = filepath_root+'data/govdata/govdata_locations.csv'
filepath_distances = filepath_root+'data/combined_distances.csv'

# Location tables, tagged with their network of origin and outer-merged on
# every listed column.
locs_kai = pd.read_csv(filepath_locs_kai, index_col=[0])
locs_kai['Type'] = 'Kaiterra'
locs_gov = pd.read_csv(filepath_locs_gov, index_col=[0])
locs_gov['Type'] = 'Govt'
locs = pd.merge(locs_kai, locs_gov, how='outer',
                on=['Monitor ID', 'Latitude', 'Longitude', 'Location', 'Type'], copy=False)

# Measurement panels: the first two CSV columns form the index; only the
# configured `sensor` column is kept, giving one Series per network.
data_kai = pd.read_csv(filepath_data_kai, index_col=[0,1], parse_dates=True)[sensor]
data_gov = pd.read_csv(filepath_data_gov, index_col=[0,1], parse_dates=True)[sensor]
data = pd.concat([data_kai, data_gov], axis=0, copy=False)
# Exact zeros are replaced by NaN in the combined series.
data = data.replace(0, np.nan)

# First and last entries of the second index level, normalised to +05:30.
start_dt = _to_ist(data.index.levels[1][0])
end_dt = _to_ist(data.index.levels[1][-1])

# now, filter through the start and end dates
# (both bounds come from the data's own index level, so the slice spans the
# full available range)
data = data.sort_index()
data = data.loc[(slice(None), slice(start_dt, end_dt))]

# Pivot to wide format: the first index level becomes the columns.
# NOTE(review): the 'govdata' / 'kaiterra' branches use the per-network series,
# which did not go through the zero -> NaN replacement or the date filter
# applied to `data` above -- confirm this is intended.
if source == 'govdata':
    df = data_gov.unstack(level=0)
elif source == 'kaiterra':
    df = data_kai.unstack(level=0)
else:
    df = data.unstack(level=0)

# Pairwise distances restricted to the monitors present in `df`; zero entries
# are turned into NaN.
distances = pd.read_csv(filepath_distances, index_col=[0])
distances = distances.loc[df.columns, df.columns]
distances = distances.mask(distances == 0)

In [21]:
from statsmodels.tsa.stattools import adfuller

In [22]:
# Augmented Dickey-Fuller test, one monitor at a time, on the raw readings.
# Missing observations are dropped before testing, so each tested series is
# the sequence of available values only. Only the p-value (second element of
# the adfuller result) is reported.
for sensor_id, series in df.items():
    observed = series.dropna().values
    p_value = adfuller(observed)[1]
    print('sensor:{}, p-value: {}'.format(sensor_id, p_value))

sensor:113E, p-value: 9.635986211871643e-06
sensor:1FD7, p-value: 0.0421570029513192
sensor:20CA, p-value: 9.511182423179522e-05
sensor:2E9C, p-value: 0.04375856614360218
sensor:3ACF, p-value: 0.976760910009257
sensor:498F, p-value: 0.0012742462166559585
sensor:4BE7, p-value: 1.1479493083420188e-06
sensor:56C3, p-value: 0.00014606552718580303
sensor:5D7A, p-value: 1.526560503750724e-05
sensor:603A, p-value: 0.0006723684866630278
sensor:72CA, p-value: 2.995599847820941e-06
sensor:8E2A, p-value: 1.985092878650329e-06
sensor:91B8, p-value: 0.002052228792883589
sensor:97D7, p-value: 0.0003981522968403064
sensor:A838, p-value: 0.0008380210760562134
sensor:A9BE, p-value: 7.482885838889009e-06
sensor:AnandVihar_DPCC, p-value: 3.975582214271166e-10
sensor:AshokVihar_DPCC, p-value: 1.529669793185092e-09
sensor:AyaNagar_IMD, p-value: 4.5200002747668913e-10
sensor:BB4A, p-value: 0.008776908107393585
sensor:BC46, p-value: 0.00012571323418282014
sensor:BFDC, p-value: 0.39986426386592117
sensor:Bura

In [23]:
# Switch the wide frame to natural-log scale.
# NOTE: this rebinds `df`, so re-running the cell applies the log a second
# time; re-run the loading cell first when re-executing.
df = df.pipe(np.log)

In [24]:
# Augmented Dickey-Fuller test per monitor, now on the log-transformed frame.
# Missing observations are dropped before testing; only the p-value (second
# element of the adfuller result) is printed.
for sensor_id, series in df.items():
    observed = series.dropna().values
    p_value = adfuller(observed)[1]
    print('sensor:{}, p-value: {}'.format(sensor_id, p_value))

sensor:113E, p-value: 7.518600145906119e-05
sensor:1FD7, p-value: 0.009257643873516842
sensor:20CA, p-value: 0.0014838358381846404
sensor:2E9C, p-value: 0.027950347793309883
sensor:3ACF, p-value: 0.164435070729053
sensor:498F, p-value: 0.028944669019323806
sensor:4BE7, p-value: 3.8549335500937e-05
sensor:56C3, p-value: 0.0002058104820923644
sensor:5D7A, p-value: 3.052555674864269e-05
sensor:603A, p-value: 0.0006820825985258879
sensor:72CA, p-value: 9.388407736664602e-05
sensor:8E2A, p-value: 0.00015290437912993712
sensor:91B8, p-value: 0.005130190710158837
sensor:97D7, p-value: 0.0006518173663447429
sensor:A838, p-value: 0.0005145542215528568
sensor:A9BE, p-value: 1.874468909237184e-05
sensor:AnandVihar_DPCC, p-value: 1.5067080808350507e-07
sensor:AshokVihar_DPCC, p-value: 1.0126585850327229e-06
sensor:AyaNagar_IMD, p-value: 1.096700074965333e-09
sensor:BB4A, p-value: 0.1336447482415417
sensor:BC46, p-value: 0.0001826333036101168
sensor:BFDC, p-value: 0.004439840726677537
sensor:Burari

In [25]:
# Remove the average hour-of-day profile from the log-scale frame.
#
# Steps:
#   1. compute the mean log value per (monitor, hour of day);
#   2. fit three cubic splines per monitor (hours 0-7, 8-15, 16-23) and
#      evaluate them on a 0.01-hour grid;
#   3. keep the values at whole hours, average them across monitors
#      (excluding 'Pusa_IMD'), and subtract that profile from every row of `df`.
#
# Outputs: `df` (profile removed), `df_full` (before removal) and
# `df_spline` (= df_full - df, i.e. the removed component).

# Long format, one row per observation, on log scale.
sens = np.log(data).to_frame().reset_index()

sens['hour_of_day'] = sens['timestamp_round'].apply(lambda x: x.hour)

spline = sens.groupby(['field_egg_id', 'hour_of_day'])['pm25'].mean().reset_index()
spline_avg = sens.groupby(['hour_of_day'])['pm25'].mean().reset_index()

sensor_ids = np.unique(spline['field_egg_id'])
n_sensors = len(sensor_ids)

# Evaluation grid in hours: 0.00, 0.01, ..., 23.99 (loop-invariant).
ix = [k/100.0 for k in range(2400)]

fields = []
times = []
pm25 = []
for i in sensor_ids:
    s_i = spline[spline['field_egg_id']==i]
    x = s_i['hour_of_day'].values
    y = s_i['pm25'].values
    # The fixed 8-hour slices below need one mean for each of the 24 hours;
    # with fewer, the knots would be mis-assigned to the wrong segment.
    if len(x) != 24:
        raise ValueError(
            'sensor {} has hourly means for {} hours; 24 are required'.format(i, len(x)))
    # Three independent splines, one per 8-hour segment of the day.
    c1 = CubicSpline(x[:8],y[:8])
    c2 = CubicSpline(x[8:16],y[8:16])
    c3 = CubicSpline(x[16:24],y[16:24])
    iy = list(np.concatenate((c1(ix[:800]),c2(ix[800:1600]),c3(ix[1600:2400]))))
    fields += [i]*2400
    times += ix
    pm25 += iy

spline_df = pd.DataFrame({'field_egg_id': fields, 'time': times, 'pm25': pm25})

# Keep only the grid points that fall on whole hours.
hours_in_day = np.arange(24).astype(float)
spline_df = spline_df[spline_df['time'].isin(hours_in_day)]

# (n_sensors, 24) -> (24, n_sensors): rows are hours, columns are monitors.
spline_mat = np.transpose(spline_df['pm25'].to_numpy().reshape((n_sensors,24))).astype(float)

# NOTE(review): labelling the columns with df.columns assumes they are in the
# same order as the sorted ids from np.unique above -- confirm.
spline_df = pd.DataFrame(spline_mat,columns=df.columns)
# 'Pusa_IMD' is excluded from both the profile and the data frame.
spline_df = spline_df.drop(['Pusa_IMD'], axis=1)
df = df.drop(['Pusa_IMD'], axis=1)
# Across-monitor mean profile, indexed by hour 0..23.
spline_df = spline_df.mean(axis=1)

df_full = deepcopy(df)
# Look up each row's hour in the profile and subtract it from the whole row
# in one vectorised operation.
hourly_profile = spline_df.to_numpy()[np.asarray(df_full.index.hour)]
df = df_full.sub(hourly_profile, axis=0)
df_spline = df_full-df

In [26]:
# Augmented Dickey-Fuller test per monitor on the frame with the hour-of-day
# profile removed. Missing observations are dropped before testing; only the
# p-value (second element of the adfuller result) is printed.
for sensor_id, series in df.items():
    observed = series.dropna().values
    p_value = adfuller(observed)[1]
    print('sensor:{}, p-value: {}'.format(sensor_id, p_value))

sensor:113E, p-value: 0.0001589011860672702
sensor:1FD7, p-value: 0.0027695135656779016
sensor:20CA, p-value: 0.0026037429050420552
sensor:2E9C, p-value: 0.03090348376706523
sensor:3ACF, p-value: 0.771502861291134
sensor:498F, p-value: 1.4424604099966026e-07
sensor:4BE7, p-value: 1.0665251150163475e-05
sensor:56C3, p-value: 0.0002810986447358
sensor:5D7A, p-value: 8.716836256524607e-05
sensor:603A, p-value: 0.0007984998906120299
sensor:72CA, p-value: 8.690379720135994e-05
sensor:8E2A, p-value: 0.00023148943496936077
sensor:91B8, p-value: 0.012784970397089418
sensor:97D7, p-value: 0.0012037750405288915
sensor:A838, p-value: 0.0011216013630001341
sensor:A9BE, p-value: 4.854694027305082e-06
sensor:AnandVihar_DPCC, p-value: 5.204998673816713e-08
sensor:AshokVihar_DPCC, p-value: 2.806381383351259e-07
sensor:AyaNagar_IMD, p-value: 1.2917306651169454e-09
sensor:BB4A, p-value: 0.19238654709200592
sensor:BC46, p-value: 0.0002808466744330511
sensor:BFDC, p-value: 7.26662596570687e-08
sensor:Bura

In [34]:
# Collapse the pre-detrending frame to one row per calendar date by averaging
# all rows that share the same date.
calendar_dates = df_full.index.date
df_daily_avg = df_full.groupby(calendar_dates).mean()

In [35]:
# Augmented Dickey-Fuller test per monitor on the daily averages.
# Column names are taken from `df`, while the values come from `df_daily_avg`.
# Missing days are dropped before testing; only the p-value (second element of
# the adfuller result) is printed.
for sensor_id in df.columns:
    daily_values = df_daily_avg[sensor_id].dropna().values
    p_value = adfuller(daily_values)[1]
    print('sensor:{}, p-value: {}'.format(sensor_id, p_value))

sensor:113E, p-value: 0.5138421266121035
sensor:1FD7, p-value: 0.02006764872868221
sensor:20CA, p-value: 0.8531359169131987
sensor:2E9C, p-value: 0.43884178662890416
sensor:3ACF, p-value: 0.8903174993233485
sensor:498F, p-value: 0.41055274298932287
sensor:4BE7, p-value: 0.37701746625975874
sensor:56C3, p-value: 0.36174658041265007
sensor:5D7A, p-value: 0.07435644219395415
sensor:603A, p-value: 0.20314198264795957
sensor:72CA, p-value: 0.43989521156715583
sensor:8E2A, p-value: 0.34081530334536386
sensor:91B8, p-value: 0.48552414801888844
sensor:97D7, p-value: 0.7246442247301207
sensor:A838, p-value: 0.1587262316191556
sensor:A9BE, p-value: 0.17086062575429312
sensor:AnandVihar_DPCC, p-value: 0.025238609966519585
sensor:AshokVihar_DPCC, p-value: 0.0245868068467886
sensor:AyaNagar_IMD, p-value: 0.03401431569805104
sensor:BB4A, p-value: 0.877280708644475
sensor:BC46, p-value: 0.22729233075990407
sensor:BFDC, p-value: 0.13465214566080214
sensor:BurariCrossing_IMD, p-value: 0.025673640944879