# Calibration

This notebook demonstrates how we can calibrate the network of sensors using the colocation events.

In [1]:
import pandas as pd
from google.cloud import bigquery
import numpy as np
import pickle
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import download_airnow_web
import pyproj
%matplotlib inline
from glob import glob
import google.auth
from google.cloud import bigquery_storage
from pathlib import Path
import os

Plots all the channel pairs, to let us see where our encounter data lies, and how it looks

In [2]:
#loading cached data
encounters = pickle.load(open('encounters.p','rb'))

In [3]:
encounters.head()

Unnamed: 0,created_at,channel_id_sensorA,pm2_5_sensorA,s2_pm2_5_sensorA,latitude_sensorA,longitude_sensorA,temperature_sensorA,humidity_sensorA,x_sensorA,y_sensorA,...,s2_pm2_5_sensorB,latitude_sensorB,longitude_sensorB,temperature_sensorB,humidity_sensorB,x_sensorB,y_sensorB,created_at_2_sensorB,dist,timedelta
20171,2020-08-06 12:51:18+00:00,930434,47.15,43.37,0.360178,32.610779,37.0,0.0,3630215.0,40095.095632,...,44.33,0.360328,32.610836,39.0,0.0,3630222.0,40111.793886,2020-08-06 12:51:20+00:00,17.863185,0 days 00:00:02
20174,2020-08-06 12:55:33+00:00,930434,48.13,43.22,0.360345,32.610737,37.0,0.0,3630211.0,40113.686354,...,44.67,0.360342,32.610817,39.0,0.0,3630220.0,40113.352389,2020-08-06 12:55:38+00:00,8.911819,0 days 00:00:05
20178,2020-08-06 13:01:08+00:00,930434,48.13,44.78,0.360157,32.610783,37.0,0.0,3630216.0,40092.757876,...,44.33,0.360325,32.610817,38.0,0.0,3630220.0,40111.459921,2020-08-06 13:00:56+00:00,19.081185,0 days 00:00:12
20181,2020-08-06 13:05:23+00:00,930434,48.05,44.63,0.360186,32.610783,37.0,0.0,3630216.0,40095.986205,...,44.67,0.360348,32.610825,38.0,0.0,3630220.0,40114.020319,2020-08-06 13:05:14+00:00,18.63032,0 days 00:00:09
25578,2020-08-12 08:52:28+00:00,930434,39.97,37.83,0.360162,32.610806,36.0,0.0,3630218.0,40093.314485,...,37.0,0.360334,32.610825,39.0,0.0,3630220.0,40112.461816,2020-08-12 08:52:18+00:00,19.263795,0 days 00:00:10


In [4]:
#Get unique ids
unq = np.unique(np.r_[encounters['channel_id_sensorA'].unique(),encounters['channel_id_sensorB'].unique()])
unq

array([-2.45170e+04, -2.45160e+04, -1.00000e+00,  6.89761e+05,
        7.18030e+05,  7.30014e+05,  7.37273e+05,  7.37276e+05,
        7.55612e+05,  7.55614e+05,  7.82718e+05,  7.82719e+05,
        8.32251e+05,  8.32253e+05,  8.32254e+05,  8.32255e+05,
        8.70139e+05,  8.70143e+05,  8.70144e+05,  8.70145e+05,
        8.70147e+05,  9.12219e+05,  9.12220e+05,  9.12222e+05,
        9.30426e+05,  9.30427e+05,  9.30428e+05,  9.30429e+05,
        9.30430e+05,  9.30431e+05,  9.30432e+05,  9.30434e+05,
        9.30435e+05,  9.67600e+05,  9.67601e+05])

In [5]:
#define variables for the simple calibration fuction
t = (encounters['created_at']-pd.Timestamp('2020-07-15',tz='UTC')).dt.total_seconds()/3600 #hours since 15th July put to 1970 and substract some dates
idA = [np.where(a==unq)[0][0] for a in encounters['channel_id_sensorA']]
idB = [np.where(a==unq)[0][0] for a in encounters['channel_id_sensorB']]
sA = np.nanmean(encounters[['pm2_5_sensorA','s2_pm2_5_sensorA']],1)
sB = np.nanmean(encounters[['pm2_5_sensorB','s2_pm2_5_sensorB']],1)
X = np.c_[t,idA,idB]
Y = np.c_[sA,sB]
t

20171     540.855000
20174     540.925833
20178     541.018889
20181     541.089722
25578     680.874444
            ...     
2592     2842.000000
2593     2843.000000
2594     2844.000000
2596     2846.000000
2597     2847.000000
Name: created_at, Length: 184608, dtype: float64

Plot all the colocation events together to get an idea of how similar pairs will be & if there's structure in the data.

In [6]:
#Specify the reference sensor/ reference node for the graph
refsensor = np.zeros(len(unq))
refsensor[2]=1
refsensor

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [7]:
delta = 24*7

In [8]:
import networkx as nx
from scipy.optimize import curve_fit

In [9]:
def f(x,a,b):
    return x*a+b

This makes it so the smaller id is always first in the pair

In [10]:
def compute_simple_calibration(X,Y,delta,refsensor):
    G = nx.DiGraph()
    maxnum = int(np.max(X[:,1:]))
    for it,starttime in enumerate(np.arange(0,np.max(X[:,0]),delta)):
        keep = (X[:,0]>starttime) & (X[:,0]<starttime+delta)
        Xkeep = X[keep,:]
        Ykeep = Y[keep,:]
        for i in range(maxnum+1):
            for j in range(maxnum+1):
                keep = (Xkeep[:,1]==i) & (Xkeep[:,2]==j)
                if len(Ykeep[keep,0])>3: #need a few data points for confidence?
                    logratio=np.nanmean(np.log(Ykeep[keep,0]/Ykeep[keep,1]))
                    popt, pcov = curve_fit(f,Ykeep[keep,1],Ykeep[keep,0])
                    G.add_edge((i,it),(j,it),val=logratio,popt=popt,pcov=pcov,weight=2)
                    popt, pcov = curve_fit(f,Ykeep[keep,0],Ykeep[keep,1])
                    G.add_edge((j,it),(i,it),val=-logratio,popt=popt,pcov=pcov,weight=2)
    maxit = it
    for it,starttime in enumerate(np.arange(0,np.max(X[:,0]),delta)):
        if it>0:
            for i in range(maxnum+1):
                #if np.all(np.isnan(data[i,:])): continue
                if np.any([(i,j) in G.nodes for j in range(maxit)]):
                    popt = np.array([0,0])
                    pcov = np.eye(2)
                    G.add_edge((i,it-1),(i,it),val=0,popt=popt,pcov=pcov,weight=1)
                    G.add_edge((i,it),(i,it-1),val=0,popt=popt,pcov=pcov,weight=1)
                    
    allsp = {}
    for ref in np.where(refsensor)[0]:
        for timeidx in range(maxit+1):

            #sp = nx.shortest_paths.single_target_shortest_path(G,(ref,timeidx))
            sp = nx.shortest_paths.single_source_dijkstra_path(G,(ref,timeidx))
            for s in sp:
                if s in allsp:
                    if len(sp[s])<len(allsp[s]):
                        allsp[s]=sp[s]
                else:
                    allsp[s]=sp[s]
    allcals = {}
    allcallists = {}
    allpopts = {}
    allpcovs = {}
    allpoptslists = {}
    for s in allsp:
        allcallists[s] = [G.get_edge_data(u,v)['val'] for u,v in zip(allsp[s][:-1],allsp[s][1:])]
        allcals[s] = np.sum([G.get_edge_data(u,v)['val'] for u,v in zip(allsp[s][:-1],allsp[s][1:])])
        allpoptslists[s] = [G.get_edge_data(u,v)['popt'] for u,v in zip(allsp[s][:-1],allsp[s][1:])]
        allpopts[s] = np.sum(np.log([G.get_edge_data(u,v)['popt'] for u,v in zip(allsp[s][:-1],allsp[s][1:])]),0)
        allpcovs[s] = np.sum([G.get_edge_data(u,v)['pcov'] for u,v in zip(allsp[s][:-1],allsp[s][1:])],0)

        #allpopt
    return G,allsp,allcals,allcallists,allpopts,allpcovs,allpoptslists


In [11]:
G,allsp,allcals,allcallists,allpopts,allpcovs,allpoptslists = compute_simple_calibration(X,Y,delta,refsensor)
allcals

  if sys.path[0] == '':


{(2, 0): 0.0,
 (34, 0): 0.16668446783835422,
 (32, 0): 0.811980840252029,
 (2, 1): 0.0,
 (34, 1): 0.046898927532782644,
 (28, 1): 0.34173917570076223,
 (32, 1): 0.3737037094167434,
 (2, 2): 0.0,
 (5, 0): 0.8257288045010196,
 (8, 0): 0.8462013484475964,
 (11, 0): 1.1275232085083422,
 (34, 2): 0.17929865734912262,
 (32, 2): 0.6498479812983541,
 (2, 3): 0.0,
 (28, 0): 0.34173917570076223,
 (28, 2): 0.34173917570076223,
 (5, 1): 0.26009562328565006,
 (8, 1): 0.2802543298630875,
 (11, 1): 0.36291661279063964,
 (25, 1): 0.10258864442020105,
 (34, 3): 0.14704870898798933,
 (28, 3): 0.3361310237610682,
 (32, 3): 0.29631459540404487,
 (2, 4): 0.0,
 (5, 2): 0.5864383090159395,
 (8, 2): 0.5636029626812122,
 (11, 2): 0.6768755603879492,
 (25, 2): 0.5224411953944077,
 (1, 2): 0.9747774441977255,
 (33, 2): 0.5930203430413592,
 (3, 2): 0.4704189694177303,
 (34, 4): 0.1737403564520103,
 (28, 4): 0.4961830387243843,
 (32, 4): 0.3774794064081132,
 (2, 5): 0.0,
 (25, 0): 0.10258864442020105,
 (3, 3): 0.4

In [12]:
# def plot_simple_calibration_graph(G):
#     plt.figure(figsize=[15,15])
#     nx.draw_networkx(G,pos=nx.spring_layout(G))#,node_color=cols)#draw_networkx_edge_labels(G,pos=nx.spring_layout(G))

# plot_simple_calibration_graph(G)

In [18]:
#testX = np.c_[15.32855000e+02, 3.10000000e+01,100.66]
testX = np.c_[X[:,0], X[:,1], Y[:,0]]
idx = (testX[:,0]/delta).astype(int)
idx

array([ 3,  3,  3, ..., 16, 16, 16])

In [19]:
def compute_simple_predictions(testX,allcals,delta,allpcovs): # changed
    idx = (testX[:,0]/delta).astype(int)
    scale = []
    preds = []
    res = []
    key = []
    for i,(timeidx,sensorid0,test0) in enumerate(zip(idx,testX[:,1],testX[:,2])):
        try:
            scaling = np.exp(allcals[(sensorid0,timeidx)])
            scale.append(scaling)
            preds.append(scaling*test0)  
            res.append([test0])
            key.append([(sensorid0,timeidx)])
        except KeyError as ke:
            print('Key Not Found:', ke)
    return res,scale,preds,key

In [20]:
res,scale,preds,key = compute_simple_predictions(testX,allcals,delta,allpcovs)
df_results = pd.DataFrame(list(zip(key,res,scale,preds)), columns = ['key','raw','ratio','calibrated'])
df_results

Key Not Found: (9.0, 8)
Key Not Found: (9.0, 17)
Key Not Found: (16.0, 10)
Key Not Found: (20.0, 2)
Key Not Found: (20.0, 5)
Key Not Found: (22.0, 14)
Key Not Found: (22.0, 14)
Key Not Found: (22.0, 14)


Unnamed: 0,key,raw,ratio,calibrated
0,"[(31.0, 3)]",[45.26],0.966257,43.732801
1,"[(31.0, 3)]",[45.675],0.966257,44.133798
2,"[(31.0, 3)]",[46.455],0.966257,44.887479
3,"[(31.0, 3)]",[46.34],0.966257,44.776359
4,"[(31.0, 4)]",[38.9],1.349527,52.496582
...,...,...,...,...
184595,"[(2.0, 16)]",[37.0],1.000000,37.000000
184596,"[(2.0, 16)]",[26.0],1.000000,26.000000
184597,"[(2.0, 16)]",[-999.0],1.000000,-999.000000
184598,"[(2.0, 16)]",[-999.0],1.000000,-999.000000
