In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('..')
from scripts.utils import cross_validation, preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Device identifiers. A later cell labels per-site results with ids[i], i.e. purely by position.
# NOTE(review): 13 ids are listed here; confirm the positional mapping to sites is the intended one.
ids = [
    'aq_54',
    'aq_76',
    'aq_94',
    'aq_g507',
    'aq_g508',
    'aq_g509',
    'aq_g512',
    'aq_91',
    'ANQ16PZJ',
    'ALS2LCWY',
    'AB6051M4',
    'AW66FF7V',
    'A743BPWK',
]

In [3]:
# Read only the four columns used downstream; 'time' is parsed as a datetime column.
filename = '../data/final_data.csv'
data = pd.read_csv(
    filename,
    usecols=['time', 'latitude', 'longitude', 'pm2_5'],
    parse_dates=['time'],
)
data.head()

Unnamed: 0,time,latitude,longitude,pm2_5
0,2021-06-22 12:50:09+00:00,0.3564,32.573,16.62
1,2021-06-22 12:44:38+00:00,0.3564,32.573,23.1
2,2021-06-22 12:40:36+00:00,0.3564,32.573,18.48
3,2021-06-22 12:35:09+00:00,0.3564,32.573,15.77
4,2021-06-22 12:30:57+00:00,0.3564,32.573,29.62


In [4]:
def f(time):
    """Convert a time expressed in hours since the Unix epoch to a timestamp.

    The result is a timezone-naive ``pd.Timestamp`` expressed in UTC, so the
    conversion does not depend on the timezone of the machine running the
    notebook (``Timestamp.fromtimestamp`` without ``tz`` would use local time).

    Parameters
    ----------
    time : int or float
        Number of hours since 1970-01-01 00:00:00 UTC.

    Returns
    -------
    pd.Timestamp
        The corresponding naive UTC timestamp.
    """
    return pd.Timestamp(time * 3600, unit='s')


# Element-wise version of `f`; otypes is fixed so that empty inputs are accepted
# and the result is always an object array of pd.Timestamp values.
f2 = np.vectorize(f, otypes=[object])

In [5]:
# Distinct latitude and longitude values found in `data`.
# Each array is deduplicated independently of the other.
latitudes, longitudes = (
    data[column].unique() for column in ('latitude', 'longitude')
)

In [6]:
# Time window passed to `preprocessing` for every site.
start = pd.Timestamp('2021-06-15 00:00:00', tz='UTC')
end = pd.Timestamp('2021-06-21 23:59:59', tz='UTC')

# Process each site separately. Sites are identified by the (latitude, longitude)
# pairs that actually occur together in the rows of `data`, rather than by pairing
# two independently deduplicated arrays by position, which breaks as soon as two
# sites share a latitude or a longitude. sort=False keeps first-appearance order.
processed_frames = [
    preprocessing(site_df, latitude, longitude, start, end)
    for (latitude, longitude), site_df in data.groupby(['latitude', 'longitude'], sort=False)
]

# Concatenate once (instead of growing a frame inside the loop) with a fresh
# 0..n-1 index; fall back to an empty frame when there is nothing to combine.
final_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5,day,day_of_week,hour
0,451032.0,0.3564,32.573,37.13125,15.0,1.0,0.0
1,451033.0,0.3564,32.573,57.322727,15.0,1.0,1.0
2,451034.0,0.3564,32.573,51.066364,15.0,1.0,2.0
3,451035.0,0.3564,32.573,77.211111,15.0,1.0,3.0
4,451036.0,0.3564,32.573,54.32,15.0,1.0,4.0


In [7]:
# Feature matrix and target. The column order matters: later cells convert X to
# an array and read longitude and latitude by position (columns 0 and 1).
X = final_df.loc[:, ['longitude', 'latitude', 'day', 'day_of_week', 'hour', 'time']]
Y = final_df.loc[:, ['pm2_5']]
X.head()

Unnamed: 0,longitude,latitude,day,day_of_week,hour,time
0,32.573,0.3564,15.0,1.0,0.0,451032.0
1,32.573,0.3564,15.0,1.0,1.0,451033.0
2,32.573,0.3564,15.0,1.0,2.0,451034.0
3,32.573,0.3564,15.0,1.0,3.0,451035.0
4,32.573,0.3564,15.0,1.0,4.0,451036.0


In [8]:
# Summary statistics for each feature column (X is still a DataFrame at this point).
X.describe()

Unnamed: 0,longitude,latitude,day,day_of_week,hour,time
count,1743.0,1743.0,1743.0,1743.0,1743.0,1743.0
mean,32.571065,0.356968,18.041308,2.985083,11.471601,451116.462995
std,0.009634,0.016858,2.016725,2.023628,6.557454,49.001859
min,32.553714,0.331732,15.0,0.0,0.0,451032.0
25%,32.562228,0.349565,16.0,1.0,6.0,451073.0
50%,32.573,0.3564,18.0,3.0,11.0,451116.0
75%,32.581617,0.359292,20.0,5.0,17.0,451160.0
max,32.5849,0.390741,21.0,6.0,23.0,451199.0


In [9]:
# Switch both features and target to NumPy arrays; later cells index X by position.
X, Y = (np.asarray(frame) for frame in (X, Y))
# Show the first feature row as a quick sanity check.
X[0]

array([3.25730e+01, 3.56400e-01, 1.50000e+01, 1.00000e+00, 0.00000e+00,
       4.51032e+05])

In [10]:
# Number of NaN entries in the feature array X (Y is not checked here).
np.count_nonzero(np.isnan(X))

0

In [11]:
# Distinct sites, in order of first appearance in X. Uniqueness is taken over
# (longitude, latitude) rows rather than over each column separately, so
# longs[i] and lats[i] always belong to the same site and the two lists always
# have the same length, even when two sites share a longitude or a latitude.
first_rows = np.sort(np.unique(X[:, :2], axis=0, return_index=True)[1])
longs = list(X[first_rows, 0])
lats = list(X[first_rows, 1])
len(longs), len(lats)

(12, 12)

In [13]:
# Run cross-validation once per site, collect the RMSE values and save one
# actual-vs-predicted plot per site.
# NOTE(review): sites are labelled with ids[i] purely by position; confirm that
# the order of `ids` matches the order of `longs`/`lats`.
rmse_list = []
for i in range(len(longs)):
    site_id = ids[i]
    fig = None
    try:
        mean, var, Xtest, Ytest, rmse = cross_validation(X, Y, longs[i], lats[i])
        print(f'{site_id} successful')
        rmse_list.append(rmse)

        # Convert the time column once instead of once per plotting call.
        # Presumably Xtest has the same column layout as X (time in column 5).
        timestamps = f2(Xtest[:, 5])
        # Half-width of the band: 1.96 standard deviations around the predicted mean.
        half_width = 1.96 * np.sqrt(var[:, 0])

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.set_title(f'{site_id}, rmse:{rmse}')
        ax.set_xlim(timestamps.min() - timedelta(hours=1), timestamps.max() + timedelta(hours=1))
        ax.set_ylim(0, 250)
        ax.plot(timestamps, Ytest, label='Actual')
        ax.plot(timestamps, mean, label='Predicted')
        ax.fill_between(timestamps,
                        mean[:, 0] - half_width,
                        mean[:, 0] + half_width,
                        color="C0",
                        alpha=0.2)
        ax.legend(loc='best')
        fig.savefig(f'../plots/basic/{site_id}.png')
    except Exception as e:
        # Best effort: report the failure and carry on with the next site.
        print(f'{site_id} failed')
        print(e)
    finally:
        # Close the figure even when plotting or saving failed, so that figures
        # do not accumulate in memory across iterations.
        if fig is not None:
            plt.close(fig)

aq_54 successful
aq_76 successful
aq_94 successful
aq_g507 successful
aq_g508 successful
aq_g509 successful
aq_g512 successful
aq_91 successful
ANQ16PZJ successful
ALS2LCWY successful
AB6051M4 successful
AW66FF7V successful


In [14]:
# Summarise the per-site RMSE values. The guard covers the case where every site
# failed above: np.min/np.max raise on an empty list.
if rmse_list:
    print(f'Mean RMSE: {np.mean(rmse_list)}, Minimum RMSE: {np.min(rmse_list)}, Maximum RMSE: {np.max(rmse_list)}')
else:
    print('No RMSE values were collected: cross-validation did not succeed for any site.')

Mean RMSE: 24.095, Minimum RMSE: 9.24, Maximum RMSE: 66.11


In [15]:
# Per-site RMSE values, in the order the sites were processed successfully.
rmse_list

[23.5,
 14.54,
 23.19,
 11.53,
 9.24,
 53.11,
 22.9,
 66.11,
 15.27,
 15.15,
 14.63,
 19.97]