In [6]:
import numpy as np
import xarray as xr
import rasterio
%matplotlib inline
from matplotlib.pyplot import *
from glob import glob
import os
import datetime

import pandas as pd

from rasterio import features

from rasterio_to_xarray import rasterio_to_xarray, xarray_to_rasterio

import rasterstats
import fiona

from tqdm import tqdm

from shapely.geometry import shape
from rasterstats.io import read_features

In [7]:
data = xr.open_mfdataset(r'C:\MAIACData\nc_monthly_daily\*.nc')['data']

In [12]:
# Subsetting params: integer pixel-index bounds used to slice the grid with isel.
# A bound of None leaves that side of the slice open.
x_start = 950
x_end = None

y_start = 950
y_stop = None

In [13]:
# Crop the grid to the study window. Each axis is sliced with its own bounds;
# the x slice previously used y_stop as its upper bound, which only gave the
# right answer because both upper bounds are None.
subset = data.isel(x=slice(x_start, x_end), y=slice(y_start, y_stop))

In [14]:
# Reorder the time axis using the argsort of the time coordinate (ascending time order)
subset = subset.isel(time=np.argsort(subset.time))

In [11]:
After2009 = subset.sel(time=slice('2009', '2016'))

In [15]:
data = subset

## Monthly

In [7]:
# Average the post-2009 series into monthly ('M') means along time, keeping the attrs (affine, crs)
# NOTE(review): the dim=/how= resample signature looks like the legacy xarray form - confirm it runs on the installed version
monthly_data = After2009.resample('M', dim='time', how='mean', keep_attrs=True)

In [8]:
monthly_data

<xarray.DataArray 'data' (time: 66, y: 212, x: 290)>
dask.array<transpo..., shape=(66, 212, 290), dtype=float32, chunksize=(1, 212, 290)>
Coordinates:
  * x        (x) float64 2.461e+05 2.473e+05 2.486e+05 2.498e+05 2.511e+05 ...
  * y        (y) float64 2.356e+05 2.343e+05 2.33e+05 2.318e+05 2.305e+05 ...
  * time     (time) datetime64[ns] 2009-01-31 2009-02-28 2009-03-31 ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000e+00  -1.25654304e+03]
    crs: +init=epsg:27700

In [9]:
data = monthly_data

The next few cells convert the affine transform of the full raster into one whose origin matches the subsetted array.

In [16]:
# Get the actual Affine object from the data stored in the attrs
orig_aff = rasterio.Affine.from_gdal(*data.attrs['affine'])

In [17]:
def window_bounds(window, affine):
    """Return the map-coordinate bounds of a pixel window.

    Parameters
    ----------
    window : tuple
        ``((row_start, row_stop), (col_start, col_stop))`` in pixel indices.
    affine : Affine-like
        Transform that maps ``(col, row)`` to ``(x, y)`` through
        ``affine * (col, row)``.

    Returns
    -------
    tuple
        ``(w, s, e, n)``: the transformed ``(col_start, row_stop)`` corner
        followed by the transformed ``(col_stop, row_start)`` corner.
    """
    (row_start, row_stop), (col_start, col_stop) = window
    # Keep the transform on the left: ``tuple * Affine`` relies on
    # Affine.__rmul__, which the affine package has deprecated.
    w, s = affine * (col_start, row_stop)
    e, n = affine * (col_stop, row_start)
    return w, s, e, n

In [18]:
# window_bounds expects ((row_start, row_stop), (col_start, col_stop)). Rows run
# along y and columns along x, so the y offset must come first (the two were
# swapped before, which went unnoticed because both offsets are 950).
# Only the west and north values are kept, so the stop value (5000) has no effect.
c, _, _, f = window_bounds(((y_start, 5000), (x_start, 5000)), orig_aff)  # c ~ west, f ~ north
# Keep the remaining coefficients of the original transform and swap in the subset's origin
a, b, _, d, e, _, _, _, _ = tuple(orig_aff)
new_aff = rasterio.Affine(a, b, c, d, e, f)

In [19]:
orig_aff

Affine(1256.5430440955893, 0.0, -947639.63051064778,
       0.0, -1256.5430440955893, 1429277.8120091767)

In [20]:
new_aff

Affine(1256.5430440955893, 0.0, 246076.26138016209,
       0.0, -1256.5430440955893, 235561.92011836683)

In [24]:
# Shape of a single time slice; polygons are rasterized onto a grid of this shape
out_shape = data.isel(time=0).shape

# Image to rasterize the polygons in to
# (builtin int replaces the np.int alias, which recent NumPy releases no longer provide)
rasterized_image = np.zeros(out_shape, dtype=int)

# List to store dataframes in
dfs = []

feats = read_features(r'D:\Annies_Dissertation\Data\Boundaries\Bmth_and_Soton.shp')

Loading the data into memory with this single line is the key thing that makes the code faster!

In [25]:
data = data.load()

In [26]:
data

<xarray.DataArray 'data' (time: 5191, y: 212, x: 290)>
array([[[         nan,          nan,          nan, ...,          nan,
                  nan,          nan],
        [         nan,          nan,          nan, ...,          nan,
                  nan,          nan],
        [         nan,          nan,          nan, ...,          nan,
                  nan,          nan],
        ..., 
        [         nan,          nan,          nan, ...,          nan,
                  nan,          nan],
        [         nan,          nan,          nan, ...,          nan,
                  nan,          nan],
        [         nan,          nan,          nan, ...,          nan,
                  nan,          nan]],

       [[ 36.06666183,  37.74933243,  33.51363754, ...,          nan,
                  nan,          nan],
        [ 41.46812057,  41.55789566,          nan, ...,          nan,
                  nan,          nan],
        [         nan,          nan,          nan, ...,          

In [27]:
# Start from an empty list so this cell never appends to results left over from an earlier run
dfs = []

# Loop over features (polygons) in the shapefile
# (the loop variable is `feat` so that the north offset `f` computed earlier is not overwritten)
for feat in tqdm(feats):
    # Rasterize the polygon into an array: 1 where the polygon touches a pixel, 0 elsewhere
    rasterized_image = features.rasterize([(shape(feat['geometry']), 1)],
                                          out_shape=out_shape,
                                          transform=new_aff,
                                          fill=0,
                                          all_touched=True)

    # Extract from the xarray where the rasterized polygon is
    region = data.where(rasterized_image == 1)

    # Combine x and y into a new dimension called allpoints and calculate the mean over it
    # and then convert to a dataframe with an appropriate name
    res = region.stack(allpoints=['x','y']).mean(dim='allpoints').to_dataframe(name=feat['properties']['CTYUA15CD'])

    # Append to the list of data frames so we can concatenate them all at the end
    dfs.append(res)

# One column per polygon, indexed by time
stats = pd.concat(dfs, axis=1)

2it [00:03,  1.72s/it]


In [28]:
stats

Unnamed: 0_level_0,E06000028,E06000045
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-02-25,17.279724,14.118634
2000-02-26,,20.691448
2000-02-27,,
2000-02-28,20.444691,20.323462
2000-02-29,,
2000-03-01,20.349199,
2000-03-02,,
2000-03-03,,17.397324
2000-03-04,18.707706,16.998253
2000-03-05,24.391594,


In [29]:
stats = stats.dropna(how='all')

In [30]:
melted_stats = pd.melt(stats.reset_index(), id_vars='time', var_name='City').dropna()

In [32]:
melted_stats

Unnamed: 0,time,City,value
0,2000-02-25,E06000028,17.279724
2,2000-02-28,E06000028,20.444691
3,2000-03-01,E06000028,20.349199
5,2000-03-04,E06000028,18.707706
6,2000-03-05,E06000028,24.391594
8,2000-03-10,E06000028,46.667805
9,2000-03-12,E06000028,15.574203
10,2000-03-15,E06000028,27.811934
12,2000-03-19,E06000028,16.731461
14,2000-03-22,E06000028,43.010914


In [33]:
melted_stats['month'] = melted_stats.time.dt.month

In [34]:
melted_stats['year'] = melted_stats.time.dt.year

In [35]:
melted_stats['day'] = melted_stats.time.dt.day

In [36]:
melted_stats.head()

Unnamed: 0,time,City,value,month,year,day
0,2000-02-25,E06000028,17.279724,2,2000,25
2,2000-02-28,E06000028,20.444691,2,2000,28
3,2000-03-01,E06000028,20.349199,3,2000,1
5,2000-03-04,E06000028,18.707706,3,2000,4
6,2000-03-05,E06000028,24.391594,3,2000,5


In [37]:
melted_stats.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\Lamb\AP_Cities.csv')

## Validating with AP for an LSOA (E01017182) that is located within a pixel

In [None]:
PM25 = xr.open_mfdataset(r'C:\MAIACData\nc_monthly_daily\*PM25.nc')['data']

In [None]:
PM25

In [None]:
# Rebuild the Affine transform from the GDAL-ordered coefficients stored in the attrs
a = PM25.attrs['affine']
a = rasterio.Affine.from_gdal(*a)
# Apply the inverted transform (~a) to the point (439040, 115775) and display the result
# NOTE(review): presumably this is how the x/y pixel indices used for the LSOA further down were found - confirm
~a * (439040, 115775)

In [None]:
PM25 = PM25.isel(time=np.argsort(PM25.time))

In [None]:
After2009 = PM25.sel(time=slice('2009', '2016'))

In [None]:
monthly_data = After2009.resample('M', dim='time', how='mean', keep_attrs=True)

In [None]:
# Pull the monthly series for the single pixel at index x=1103, y=1045 into memory
# NOTE(review): presumably the pixel containing LSOA E01017182 - confirm against the inverse-affine lookup
ts = monthly_data.isel(x=1103, y=1045).load()

In [None]:
ts

In [None]:
result = ts.to_dataframe()

In [None]:
result.dropna()

In [None]:
result.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\Validation\Monthly_PM25_LSOA_Validation.csv')

### Joining data for pixel and LSOA to validate Rasterstats method

In [1]:
from dateutil.parser import parse

In [34]:
Pixel = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Regression\Validation\Monthly_PM25_LSOA_Validation.csv', parse_dates=['time'])

In [66]:
Pixel[:10]

Unnamed: 0_level_0,data,LSOA
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-31,8.010561,E01017182
2009-02-28,7.79007,E01017182
2009-03-31,13.203969,E01017182
2009-04-30,14.404096,E01017182
2009-05-31,13.63233,E01017182
2009-06-30,11.638875,E01017182
2009-07-31,13.614918,E01017182
2009-08-31,12.393079,E01017182
2009-09-30,13.358428,E01017182
2009-10-31,12.025865,E01017182


In [36]:
Pixel = Pixel.set_index('time')

In [37]:
Pixel = Pixel[['data']].dropna()

In [38]:
Pixel['LSOA'] = 'E01017182'

In [58]:
Area = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Regression\Monthly_PM25_LSOA.csv', parse_dates=['time'])

In [62]:
Area[:10]

Unnamed: 0_level_0,LSOA,value
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-31,E01014869,4.894197
2009-02-28,E01014869,8.269163
2009-03-31,E01014869,16.443802
2009-04-30,E01014869,12.831035
2009-05-31,E01014869,9.9837
2009-06-30,E01014869,14.679679
2009-07-31,E01014869,11.896524
2009-08-31,E01014869,17.366266
2009-09-30,E01014869,11.546007
2009-10-31,E01014869,8.697833


In [60]:
Area = Area.set_index('time')

In [61]:
Area = Area[['LSOA', 'value']].dropna()

In [64]:
E01 = Area.loc[Area['LSOA'] == 'E01017182']

In [65]:
E01

Unnamed: 0_level_0,LSOA,value
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-31,E01017182,8.010561
2009-02-28,E01017182,7.790070
2009-03-31,E01017182,13.203969
2009-04-30,E01017182,14.404096
2009-05-31,E01017182,13.632330
2009-06-30,E01017182,11.638875
2009-07-31,E01017182,13.614918
2009-08-31,E01017182,12.393079
2009-09-30,E01017182,13.358428
2009-10-31,E01017182,12.025865


In [67]:
result = pd.merge(Pixel, E01, left_index=True, right_index=True)

In [68]:
result.dropna()

Unnamed: 0_level_0,data,LSOA_x,LSOA_y,value
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-31,8.010561,E01017182,E01017182,8.010561
2009-02-28,7.790070,E01017182,E01017182,7.790070
2009-03-31,13.203969,E01017182,E01017182,13.203969
2009-04-30,14.404096,E01017182,E01017182,14.404096
2009-05-31,13.632330,E01017182,E01017182,13.632330
2009-06-30,11.638875,E01017182,E01017182,11.638875
2009-07-31,13.614918,E01017182,E01017182,13.614918
2009-08-31,12.393079,E01017182,E01017182,12.393079
2009-09-30,13.358428,E01017182,E01017182,13.358428
2009-10-31,12.025865,E01017182,E01017182,12.025865


In [69]:
result['Difference'] = result['data']- result['value']

In [70]:
result[:10]

Unnamed: 0_level_0,data,LSOA_x,LSOA_y,value,Difference
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-01-31,8.010561,E01017182,E01017182,8.010561,0.0
2009-02-28,7.79007,E01017182,E01017182,7.79007,0.0
2009-03-31,13.203969,E01017182,E01017182,13.203969,0.0
2009-04-30,14.404096,E01017182,E01017182,14.404096,0.0
2009-05-31,13.63233,E01017182,E01017182,13.63233,0.0
2009-06-30,11.638875,E01017182,E01017182,11.638875,0.0
2009-07-31,13.614918,E01017182,E01017182,13.614918,0.0
2009-08-31,12.393079,E01017182,E01017182,12.393079,0.0
2009-09-30,13.358428,E01017182,E01017182,13.358428,0.0
2009-10-31,12.025865,E01017182,E01017182,12.025865,0.0


In [71]:
result.Difference.value_counts()

0.0    66
Name: Difference, dtype: int64

In [72]:
result.groupby('Difference').count()

Unnamed: 0_level_0,data,LSOA_x,LSOA_y,value
Difference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,66,66,66,66


## Yearly

In [11]:
# Average the post-2009 series into annual ('A') means along time, keeping the attrs (affine, crs)
# NOTE(review): the dim=/how= resample signature looks like the legacy xarray form - confirm it runs on the installed version
yearly_data = After2009.resample('A', dim='time', how='mean', keep_attrs=True)

In [12]:
yearly_data

<xarray.DataArray 'data' (time: 6, y: 212, x: 290)>
dask.array<transpo..., shape=(6, 212, 290), dtype=float32, chunksize=(1, 212, 290)>
Coordinates:
  * x        (x) float64 2.461e+05 2.473e+05 2.486e+05 2.498e+05 2.511e+05 ...
  * y        (y) float64 2.356e+05 2.343e+05 2.33e+05 2.318e+05 2.305e+05 ...
  * time     (time) datetime64[ns] 2009-12-31 2010-12-31 2011-12-31 ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000e+00  -1.25654304e+03]
    crs: +init=epsg:27700

In [13]:
data = yearly_data

In [14]:
# Get the actual Affine object from the data stored in the attrs
orig_aff = rasterio.Affine.from_gdal(*data.attrs['affine'])

In [15]:
def window_bounds(window, affine):
    """Return the map-coordinate bounds of a pixel window.

    Parameters
    ----------
    window : tuple
        ``((row_start, row_stop), (col_start, col_stop))`` in pixel indices.
    affine : Affine-like
        Transform that maps ``(col, row)`` to ``(x, y)`` through
        ``affine * (col, row)``.

    Returns
    -------
    tuple
        ``(w, s, e, n)``: the transformed ``(col_start, row_stop)`` corner
        followed by the transformed ``(col_stop, row_start)`` corner.
    """
    (row_start, row_stop), (col_start, col_stop) = window
    # Keep the transform on the left: ``tuple * Affine`` relies on
    # Affine.__rmul__, which the affine package has deprecated.
    w, s = affine * (col_start, row_stop)
    e, n = affine * (col_stop, row_start)
    return w, s, e, n

In [16]:
# window_bounds expects ((row_start, row_stop), (col_start, col_stop)). Rows run
# along y and columns along x, so the y offset must come first (the two were
# swapped before, which went unnoticed because both offsets are 950).
# Only the west and north values are kept, so the stop value (5000) has no effect.
c, _, _, f = window_bounds(((y_start, 5000), (x_start, 5000)), orig_aff)  # c ~ west, f ~ north
# Keep the remaining coefficients of the original transform and swap in the subset's origin
a, b, _, d, e, _, _, _, _ = tuple(orig_aff)
new_aff = rasterio.Affine(a, b, c, d, e, f)

In [17]:
orig_aff

Affine(1256.5430440955893, 0.0, -947639.63051064778,
       0.0, -1256.5430440955893, 1429277.8120091767)

In [18]:
new_aff

Affine(1256.5430440955893, 0.0, 246076.26138016209,
       0.0, -1256.5430440955893, 235561.92011836683)

In [19]:
# Shape of a single time slice; polygons are rasterized onto a grid of this shape
out_shape = data.isel(time=0).shape

# Image to rasterize the polygons in to
# (builtin int replaces the np.int alias, which recent NumPy releases no longer provide)
rasterized_image = np.zeros(out_shape, dtype=int)

# List to store dataframes in
dfs = []

feats = read_features(r'D:\Annies_Dissertation\Data\Boundaries\LSOA_Wessex.shp')

In [20]:
data = data.load()

  x = np.divide(x1, x2, out)


In [21]:
# Start from an empty list so this cell never appends to results left over from an earlier run
dfs = []

# Loop over features (polygons) in the shapefile
# (the loop variable is `feat` so that the north offset `f` computed earlier is not overwritten)
for feat in tqdm(feats):
    # Rasterize the polygon into an array: 1 where the polygon touches a pixel, 0 elsewhere
    rasterized_image = features.rasterize([(shape(feat['geometry']), 1)],
                                          out_shape=out_shape,
                                          transform=new_aff,
                                          fill=0,
                                          all_touched=True)

    # Extract from the xarray where the rasterized polygon is
    region = data.where(rasterized_image == 1)

    # Combine x and y into a new dimension called allpoints and calculate the mean over it
    # and then convert to a dataframe with an appropriate name
    res = region.stack(allpoints=['x','y']).mean(dim='allpoints').to_dataframe(name=feat['properties']['LSOA11CD'])

    # Append to the list of data frames so we can concatenate them all at the end
    dfs.append(res)

# One column per polygon, indexed by time
stats = pd.concat(dfs, axis=1)

2578it [00:24, 104.86it/s]


In [22]:
stats

Unnamed: 0_level_0,E01014869,E01014890,E01014891,E01015272,E01015273,E01015274,E01015275,E01015276,E01015277,E01015279,...,E01033241,E01033242,E01033283,E01033285,E01033286,E01033288,E01033380,E01033381,E01033383,E01033384
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-12-31,11.05674,10.958618,10.680703,15.233035,15.614657,15.820494,15.245512,15.811707,15.684235,15.220558,...,15.220558,17.743645,15.220558,17.743645,18.015972,18.061174,13.105443,14.531994,14.531994,11.883699
2010-12-31,9.198227,8.324327,8.408011,12.563782,13.049275,13.424196,12.988467,13.504366,13.160589,12.139097,...,12.139097,14.03282,12.139097,14.03282,14.383579,14.215097,11.614311,12.219908,12.219908,11.361659
2011-12-31,9.872396,10.465442,10.453424,16.608112,16.794128,17.424191,17.384394,17.27528,16.854765,15.831831,...,15.831831,17.672132,15.831831,17.672132,18.272284,18.490141,15.904312,16.433113,16.433113,14.119357
2012-12-31,9.418056,8.765922,8.61519,12.270921,13.020046,13.808305,12.656074,13.587185,13.379829,11.885768,...,11.885768,16.277164,11.885768,16.277164,16.840157,16.948296,12.707359,14.07688,14.07688,11.542398
2013-12-31,9.172091,9.38799,8.88612,12.437767,12.8708,13.216434,13.343239,13.540052,13.102798,11.532296,...,11.532296,16.084095,11.532296,16.084095,16.323599,16.072855,11.4894,11.75861,11.75861,10.227263
2014-12-31,9.128167,8.018132,8.864597,14.177834,14.556643,15.937187,14.690467,15.002365,14.955285,13.6652,...,13.6652,14.386749,13.6652,14.386749,15.996344,16.168161,14.252884,14.098279,14.098279,14.398606


In [23]:
stats = stats.dropna(how='all')

In [24]:
melted_stats = pd.melt(stats.reset_index(), id_vars='time', var_name='LSOA').dropna()

In [25]:
melted_stats

Unnamed: 0,time,LSOA,value
0,2009-12-31,E01014869,11.056740
1,2010-12-31,E01014869,9.198227
2,2011-12-31,E01014869,9.872396
3,2012-12-31,E01014869,9.418056
4,2013-12-31,E01014869,9.172091
5,2014-12-31,E01014869,9.128167
6,2009-12-31,E01014890,10.958618
7,2010-12-31,E01014890,8.324327
8,2011-12-31,E01014890,10.465442
9,2012-12-31,E01014890,8.765922


In [26]:
melted_stats['year'] = melted_stats.time.dt.year

In [27]:
melted_stats.head()

Unnamed: 0,time,LSOA,value,year
0,2009-12-31,E01014869,11.05674,2009
1,2010-12-31,E01014869,9.198227,2010
2,2011-12-31,E01014869,9.872396,2011
3,2012-12-31,E01014869,9.418056,2012
4,2013-12-31,E01014869,9.172091,2013


In [28]:
melted_stats.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\Yearly_PM25_LSOA.csv')