In [1]:
import numpy as np
import xarray as xr
import rasterio
%matplotlib inline
from matplotlib.pyplot import *
from glob import glob
import os
import datetime

import pandas as pd

from rasterio import features

from rasterio_to_xarray import rasterio_to_xarray, xarray_to_rasterio

import rasterstats
import fiona

from tqdm import tqdm

from shapely.geometry import shape
from rasterstats.io import read_features

# ProgressBar has to be imported before it is used below; with the import
# commented out this cell raises NameError on a fresh kernel.
from dask.diagnostics import ProgressBar

# Progress reporting for dask computations
pbar = ProgressBar()
pbar.register()

# Unregistering straight away switches the progress reporting off again;
# skip this call to keep the bar active for the cells below.
pbar.unregister()

In [22]:
# Open every NetCDF file in the folder as one dataset and keep only its
# 'data' variable
maiac_ds = xr.open_mfdataset(r'C:\MAIACData\nc_monthly_daily\*.nc')
data = maiac_ds['data']

In [23]:
# Subsetting params: integer start/stop positions along the x and y dimensions.
# NOTE(review): the stop for x is named x_end while the stop for y is named
# y_stop - keep that in mind when using them as slice bounds.
x_start = 950
x_end = None

y_start = 950
y_stop = None

In [24]:
# Positional spatial subset. The x slice must end at x_end (it previously
# used y_stop, which only worked because both stops are None).
subset = data.isel(x=slice(x_start, x_end), y=slice(y_start, y_stop))

In [25]:
# Put the time steps into ascending order
time_order = np.argsort(subset.time)
subset = subset.isel(time=time_order)

In [26]:
# Keep only the time steps labelled from 2009 through 2016
study_period = slice('2009', '2016')
After2009 = subset.sel(time=study_period)

In [27]:
# Resample along time at monthly ('M') frequency, taking the mean of each
# month and keeping the attrs on the result.
# NOTE(review): the `dim=` / `how=` keyword form of resample belongs to older
# xarray releases - confirm against the installed version before upgrading.
monthly_data = After2009.resample('M', dim='time', how='mean', keep_attrs=True)

In [28]:
# Show the summary (dimensions, coordinates and attrs) of the monthly means
monthly_data

<xarray.DataArray 'data' (time: 66, y: 212, x: 290)>
dask.array<transpo..., shape=(66, 212, 290), dtype=float32, chunksize=(1, 212, 290)>
Coordinates:
  * x        (x) float64 2.461e+05 2.473e+05 2.486e+05 2.498e+05 2.511e+05 ...
  * y        (y) float64 2.356e+05 2.343e+05 2.33e+05 2.318e+05 2.305e+05 ...
  * time     (time) datetime64[ns] 2009-01-31 2009-02-28 2009-03-31 ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000e+00  -1.25654304e+03]
    crs: +init=epsg:27700

In [29]:
# From here on `data` refers to the monthly means, not the dataset variable
# opened at the top of the notebook
data = monthly_data

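The next few cells take the affine transform stored in the data's attributes and adjust its origin so that it describes the spatial subset selected above rather than the full grid.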

In [30]:
# Get the actual Affine object from the data stored in the attrs.
# attrs['affine'] holds six coefficients, unpacked here as the positional
# arguments of Affine.from_gdal.
orig_aff = rasterio.Affine.from_gdal(*data.attrs['affine'])

In [31]:
def window_bounds(window, affine):
    """Return the bounding coordinates of a pixel window.

    Parameters
    ----------
    window : tuple
        ``((row_start, row_stop), (col_start, col_stop))`` in pixel indices.
    affine : Affine
        Transform mapping ``(col, row)`` pixel positions to coordinates.

    Returns
    -------
    tuple
        ``(w, s, e, n)``: the transformed (col_start, row_stop) corner
        followed by the transformed (col_stop, row_start) corner.
    """
    (row_start, row_stop), (col_start, col_stop) = window
    # Multiply with the transform on the left: the reversed form
    # ``(col, row) * affine`` is deprecated in the affine package.
    w, s = affine * (col_start, row_stop)
    e, n = affine * (col_stop, row_start)
    return w, s, e, n

In [32]:
# window_bounds expects ((row_start, row_stop), (col_start, col_stop)).
# Rows run along y and columns along x, so the y offset goes first.
# Only the west (c) and north (f) values are used, so the stop indices (5000)
# are just placeholders.
c, _, _, f = window_bounds(((y_start, 5000), (x_start, 5000)), orig_aff)  # c ~ west, f ~ north

# Keep the pixel size / rotation terms of the original transform and swap in
# the new origin
a, b, _, d, e, _, _, _, _ = tuple(orig_aff)
new_aff = rasterio.Affine(a, b, c, d, e, f)

In [33]:
# Transform read from the attrs, for comparison with new_aff
orig_aff

Affine(1256.5430440955893, 0.0, -947639.63051064778,
       0.0, -1256.5430440955893, 1429277.8120091767)

In [34]:
# Transform with the origin moved to the corner of the subset
new_aff

Affine(1256.5430440955893, 0.0, 246076.26138016209,
       0.0, -1256.5430440955893, 235561.92011836683)

In [35]:
# Shape of a single time slice; used for the placeholder image below and as
# the output shape of every rasterized polygon
out_shape = data.isel(time=0).shape

# Image to rasterize the polygons in to.
# The builtin int replaces np.int, an alias of the same type that has been
# removed from current NumPy releases.
rasterized_image = np.zeros(out_shape, dtype=int)

# List to store dataframes in
dfs = []

feats = read_features(r'D:\Annies_Dissertation\Data\Boundaries\LSOA_Wessex.shp')

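The single line in the next cell is the key to making the code faster: it loads the whole array into memory once, before the loop over polygons, so the loop works on in-memory values instead of re-evaluating the lazy dask array for every polygon.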

In [36]:
# Load the values into memory once, so the loop over polygons works on an
# in-memory array
data = data.load()

KeyboardInterrupt: 

In [19]:
# Inspect the array; once loaded, the repr shows actual values
data

<xarray.DataArray 'data' (time: 66, y: 212, x: 290)>
array([[[  7.46854973,   7.31030512,   7.47690392, ...,          nan,
                  nan,          nan],
        [  6.85919046,   6.87198734,   5.62207413, ...,          nan,
                  nan,          nan],
        [  8.83175087,   9.8598671 ,   5.67711639, ...,          nan,
                  nan,          nan],
        ..., 
        [  8.56533813,   8.49901867,          nan, ...,          nan,
                  nan,          nan],
        [  8.56972504,          nan,          nan, ...,          nan,
                  nan,          nan],
        [         nan,          nan,          nan, ...,          nan,
                  nan,          nan]],

       [[ 15.02777386,   8.10723877,  12.05983162, ...,          nan,
                  nan,          nan],
        [         nan,          nan,   8.38002014, ...,          nan,
                  nan,          nan],
        [  9.77899742,  13.60194397,   9.22825813, ...,          na

In [20]:
# NOTE(review): `dfs` and `feats` are created in an earlier cell, so re-running
# only this cell appends to whatever `dfs` already holds.

# Loop over features (polygons) in the shapefile.
# The loop variable is named `feat` so it does not overwrite the affine
# coefficient `f` computed earlier.
for feat in tqdm(feats):
    # Rasterize the polygon into an array: 1 inside the polygon (including
    # every pixel it touches), 0 elsewhere
    rasterized_image = features.rasterize([(shape(feat['geometry']), 1)],
                                          out_shape=out_shape,
                                          transform=new_aff,
                                          fill=0,
                                          all_touched=True)

    # Extract from the xarray where the rasterized polygon is
    region = data.where(rasterized_image == 1)

    # Average over both spatial dimensions at once, leaving one value per time
    # step. This avoids building a stacked x/y index for every polygon.
    # Then convert to a dataframe named after the polygon's LSOA code.
    res = region.mean(dim=['x', 'y']).to_dataframe(name=feat['properties']['LSOA11CD'])

    # Append to the list of data frames so we can concatenate them all at the end
    dfs.append(res)

# One column per polygon, indexed by time
stats = pd.concat(dfs, axis=1)

2575it [01:09, 36.38it/s]


In [37]:
# Wide table: one row per time step, one column per LSOA code
stats

Unnamed: 0_level_0,E01014869,E01014890,E01015272,E01015273,E01015274,E01015275,E01015276,E01015277,E01015279,E01015280,...,E01033241,E01033242,E01033283,E01033285,E01033286,E01033288,E01033380,E01033381,E01033383,E01033384
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-31,4.894197,6.379436,11.609479,11.410336,11.650494,11.682238,11.347145,11.450066,11.536720,11.609479,...,11.536720,13.849295,11.536720,13.849295,12.915979,12.240013,10.188380,11.962096,11.962096,8.193390
2009-02-28,8.269163,8.117812,16.521988,17.099459,17.253952,19.085062,18.669735,17.370146,13.958913,16.521988,...,13.958913,18.481438,13.958913,18.481438,17.215921,16.913860,14.799878,16.302748,16.302748,11.476199
2009-03-31,16.443802,15.787933,24.719713,23.110231,22.489740,24.195459,22.043364,22.186796,25.243967,24.719713,...,25.243967,25.079872,25.243967,25.079872,25.198463,26.157143,15.213874,16.075760,16.075760,14.631869
2009-04-30,12.831035,12.642556,13.966952,15.072072,14.637295,12.367358,14.824835,14.989799,15.566546,13.966952,...,15.566546,19.950306,15.566546,19.950306,20.367689,20.624439,15.470265,15.977748,15.977748,15.075045
2009-05-31,9.983700,6.370895,14.851036,14.757053,14.214451,13.850450,14.209768,14.698997,15.851622,14.851036,...,15.851622,15.305582,15.851622,15.305582,17.040155,16.425224,13.010407,14.559179,14.559179,12.237666
2009-06-30,14.679679,13.817136,15.499490,17.341637,18.810036,15.580470,18.303200,17.797129,15.418509,15.499490,...,15.418509,21.191246,15.418509,21.191246,22.128574,21.482872,12.945766,15.101236,15.101236,12.648201
2009-07-31,11.896524,12.728112,21.609333,22.213881,22.655102,21.706160,22.564568,22.486118,21.512508,21.609333,...,21.512508,24.789444,21.512508,24.789444,25.428127,25.453865,14.245003,17.616417,17.616417,13.207088
2009-08-31,17.366266,8.997812,14.671386,15.520734,15.764976,14.220521,15.719976,16.175755,15.122251,14.671386,...,15.122251,17.737610,15.122251,17.737610,17.989836,18.541046,14.485721,15.198454,15.198454,13.455544
2009-09-30,11.546007,9.072837,16.571581,15.893867,15.805025,16.007593,15.273018,15.810766,17.135569,16.571581,...,17.135569,16.350376,17.135569,16.350376,16.094454,16.704769,15.732574,17.352114,17.352114,13.025869
2009-10-31,8.697833,9.306887,14.976901,15.424840,15.962790,16.716471,16.518593,15.286570,13.237331,14.976901,...,13.237331,13.000283,13.237331,13.000283,13.559897,13.826775,10.713559,12.608535,12.608535,10.269573


In [38]:
# Discard time steps (rows) for which every LSOA value is missing
stats = stats.dropna(axis=0, how='all')

In [39]:
# Reshape from wide (one column per LSOA) to long (one row per time/LSOA pair),
# then drop the pairs that have no value
stats_with_time = stats.reset_index()
melted_stats = pd.melt(stats_with_time, id_vars='time', var_name='LSOA')
melted_stats = melted_stats.dropna()

In [40]:
# Long table with columns time, LSOA and value
melted_stats

Unnamed: 0,time,LSOA,value
0,2009-01-31,E01014869,4.894197
1,2009-02-28,E01014869,8.269163
2,2009-03-31,E01014869,16.443802
3,2009-04-30,E01014869,12.831035
4,2009-05-31,E01014869,9.983700
5,2009-06-30,E01014869,14.679679
6,2009-07-31,E01014869,11.896524
7,2009-08-31,E01014869,17.366266
8,2009-09-30,E01014869,11.546007
9,2009-10-31,E01014869,8.697833


In [41]:
# Month number taken from the timestamp
melted_stats['month'] = melted_stats['time'].dt.month

In [42]:
# Calendar year taken from the timestamp
melted_stats['year'] = melted_stats['time'].dt.year

In [43]:
# Check the new month and year columns on the first few rows
melted_stats.head()

Unnamed: 0,time,LSOA,value,month,year
0,2009-01-31,E01014869,4.894197,1,2009
1,2009-02-28,E01014869,8.269163,2,2009
2,2009-03-31,E01014869,16.443802,3,2009
3,2009-04-30,E01014869,12.831035,4,2009
4,2009-05-31,E01014869,9.9837,5,2009


In [44]:
# Write the long-format table to CSV. No index argument is passed, so the
# DataFrame index is written as the first column.
melted_stats.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\Monthly_PM25_LSOA.csv')