In [1]:
import numpy as np
import xarray as xr
import rasterio
%matplotlib inline
from matplotlib.pyplot import *
from glob import glob
import os
import datetime

import pandas as pd

from rasterio import features

from rasterio_to_xarray import rasterio_to_xarray, xarray_to_rasterio

import rasterstats
import fiona

from tqdm import tqdm

from shapely.geometry import shape
from rasterstats.io import read_features

#from dask.diagnostics import ProgressBar

pbar = ProgressBar()
pbar.register()

pbar.unregister()

In [2]:
data = xr.open_dataset('http://opendap.knmi.nl/knmi/thredds/dodsC/e-obs_0.25regular/tg_stderr_0.25deg_reg_v13.1.nc')

In [3]:
tg = data['tg']

## Monthly data for South of UK

In [4]:
%time monthly = tg.sel(time=slice('2009-01-01','2016-07-01')).resample('M', dim='time', how='mean')

Wall time: 1min 18s


In [5]:
uk = tg.sel(longitude=slice(-10,3), latitude=slice(50,54))

In [6]:
%time uk_monthly = uk.sel(time=slice('2009-01-01','2016-07-01')).resample('M', dim='time', how='mean')

Wall time: 7.75 s


In [7]:
data = uk_monthly

The next few cells are the bits that convert the affine transform

In [8]:
# Image to rasterize the polygons in to
rasterized_image = np.zeros(data.isel(time=0).shape, dtype=np.int)

# List to store dataframes in
dfs = []

feats = read_features(r'D:\Annies_Dissertation\Data\Boundaries\LSOA_WGS.shp')

out_shape = data.isel(time=0).shape

This single line is the key thing that makes the code faster!

In [9]:
data = data.load()

In [10]:
data

<xarray.DataArray 'tg' (time: 84, latitude: 16, longitude: 52)>
array([[[        nan,         nan,         nan, ...,  1.40935481,
          1.42354836,  1.43129029],
        [        nan,         nan,         nan, ...,  1.466129  ,
          1.44580642,  1.41064513],
        [        nan,         nan,         nan, ...,  1.35774191,
          1.35903223,  1.39677416],
        ..., 
        [        nan,  1.47838706,  1.33387094, ...,         nan,
                 nan,         nan],
        [ 1.45516126,  1.40709674,  1.31580642, ...,         nan,
                 nan,         nan],
        [        nan,  1.42516126,  1.36225803, ...,         nan,
                 nan,         nan]],

       [[        nan,         nan,         nan, ...,  1.31035711,
          1.2960714 ,  1.28749997],
        [        nan,         nan,         nan, ...,  1.31214283,
          1.30035711,  1.30499997],
        [        nan,         nan,         nan, ...,  1.27821426,
          1.27785711,  1.30321426],
  

In [13]:
# Loop over features (polygons) in the shapefile
for f in tqdm(feats):
    # Rasterize the polygon into an array
    rasterized_image = features.rasterize([(shape(f['geometry']),1)],
                                          out_shape=out_shape,
                                          fill=0,
                                          all_touched=True)

    # Extract from the xarray where the rasterized polygon is
    region = data.where(rasterized_image == 1)
    
    # Combine x and y into a new dimension called allpoints and calculate the mean over it
    # and then convert to a dataframe with an appropriate name
    res = region.stack(allpoints=['longitude','latitude']).mean(dim='allpoints').to_dataframe(name=f['properties']['LSOA11CD'])
    
    # Append to the list of data frames so we can concatenate them all at the end
    dfs.append(res)
    
stats = pd.concat(dfs, axis=1)

2576it [00:18, 138.49it/s]


In [14]:
stats

Unnamed: 0_level_0,E01014891,E01015272,E01015273,E01015274,E01015275,E01015276,E01015277,E01015279,E01015280,E01015281,...,E01033241,E01033242,E01033283,E01033285,E01033286,E01033288,E01033380,E01033381,E01033383,E01033384
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-31,,,,,,,,,,,...,,,,,,,,,,
2009-02-28,,,,,,,,,,,...,,,,,,,,,,
2009-03-31,,,,,,,,,,,...,,,,,,,,,,
2009-04-30,,,,,,,,,,,...,,,,,,,,,,
2009-05-31,,,,,,,,,,,...,,,,,,,,,,
2009-06-30,,,,,,,,,,,...,,,,,,,,,,
2009-07-31,,,,,,,,,,,...,,,,,,,,,,
2009-08-31,,,,,,,,,,,...,,,,,,,,,,
2009-09-30,,,,,,,,,,,...,,,,,,,,,,
2009-10-31,,,,,,,,,,,...,,,,,,,,,,


In [15]:
stats = stats.dropna(how='all')

In [16]:
melted_stats = pd.melt(stats.reset_index(), id_vars='time', var_name='LSOA').dropna()

In [17]:
melted_stats

Unnamed: 0,time,LSOA,value


In [None]:
melted_stats['month'] = melted_stats.time.dt.month

In [None]:
melted_stats['year'] = melted_stats.time.dt.year

In [None]:
melted_stats.head()

In [None]:
melted_stats.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\MSOA\Monthly_Temp_LSOA.csv')

## Validating with AP for a LSOA (E01017182) that is located within a pixel

In [None]:
PM25 = xr.open_mfdataset(r'C:\MAIACData\nc_monthly_daily\*PM25.nc')['data']

In [None]:
PM25

In [None]:
a = PM25.attrs['affine']
a = rasterio.Affine.from_gdal(*a)
~a * (439040, 115775)

In [None]:
PM25 = PM25.isel(time=np.argsort(PM25.time))

In [None]:
After2009 = PM25.sel(time=slice('2009', '2016'))

In [None]:
monthly_data = After2009.resample('M', dim='time', how='mean', keep_attrs=True)

In [None]:
ts = monthly_data.isel(x=1103, y=1045).load()

In [None]:
ts

In [None]:
result = ts.to_dataframe()

In [None]:
result.dropna()

In [None]:
result.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\Validation\Monthly_PM25_LSOA_Validation.csv')

### Joining data for pixel and LSOA to validate Rasterstats method

In [None]:
from dateutil.parser import parse

In [None]:
Pixel = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Regression\Validation\Monthly_PM25_LSOA_Validation.csv', parse_dates=['time'])

In [None]:
Pixel[:10]

In [None]:
Pixel = Pixel.set_index('time')

In [None]:
Pixel = Pixel[['data']].dropna()

In [None]:
Pixel['LSOA'] = 'E01017182'

In [None]:
Area = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Regression\Monthly_PM25_LSOA.csv', parse_dates=['time'])

In [None]:
Area[:10]

In [None]:
Area = Area.set_index('time')

In [None]:
Area = Area[['LSOA', 'value']].dropna()

In [None]:
E01 = Area.loc[Area['LSOA'] == 'E01017182']

In [None]:
E01

In [None]:
result = pd.merge(Pixel, E01, left_index=True, right_index=True)

In [None]:
result.dropna()

In [None]:
result['Difference'] = result['data']- result['value']

In [None]:
result[:10]

In [None]:
result.Difference.value_counts()

In [None]:
result.groupby('Difference').count()

## Yearly

In [None]:
yearly_data = After2009.resample('A', dim='time', how='mean', keep_attrs=True)

In [None]:
yearly_data

In [None]:
data = yearly_data

In [None]:
# Get the actual Affine object from the data stored in the attrs
orig_aff = rasterio.Affine.from_gdal(*data.attrs['affine'])

In [None]:
def window_bounds(window, affine):
    (row_start, row_stop), (col_start, col_stop) = window
    w, s = (col_start, row_stop) * affine
    e, n = (col_stop, row_start) * affine
    return w, s, e, n

In [None]:
c, _, _, f = window_bounds( ( (x_start, 5000), (y_start, 5000)), orig_aff)  # c ~ west, f ~ north
a, b, _, d, e, _, _, _, _ = tuple(orig_aff)
new_aff = rasterio.Affine(a, b, c, d, e, f)

In [None]:
orig_aff

In [None]:
new_aff

In [None]:
# Image to rasterize the polygons in to
rasterized_image = np.zeros(data.isel(time=0).shape, dtype=np.int)

# List to store dataframes in
dfs = []

feats = read_features(r'D:\Annies_Dissertation\Data\Boundaries\LSOA_Wessex.shp')

out_shape = data.isel(time=0).shape

In [None]:
data = data.load()

In [None]:
# Loop over features (polygons) in the shapefile
for f in tqdm(feats):
    # Rasterize the polygon into an array
    rasterized_image = features.rasterize([(shape(f['geometry']),1)],
                                          out_shape=out_shape,
                                          transform=new_aff,
                                          fill=0,
                                          all_touched=True)

    # Extract from the xarray where the rasterized polygon is
    region = data.where(rasterized_image == 1)
    
    # Combine x and y into a new dimension called allpoints and calculate the mean over it
    # and then convert to a dataframe with an appropriate name
    res = region.stack(allpoints=['x','y']).mean(dim='allpoints').to_dataframe(name=f['properties']['LSOA11CD'])
    
    # Append to the list of data frames so we can concatenate them all at the end
    dfs.append(res)
    
stats = pd.concat(dfs, axis=1)

In [None]:
stats

In [None]:
stats = stats.dropna(how='all')

In [None]:
melted_stats = pd.melt(stats.reset_index(), id_vars='time', var_name='LSOA').dropna()

In [None]:
melted_stats

In [None]:
melted_stats['year'] = melted_stats.time.dt.year

In [None]:
melted_stats.head()

In [None]:
melted_stats.to_csv(r'D:\Annies_Dissertation\Analysis\Regression\Yearly_PM25_LSOA.csv')