# Import relevant libraries/modules etc

In [1]:
import numpy as np
import xarray as xr
import rasterio
%matplotlib inline
from matplotlib.pyplot import *
from glob import glob
import os
import datetime
import pandas as pd
from rasterio import features
from rasterio_to_xarray import rasterio_to_xarray, xarray_to_rasterio, xarray_to_rasterio_by_band
import rasterstats
import fiona
from tqdm import tqdm

from shapely.geometry import shape
from rasterstats.io import read_features

from dask.diagnostics import ProgressBar

In [None]:
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, ProgressBar
# Create a dask progress bar and register it, so it is active for later dask
# computations without needing a `with ProgressBar():` block each time.
pbar = ProgressBar()
pbar.register()

# Read the NetCDF files in to create a single dataset containing all the data

In [2]:
# Open every file matching *PM25.nc as one combined dataset and keep only its
# 'data' variable. The result is dask-backed (see the repr below), so nothing
# is read into memory until a computation or .load() is requested.
PM25 = xr.open_mfdataset(r'C:\MAIACData\nc_monthly_daily\*PM25.nc')['data']

In [3]:
PM25

<xarray.DataArray 'data' (time: 5191, y: 1162, x: 1240)>
dask.array<concate..., shape=(5191, 1162, 1240), dtype=float32, chunksize=(30, 1162, 1240)>
Coordinates:
  * x        (x) float64 -9.476e+05 -9.464e+05 -9.451e+05 -9.439e+05 ...
  * y        (y) float64 1.429e+06 1.428e+06 1.427e+06 1.426e+06 1.424e+06 ...
  * time     (time) datetime64[ns] 2000-10-01 2000-10-02 2000-10-03 ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000e+00  -1.25654304e+03]
    crs: +init=epsg:27700

# Extract data over one pixel

In [None]:
# The 'affine' attr holds the transform as GDAL-ordered coefficients;
# rebuild a real Affine object from them.
a = PM25.attrs['affine']
a = rasterio.Affine.from_gdal(*a)
# ~a is the inverse transform: applied to a point it gives fractional
# (column, row) pixel indices.
# NOTE(review): (382611, 80604) is presumably an easting/northing in the
# dataset CRS (epsg:27700) - confirm.
~a * (382611, 80604)

In [None]:
# Reorder the array so the time axis is ascending.
# NOTE(review): presumably needed because the files are not concatenated in
# chronological order - confirm.
PM25 = PM25.isel(time=np.argsort(PM25.time))

# Label-based slice keeping observations from 2009 through 2016.
After2009 = PM25.sel(time=slice('2009', '2016'))

# Monthly mean of the 2009-2016 subset.
# NOTE(review): monthly_data is not referenced again in the visible notebook,
# and the `resample(freq, dim=..., how=...)` form is the legacy xarray API -
# confirm the xarray version this notebook targets.
monthly_data = After2009.resample('M', dim='time', how='mean', keep_attrs=True)

In [None]:
# Pull the full time series for one pixel (column 1058, row 1073) into memory.
# NOTE(review): these indices look like the integer part of the inverse-affine
# lookup of (382611, 80604) done earlier; presumably the West Lulworth site,
# going by the CSV name used below - confirm.
ts = PM25.isel(x=1058, y=1073).load()

In [None]:
ts

In [None]:
result = ts.to_dataframe()

In [None]:
# Display only: shows the series with NaN rows removed. The result is not
# assigned, so `result` is unchanged and the CSV written in the next cell
# still contains the NaN rows.
result.dropna()

In [None]:
result.to_csv(r'D:\Annies_Dissertation\Analysis\Timeseries\Westlulworth.csv')

## Merging time series data so that a graph can be created

In [None]:
from dateutil.parser import parse

In [None]:
Unisoton = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Timeseries\UniSoton.csv', parse_dates=['time'])

In [None]:
Unisoton[:10]

In [None]:
Unisoton = Unisoton.set_index('time').rename(columns={'data':'Unisoton_data'})

In [None]:
Bolderwood = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Timeseries\Bolderwood.csv', parse_dates=['time'])

In [None]:
Bolderwood[:10]

In [None]:
Bolderwood = Bolderwood.set_index('time').rename(columns={'data':'Bolderwood_data'})

In [None]:
# Join the two sites on their time index. pd.merge defaults to an inner join,
# so only timestamps present in both frames are kept.
# NOTE(review): this rebinds `result`, which earlier held the single-pixel
# dataframe that was written to Westlulworth.csv.
result = pd.merge(Unisoton, Bolderwood, left_index=True, right_index=True)

In [None]:
result[:10]

In [None]:
Westlulworth = pd.read_csv(r'D:\Annies_Dissertation\Analysis\Timeseries\Westlulworth.csv', parse_dates=['time'])

In [None]:
Westlulworth[:10]

In [None]:
Westlulworth = Westlulworth.set_index('time').rename(columns={'data':'Westlulworth_data'})

In [None]:
# Add the third site to the two already merged, again joining on the time
# index (default inner join).
result_3 = pd.merge(result, Westlulworth, left_index=True, right_index=True)

In [None]:
result_3[:10]

In [None]:
result_3 = result_3[['Unisoton_data', 'Bolderwood_data', 'Westlulworth_data']]

In [None]:
# Write the combined table, keeping only rows where all three sites have a
# value (dropna removes any row containing a NaN).
result_3.dropna().to_csv(r'D:\Annies_Dissertation\Analysis\Timeseries\Timeseries.csv')

In [None]:
result[:10]

# Group data
- merge multiple images per day using the maximum AOT estimates

In [None]:
# Reorder the array so the time axis is ascending.
# NOTE(review): the same sort is already applied in an earlier cell; repeating
# it is harmless but redundant if the notebook is run top to bottom.
PM25 = PM25.isel(time=np.argsort(PM25.time))

In [None]:
PM25.time

In [None]:
m = PM25.mean(dim='time', keep_attrs=True)

In [None]:
m = m.load()

In [None]:
figure(figsize=(20, 8))
m.plot()

In [None]:
from dask.dot import dot_graph

In [None]:
##dot_graph(, format='svg')

In [None]:
from dask.dot import to_graphviz

In [None]:
dot_graph(m.data.dask)

In [None]:
dot_graph(m.data.dask, filename='robin2.pdf', format='pdf')

In [None]:
# Time-sorted view of PM25 under a separate name, used as the input to the
# daily resample in the next cell.
reordered_PM25 = PM25.isel(time=np.argsort(PM25.time))

In [None]:
# Collapse multiple images on the same day into one by taking the per-pixel
# daily maximum (legacy `how=`/`dim=` resample form).
Daily_PM25 = reordered_PM25.resample('D', dim='time', how='max')
# Drop time steps where every pixel is NaN.
# NOTE(review): presumably these are days with no image that the daily
# resample introduced - confirm.
Daily_PM25 = Daily_PM25.dropna(dim='time', how='all')

In [None]:
Daily_PM25.time

In [None]:
Daily_PM25.attrs

# Analysis
## overall average of every pixel

The next four cells were added by Robin for profiling - ignore them for the moment :P

In [None]:
om = PM25.mean(dim='time', keep_attrs=True)

In [None]:
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler

In [None]:
# Compute the overall mean while recording dask profiling data: `prof` for the
# task profiler and `rprof` for the resource profiler (created with dt=0.25).
# Both are passed to `visualize` in a later cell.
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof:
    om.load()

In [None]:
from dask.diagnostics import visualize
visualize([prof, rprof], show=True)

In [None]:
overall_mean = PM25.mean(dim='time', keep_attrs=True)

In [None]:
overall_mean.attrs

In [None]:
# Trigger the actual computation of the overall mean, with a progress bar.
# NOTE(review): .load() presumably fills `overall_mean` in place as well as
# returning it, so the export in the next cell writes computed values - confirm.
with ProgressBar():
    res = overall_mean.load()

In [None]:
xarray_to_rasterio(overall_mean, r'D:\Annies_Dissertation\Analysis\overall_mean.tif')

## Averages
### Monthly

In [None]:
# Per-pixel mean for each calendar month, pooling that month across all years
# (grouping is on `time.month`, not on year-month).
monthly_mean = PM25.groupby('time.month').mean(dim='time', keep_attrs=True)

In [None]:
monthly_mean.coords

In [None]:
monthly_mean.attrs

In [None]:
figure(figsize=(20, 12))
# One panel per calendar month. The array computed above is `monthly_mean`;
# the previous name `monthly` is never defined in this notebook and raised a
# NameError on a fresh kernel.
monthly_mean.plot(col='month', robust=True)

In [None]:
xarray_to_rasterio_by_band(monthly_mean, r'D:\Annies_Dissertation\Analysis\monthly\monthly_mean_', dim='month')

### Seasonal

In [None]:
# Per-pixel mean for each season, pooling across all years (grouping is on
# xarray's `time.season` accessor).
seasonal_mean = PM25.groupby('time.season').mean(dim='time', keep_attrs=True)

In [None]:
seasonal_mean.coords

In [None]:
figure(figsize=(20, 12))
seasonal_mean.plot(col='season', robust=True)

In [None]:
xarray_to_rasterio_by_band(seasonal_mean, r'D:\Annies_Dissertation\Analysis\seasonal\seasonal_mean_', dim='season')

### Yearly

In [None]:
# Per-pixel mean for each calendar year (grouping is on `time.year`).
yearly_mean = PM25.groupby('time.year').mean(dim='time', keep_attrs=True)

In [None]:
yearly_mean.coords

In [None]:
figure(figsize=(20, 12))
yearly_mean.plot(col='year', robust=True)

In [None]:
xarray_to_rasterio_by_band(yearly_mean, r'D:\Annies_Dissertation\Analysis\yearly\yearly_mean_', dim='year')

## Standard deviation of AP for each pixel over time - not currently what I want to do!

In [None]:
std = PM25.groupby('time.year').std(dim = 'time', keep_attrs=True)

In [None]:
# Per-pixel standard deviation over the whole record.
# NOTE(review): this rebinds `std`, discarding the per-year result computed in
# the previous cell; from here on `std` has no 'year' dimension.
std = PM25.std(dim = 'time', keep_attrs=True)

In [None]:
std.coords

In [None]:
figure(figsize=(20, 12))
# `std` at this point is the overall (y, x) standard deviation - the per-year
# version was overwritten two cells earlier - so there is no 'year' dimension
# to facet on. Plot the single map, matching what is exported to
# overall_std.tif in the next cell.
std.plot(robust=True)

In [None]:
xarray_to_rasterio(std, r'D:\Annies_Dissertation\Analysis\overall_std.tif')

# Count of values that aren't missing

In [None]:
# Number of non-missing (non-NaN) values per pixel across the time axis.
observations = PM25.count(dim='time', keep_attrs=True)

In [None]:
xarray_to_rasterio(observations, r'D:\Annies_Dissertation\Analysis\observations_count.tif')

In [None]:
# Histogram of the per-pixel observation counts, then save the current figure
# to disk as a JPEG.
count_histo = observations.plot.hist()
savefig(r'D:\Annies_Dissertation\Analysis\count_histogram.jpeg')

In [None]:
figure(figsize=(20, 12))
observations.plot(robust=True)

# Regression
- linear regression for each pixel (Rabernat code)

In [4]:
# Set up dask's opportunistic cache so intermediate results can be reused
# across the repeated computations in the regression section.
from dask.cache import Cache
cache = Cache(2e9)  # Leverage two gigabytes of memory
cache.register()    # Turn cache on globally

In [5]:
# Global call counter read and incremented by linear_trend (defined below) to
# print progress every 10th call. Re-run this cell to reset it before a new
# regression run.
i = 0

In [6]:
from scipy import stats

In [7]:
# define a function to compute a linear trend of a timeseries
def linear_trend(x):
    """Fit a least-squares line to one pixel's time series.

    Parameters
    ----------
    x : xarray.DataArray
        One-dimensional series with a ``time`` coordinate. The time values are
        passed to ``scipy.stats.linregress`` as the independent variable, so
        they must be numeric (the notebook converts them to whole days since
        the first observation before calling this). NaN entries are dropped
        before fitting.

    Returns
    -------
    xarray.DataArray
        Five values along a ``stats`` dimension, labelled ``slope``,
        ``intercept``, ``rvalue``, ``pvalue`` and ``stderr``. All five are NaN
        when fewer than two valid observations remain, because a line cannot
        be fitted to zero or one point.

    Notes
    -----
    Increments the module-level counter ``i`` on every call and prints a
    progress message on every 10th call.
    """
    # Progress reporting: bump the global call counter, print every 10th call.
    global i
    i = i + 1

    if i % 10 == 0:
        print('Current iteration: %d' % i)

    stat_names = ['slope', 'intercept', 'rvalue', 'pvalue', 'stderr']

    # Remove the NaN values
    x = x.dropna(dim='time')

    if len(x) < 2:
        # Zero or one observation: no line can be fitted. linregress either
        # raises or returns meaningless numbers here depending on the SciPy
        # version, and one raise would abort the whole per-pixel groupby, so
        # return an all-NaN record instead.
        values = np.full(len(stat_names), np.nan)
    else:
        # The first five fields of the linregress result are, in order,
        # slope, intercept, rvalue, pvalue, stderr.
        values = np.array(stats.linregress(x.time, x))

    # We need to return a DataArray or else xarray's groupby won't be happy;
    # the 'stats' dimension labels each of the five values.
    return xr.DataArray(values,
                        dims=['stats'],
                        coords={'stats': stat_names})

In [8]:
# Subsetting params
x_start = 975
x_end = None

y_start = 975
y_stop = None

In [9]:
# Cut out the region of interest by pixel index. Each axis uses its own
# bounds: the x slice previously ended at y_stop, which only gave the right
# answer because x_end and y_stop are both None.
subset = PM25.isel(x=slice(x_start, x_end), y=slice(y_start, y_stop))

In [10]:
# Replace the projected x/y coordinates with 0-based integer indices, so that
# after stacking each point is labelled by its (row, column) position within
# the subset. Georeferencing for the output is rebuilt separately from the
# affine transform further down.
subset['x'] = np.arange(len(subset.x))
subset['y'] = np.arange(len(subset.y))

In [11]:
subset = subset.isel(time=np.argsort(subset.time))

In [12]:
# Get the actual Affine object from the data stored in the attrs
orig_aff = rasterio.Affine.from_gdal(*subset.attrs['affine'])

In [13]:
def window_bounds(window, affine):
    """Return the (west, south, east, north) bounds of a pixel window.

    Parameters
    ----------
    window : tuple
        ``((row_start, row_stop), (col_start, col_stop))`` in pixel indices.
    affine : Affine
        Transform mapping ``(col, row)`` pixel indices to ``(x, y)``
        coordinates via ``affine * (col, row)``.

    Returns
    -------
    tuple
        ``(w, s, e, n)``: the transformed lower-left corner
        ``(col_start, row_stop)`` followed by the transformed upper-right
        corner ``(col_stop, row_start)``.
    """
    (row_start, row_stop), (col_start, col_stop) = window
    # Apply the transform with the Affine on the left. The reversed form
    # `(col, row) * affine` only works through Affine's deprecated
    # right-multiplication hook and gives the same numbers where it exists.
    w, s = affine * (col_start, row_stop)
    e, n = affine * (col_stop, row_start)
    return w, s, e, n

In [14]:
# Build the transform for the subset: same pixel size and rotation terms as
# the original, but with the origin moved to the subset's top-left corner.
# window_bounds expects ((row_start, row_stop), (col_start, col_stop)); rows
# index y and columns index x, so the y bounds go first. They were previously
# passed x-first, which only gave the right origin because x_start == y_start.
# Only the west and north edges are used, so the stop values (5000) are
# placeholders that do not affect the result.
c, _, _, f = window_bounds(((y_start, 5000), (x_start, 5000)), orig_aff)  # c ~ west, f ~ north
# An Affine unpacks to 9 values (a, b, c, d, e, f, then the fixed last row);
# keep the scale/rotation terms and substitute the new origin c, f.
a, b, _, d, e, _, _, _, _ = tuple(orig_aff)
new_aff = rasterio.Affine(a, b, c, d, e, f)

In [15]:
orig_aff

Affine(1256.5430440955893, 0.0, -947639.63051064778,
       0.0, -1256.5430440955893, 1429277.8120091767)

In [16]:
new_aff

Affine(1256.5430440955893, 0.0, 277489.83748255181,
       0.0, -1256.5430440955893, 204148.34401597711)

In [17]:
subset.shape

(5191, 187, 265)

In [18]:
# Collapse the y and x dimensions into a single 'allpoints' dimension, so each
# pixel's time series can be handled as one group in the regression below.
stacked = subset.stack(allpoints=['y', 'x'])

In [19]:
# Replace the datetime coordinate with whole days elapsed since the first
# observation, so linregress receives a plain numeric x-axis and the slope is
# in units per day. `.days` returns integer day counts on both old and current
# pandas; `.astype('timedelta64[D]')` is rejected by current pandas because
# day resolution is no longer a supported timedelta unit.
stacked['time'] = (pd.to_datetime(stacked.time.values) - pd.to_datetime(stacked.time.values[0])).days

In [20]:
stacked.time

<xarray.DataArray 'time' (time: 5191)>
array([   0,    1,    2, ..., 5237, 5238, 5239], dtype=int64)
Coordinates:
  * time     (time) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...

In [21]:
stacked.load()

<xarray.DataArray 'data' (time: 5191, allpoints: 49555)>
array([[ 20.43780518,  18.43171692,  16.50647163, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       ..., 
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan]], dtype=float32)
Coordinates:
  * time       (time) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ...
  * allpoints  (allpoints) object (0, 0) (0, 1) (0, 2) (0, 3) (0, 4) (0, 5) ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000

In [22]:
# Run linear_trend once per stacked pixel and combine the five-value results
# into one array; %time reports how long the whole pass takes. The progress
# lines printed below come from linear_trend's global counter.
%time trend = stacked.groupby('allpoints').apply(linear_trend)

Current iteration: 10
Current iteration: 20
Current iteration: 30
Current iteration: 40
Current iteration: 50
Current iteration: 60
Current iteration: 70
Current iteration: 80
Current iteration: 90
Current iteration: 100
Current iteration: 110
Current iteration: 120
Current iteration: 130
Current iteration: 140
Current iteration: 150
Current iteration: 160
Current iteration: 170
Current iteration: 180
Current iteration: 190
Current iteration: 200
Current iteration: 210
Current iteration: 220
Current iteration: 230
Current iteration: 240
Current iteration: 250
Current iteration: 260
Current iteration: 270
Current iteration: 280
Current iteration: 290
Current iteration: 300
Current iteration: 310
Current iteration: 320
Current iteration: 330
Current iteration: 340
Current iteration: 350
Current iteration: 360
Current iteration: 370
Current iteration: 380
Current iteration: 390
Current iteration: 400
Current iteration: 410
Current iteration: 420
Current iteration: 430
Current iteration: 4

In [23]:
# Attach the subset's transform to the result in GDAL coefficient order, the
# same layout as the 'affine' attr on the source data.
trend.attrs['affine'] = new_aff.to_gdal()

In [24]:
trend.attrs['crs'] = stacked.attrs['crs']

In [25]:
# Undo the earlier stack: split 'allpoints' back into separate y and x
# dimensions so each statistic is a 2-D grid again.
# NOTE(review): this rebinds `res`, which an earlier cell used for the loaded
# overall mean.
res = trend.unstack('allpoints')

In [26]:
res.sel(stats='rvalue')

<xarray.DataArray (y: 187, x: 265)>
array([[-0.07183078, -0.05492095, -0.02803343, ...,         nan,
                nan,         nan],
       [-0.07184386, -0.05489315, -0.03334102, ...,         nan,
                nan,         nan],
       [-0.07252268, -0.04560569, -0.05606452, ...,         nan,
                nan,         nan],
       ..., 
       [-0.03935783, -0.03123281, -0.00684372, ...,         nan,
                nan,         nan],
       [-0.02614183,  0.00103424, -0.01058629, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]])
Coordinates:
    stats    <U9 'rvalue'
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Attributes:
    affine: (277489.83748255181, 1256.5430440955893, 0.0, 204148.34401597711, 0.0, -1256.5430440955893)
    crs: +init=epsg:27700

In [27]:
# Export the regression results, one output per label along 'stats' (the log
# below lists slope, intercept, rvalue, pvalue, stderr).
# NOTE(review): the helper comes from rasterio_to_xarray and is not visible
# here; presumably the path is used as a filename prefix - confirm.
xarray_to_rasterio_by_band(res, r'D:\Annies_Dissertation\Analysis\Regression\overall_reg_75', dim='stats')

Exported slope
Exported intercept
Exported rvalue
Exported pvalue
Exported stderr
