# Linear regression over each pixel: PM2.5 against time
- Code based on Rabernat (2016), modified to include stats.linregress

### Daily mean PM2.5 for the period 2000 to mid-2014

In [1]:
import numpy as np
import xarray as xr
import rasterio
%matplotlib inline
from matplotlib.pyplot import *
import pandas as pd
from rasterio_to_xarray import rasterio_to_xarray, xarray_to_rasterio, xarray_to_rasterio_by_band
from tqdm import tqdm
from scipy import stats

from shapely.geometry import shape
from rasterstats.io import read_features

#### Linear regression

In [3]:
# Call counter used by linear_trend, which increments it on every call and
# prints it on every tenth call as a progress indicator.
i = 0

In [4]:
# define a function to compute a linear trend of a timeseries
def linear_trend(x):
    """Fit a least-squares line to a single time series.

    Written to be passed to ``groupby(...).apply`` so that it is called once
    per group. NaN samples are dropped along ``time`` before fitting.

    Parameters
    ----------
    x : xarray.DataArray
        Series with a ``time`` coordinate. ``x.time`` is used directly as the
        regressor, so it must already be numeric (this notebook converts it to
        day offsets before calling).

    Returns
    -------
    xarray.DataArray
        Five values along a ``stats`` dimension, labelled ``slope``,
        ``intercept``, ``rvalue``, ``pvalue`` and ``stderr``. All five are NaN
        when fewer than two valid samples remain after dropping NaNs.
    """
    # Progress reporting: bump the module-level call counter and print it
    # whenever it is divisible by 10.
    global i
    i = i + 1

    if i % 10 == 0:
        print('Current iteration: %d' % i)

    stat_names = ['slope', 'intercept', 'rvalue', 'pvalue', 'stderr']

    # Remove the NaN values
    x = x.dropna(dim='time')

    if len(x) < 2:
        # A line cannot be fitted to zero or one point (recent SciPy releases
        # raise ValueError when all x values are identical, which a single
        # sample trivially is), so report "no result" for this series.
        values = np.full(len(stat_names), np.nan)
    else:
        # Get the results of the linear regression
        regr = stats.linregress(x.time, x)
        values = np.array(regr)

    # We need to return a DataArray or else xarray's groupby won't be happy.
    # The 'stats' dimension carries one label per regression output.
    return xr.DataArray(values,
                        dims=['stats'],
                        coords={'stats': stat_names})

#### Daily Mean PM2.5 data

In [2]:
# Open every file matching *PM25.nc as a single dataset and keep its 'data' variable
PM25 = xr.open_mfdataset(r'C:\MAIACData\nc_monthly_daily\*PM25.nc')['data']

In [6]:
# Positional subset of the grid. The bounds come from the "Subsetting params"
# cell (execution count 5), which has to be run before this one.
# The x slice now uses x_end; it previously used y_stop for both axes.
subset = PM25.isel(x=slice(x_start, x_end), y=slice(y_start, y_stop))

In [7]:
# Replace the x and y coordinates with zero-based integer indices
subset['x'] = np.arange(len(subset.x))
subset['y'] = np.arange(len(subset.y))

In [8]:
# Reorder along time so that the timestamps are in ascending order
subset = subset.isel(time=np.argsort(subset.time))

- Get the affine variables for the subset dataset

In [5]:
# Subsetting params for x and y
# These are used as slice bounds when the subset is taken: a start index and
# an exclusive stop index per axis, where None means "up to the end".
# NOTE(review): the stop names are inconsistent (x_end vs y_stop).
x_start = 975
x_end = None

y_start = 975
y_stop = None

In [9]:
# Get the actual Affine object from the data stored in the attrs.
# from_gdal takes the six coefficients in GDAL geotransform order, so
# attrs['affine'] is expected to be stored in that order.
orig_aff = rasterio.Affine.from_gdal(*subset.attrs['affine'])

In [10]:
def window_bounds(window, affine):
    """Return the (west, south, east, north) bounds of a pixel window.

    Parameters
    ----------
    window : ((row_start, row_stop), (col_start, col_stop))
        Pixel offsets of the window.
    affine : Affine-like
        Transform mapping (col, row) pixel coordinates to map coordinates. It
        must support ``affine * (col, row)``.

    Returns
    -------
    tuple
        ``(w, s, e, n)``: the transformed lower-left corner followed by the
        transformed upper-right corner.
    """
    (row_start, row_stop), (col_start, col_stop) = window
    # Multiply as matrix * vector. The reverse form (vector * matrix) relies
    # on right-multiplication, which newer releases of the affine package
    # deprecate; both forms give the same point.
    w, s = affine * (col_start, row_stop)
    e, n = affine * (col_stop, row_start)
    return w, s, e, n

In [11]:
# window_bounds unpacks its window as ((row_start, row_stop), (col_start, col_stop)).
# Rows run along y and columns along x, so the y offset goes first.
# Only the west and north edges are kept, so the stop values (5000) do not
# influence the result.
c, _, _, f = window_bounds(((y_start, 5000), (x_start, 5000)), orig_aff)  # c ~ west, f ~ north
# Keep the original scale/rotation terms and swap in the subset's origin.
a, b, _, d, e, _, _, _, _ = tuple(orig_aff)
new_aff = rasterio.Affine(a, b, c, d, e, f)

In [12]:
# Inspect the transform read from the dataset attributes
orig_aff

Affine(1256.5430440955893, 0.0, -947639.63051064778,
       0.0, -1256.5430440955893, 1429277.8120091767)

In [13]:
# Inspect the transform whose origin has been moved to the subset's corner
new_aff

Affine(1256.5430440955893, 0.0, 277489.83748255181,
       0.0, -1256.5430440955893, 204148.34401597711)

In [14]:
# Check the size of the subset before stacking
subset.shape

(5191, 187, 265)

In [15]:
# Collapse y and x into one 'allpoints' dimension so each pixel can be
# handled as a separate group
stacked = subset.stack(allpoints=['y', 'x'])

In [16]:
# Express time as whole days elapsed since the first timestamp so it can be
# used as a numeric regressor. `.days` yields integers on every pandas
# version, whereas `.astype('timedelta64[D]')` changes its result type
# between releases and is rejected by pandas 2.x.
stacked['time'] = (pd.to_datetime(stacked.time.values) - pd.to_datetime(stacked.time.values[0])).days

In [17]:
# Check that time is now an integer day offset
stacked.time

<xarray.DataArray 'time' (time: 5191)>
array([   0,    1,    2, ..., 5237, 5238, 5239], dtype=int64)
Coordinates:
  * time     (time) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...

In [18]:
# Read the values into memory before running the per-pixel regression
stacked.load()

<xarray.DataArray 'data' (time: 5191, allpoints: 49555)>
array([[ 20.43780518,  18.43171692,  16.50647163, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       ..., 
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan]], dtype=float32)
Coordinates:
  * time       (time) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ...
  * allpoints  (allpoints) object (0, 0) (0, 1) (0, 2) (0, 3) (0, 4) (0, 5) ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000

In [19]:
# Run linear_trend once for every entry of 'allpoints' (i.e. per pixel) and time it
%time trend = stacked.groupby('allpoints').apply(linear_trend)

Current iteration: 10
Current iteration: 20
Current iteration: 30
Current iteration: 40
Current iteration: 50
Current iteration: 60
Current iteration: 70
Current iteration: 80
Current iteration: 90
Current iteration: 100
Current iteration: 110
Current iteration: 120
Current iteration: 130
Current iteration: 140
Current iteration: 150
Current iteration: 160
Current iteration: 170
Current iteration: 180
Current iteration: 190
Current iteration: 200
Current iteration: 210
Current iteration: 220
Current iteration: 230
Current iteration: 240
Current iteration: 250
Current iteration: 260
Current iteration: 270
Current iteration: 280
Current iteration: 290
Current iteration: 300
Current iteration: 310
Current iteration: 320
Current iteration: 330
Current iteration: 340
Current iteration: 350
Current iteration: 360
Current iteration: 370
Current iteration: 380
Current iteration: 390
Current iteration: 400
Current iteration: 410
Current iteration: 420
Current iteration: 430
Current iteration: 4

In [20]:
# Store the subset transform on the result, in GDAL geotransform order
trend.attrs['affine'] = new_aff.to_gdal()

In [21]:
# Carry the coordinate reference system over from the input data
trend.attrs['crs'] = stacked.attrs['crs']

In [22]:
# Split 'allpoints' back into separate y and x dimensions
res = trend.unstack('allpoints')

In [23]:
# Inspect the correlation coefficient layer
res.sel(stats='rvalue')

<xarray.DataArray (y: 187, x: 265)>
array([[-0.07183078, -0.05492095, -0.02803343, ...,         nan,
                nan,         nan],
       [-0.07184386, -0.05489315, -0.03334102, ...,         nan,
                nan,         nan],
       [-0.07252268, -0.04560569, -0.05606452, ...,         nan,
                nan,         nan],
       ..., 
       [-0.03935783, -0.03123281, -0.00684372, ...,         nan,
                nan,         nan],
       [-0.02614183,  0.00103424, -0.01058629, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]])
Coordinates:
    stats    <U9 'rvalue'
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Attributes:
    affine: (277489.83748255181, 1256.5430440955893, 0.0, 204148.34401597711, 0.0, -1256.5430440955893)
    crs: +init=epsg:27700

In [24]:
# Export one raster per label along 'stats'. The path appears to act as a
# filename prefix: a later cell reads back '...overall_reg_rvalue.tif'.
xarray_to_rasterio_by_band(res, r'D:\Annies_Dissertation\Analysis\Regression\overall_reg_', dim='stats')

Exported slope
Exported intercept
Exported rvalue
Exported pvalue
Exported stderr


- Produce the R-squared value

In [None]:
# Read the exported rvalue raster back in
rvalue = rasterio_to_xarray(r'D:\Annies_Dissertation\Analysis\Regression\overall_reg_rvalue.tif')

In [None]:
# Inspect the raster that was read back
rvalue

In [None]:
# R-squared is the square of the correlation coefficient
rsq = rvalue**2

In [None]:
# Same element-wise squaring as the previous cell, written with np.square;
# only one of the two cells is needed.
rsq = np.square(rvalue)

In [None]:
# Inspect the R-squared values
rsq

In [None]:
# Give rsq the same attributes as the rvalue raster it was derived from
rsq.attrs = rvalue.attrs

In [None]:
# Write the R-squared values out as a raster
xarray_to_rasterio(rsq, r'D:\Annies_Dissertation\Analysis\Regression\overall_reg_rsquared.tif')

#### Monthly means of the data from 2009 onwards

In [25]:
# Keep only the timestamps that fall between 2009 and 2016 (label-based slice)
After2009 = subset.sel(time=slice('2009', '2016'))

In [26]:
# Average into monthly bins along time, keeping the attributes.
# NOTE(review): resample(freq, dim=..., how=...) is the legacy xarray
# signature; newer releases expect resample(time='M').mean(...). Confirm
# against the installed xarray version before changing.
monthly_data = After2009.resample('M', dim='time', how='mean', keep_attrs=True)

In [27]:
# Rebind `subset` to the monthly data so the cells below can repeat the daily
# workflow unchanged. The daily subset is no longer reachable under this name,
# so re-running earlier cells after this point operates on the monthly data.
subset = monthly_data

In [28]:
# Check the size of the monthly data before stacking
subset.shape

(66, 187, 265)

In [29]:
# Collapse y and x into one 'allpoints' dimension so each pixel can be
# handled as a separate group
stacked = subset.stack(allpoints=['y', 'x'])

In [30]:
# Express time as whole days elapsed since the first timestamp so it can be
# used as a numeric regressor. `.days` yields integers on every pandas
# version, whereas `.astype('timedelta64[D]')` changes its result type
# between releases and is rejected by pandas 2.x.
stacked['time'] = (pd.to_datetime(stacked.time.values) - pd.to_datetime(stacked.time.values[0])).days

In [31]:
# Read the values into memory before running the per-pixel regression
stacked.load()

  x = np.divide(x1, x2, out)


<xarray.DataArray 'data' (time: 66, allpoints: 49555)>
array([[  6.16499186,   7.27143002,   7.54503965, ...,          nan,
                 nan,          nan],
       [ 13.24014473,  11.80920887,  12.46907806, ...,          nan,
                 nan,          nan],
       [ 13.30632782,  16.23963165,  19.32287788, ...,          nan,
                 nan,          nan],
       ..., 
       [  5.43005943,   8.4121685 ,  13.8059721 , ...,          nan,
                 nan,          nan],
       [  7.76672745,   8.56555176,  10.1230917 , ...,          nan,
                 nan,          nan],
       [  4.4721446 ,   4.81483126,   7.09846687, ...,          nan,
                 nan,          nan]], dtype=float32)
Coordinates:
  * time       (time) int64 0 28 59 89 120 150 181 212 242 273 303 334 365 ...
  * allpoints  (allpoints) object (0, 0) (0, 1) (0, 2) (0, 3) (0, 4) (0, 5) ...
Attributes:
    affine: [ -9.47639631e+05   1.25654304e+03   0.00000000e+00   1.42927781e+06
   0.00000000e+

In [32]:
%time trend = stacked.groupby('allpoints').apply(linear_trend)

Current iteration: 49560
Current iteration: 49570
Current iteration: 49580
Current iteration: 49590
Current iteration: 49600
Current iteration: 49610
Current iteration: 49620
Current iteration: 49630
Current iteration: 49640
Current iteration: 49650
Current iteration: 49660
Current iteration: 49670
Current iteration: 49680
Current iteration: 49690
Current iteration: 49700
Current iteration: 49710
Current iteration: 49720
Current iteration: 49730
Current iteration: 49740
Current iteration: 49750
Current iteration: 49760
Current iteration: 49770
Current iteration: 49780
Current iteration: 49790
Current iteration: 49800
Current iteration: 49810
Current iteration: 49820
Current iteration: 49830
Current iteration: 49840
Current iteration: 49850
Current iteration: 49860
Current iteration: 49870
Current iteration: 49880
Current iteration: 49890
Current iteration: 49900
Current iteration: 49910
Current iteration: 49920
Current iteration: 49930
Current iteration: 49940
Current iteration: 49950


In [33]:
# Store the subset transform on the result, in GDAL geotransform order
trend.attrs['affine'] = new_aff.to_gdal()

In [34]:
# Carry the coordinate reference system over from the input data
trend.attrs['crs'] = stacked.attrs['crs']

In [35]:
# Split 'allpoints' back into separate y and x dimensions
res = trend.unstack('allpoints')

In [36]:
# Inspect the correlation coefficient layer
res.sel(stats='rvalue')

<xarray.DataArray (y: 187, x: 265)>
array([[-0.00781883, -0.21179636, -0.20302125, ...,         nan,
                nan,         nan],
       [-0.00790708, -0.21153031, -0.19773669, ...,         nan,
                nan,         nan],
       [-0.14489348, -0.18745629, -0.06267111, ...,         nan,
                nan,         nan],
       ..., 
       [ 0.09912626,  0.11926991,  0.19151793, ...,         nan,
                nan,         nan],
       [ 0.09741865,  0.11293997,  0.19098706, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]])
Coordinates:
    stats    <U9 'rvalue'
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Attributes:
    affine: (277489.83748255181, 1256.5430440955893, 0.0, 204148.34401597711, 0.0, -1256.5430440955893)
    crs: +init=epsg:27700

In [37]:
# Export one raster per label along 'stats'. The path appears to act as a
# filename prefix: the next cell reads back '...subset_overall_reg_rvalue.tif'.
xarray_to_rasterio_by_band(res, r'D:\Annies_Dissertation\Analysis\Regression\subset_overall_reg_', dim='stats')

Exported slope
Exported intercept
Exported rvalue
Exported pvalue
Exported stderr


In [38]:
# Read the exported rvalue raster back in
subsetrvalue = rasterio_to_xarray(r'D:\Annies_Dissertation\Analysis\Regression\subset_overall_reg_rvalue.tif')

  data = np.where(data == src.nodata, np.nan, data)


In [39]:
# Inspect the raster that was read back
subsetrvalue

<xarray.DataArray (y: 187, x: 265)>
array([[-0.00781883, -0.21179636, -0.20302125, ...,         nan,
                nan,         nan],
       [-0.00790708, -0.21153031, -0.19773669, ...,         nan,
                nan,         nan],
       [-0.14489348, -0.18745629, -0.06267111, ...,         nan,
                nan,         nan],
       ..., 
       [ 0.09912626,  0.11926991,  0.19151793, ...,         nan,
                nan,         nan],
       [ 0.09741865,  0.11293997,  0.19098706, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]])
Coordinates:
  * x        (x) int32 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int32 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
Attributes:
    crs: +init=epsg:27700
    affine: (277489.8374825518, 1256.5430440955893, 0.0, 204148.3440159771, 0.0, -1256.5430440955893)

In [40]:
# R-squared is the square of the correlation coefficient
rsq = np.square(subsetrvalue)

In [41]:
# Inspect the R-squared values
rsq

<xarray.DataArray (y: 187, x: 265)>
array([[  6.11341268e-05,   4.48576985e-02,   4.12176279e-02, ...,
                     nan,              nan,              nan],
       [  6.25219820e-05,   4.47450710e-02,   3.90997991e-02, ...,
                     nan,              nan,              nan],
       [  2.09941200e-02,   3.51398615e-02,   3.92766771e-03, ...,
                     nan,              nan,              nan],
       ..., 
       [  9.82601444e-03,   1.42253114e-02,   3.66791178e-02, ...,
                     nan,              nan,              nan],
       [  9.49039243e-03,   1.27554366e-02,   3.64760558e-02, ...,
                     nan,              nan,              nan],
       [             nan,              nan,              nan, ...,
                     nan,              nan,              nan]])
Coordinates:
  * x        (x) int32 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
  * y        (y) int32 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .

In [42]:
# Give rsq the same attributes (crs, affine) as the rvalue raster it was derived from
rsq.attrs = subsetrvalue.attrs

In [43]:
# Write the R-squared values out as a raster
xarray_to_rasterio(rsq, r'D:\Annies_Dissertation\Analysis\Regression\subset_overall_reg_rsquared.tif')