In [24]:
import xarray as xr
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

In [25]:
# Obtain the Spark session for this notebook (existing one if already started),
# registered under the 'spark-xarray' application name.
spark = (
    SparkSession.builder
    .appName('spark-xarray')
    .getOrCreate()
)

In [26]:
sc = spark.sparkContext

In [2]:
dset = xr.open_dataset('../sample-data/air.sig995.2012.nc')

In [3]:
dset

<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144, time: 366)
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
  * time     (time) datetime64[ns] 2012-01-01 2012-01-02 2012-01-03 ...
Data variables:
    air      (time, lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 ...
Attributes:
    Conventions:  COARDS
    title:        mean daily NMC reanalysis (2012)
    history:      created 2011/12 by Hoop (netCDF2.3)
    description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
    platform:     Model
    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...

In [5]:
# Indexers must be given by keyword (or as a dict): a bare positional 'time'
# is not a dimension indexer and performs no selection. slice(None) keeps
# every time step, so the result still has the full (lat, lon, time) extent.
time1 = dset.sel(time=slice(None))

In [6]:
time1

<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144, time: 366)
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
  * time     (time) datetime64[ns] 2012-01-01 2012-01-02 2012-01-03 ...
Data variables:
    air      (time, lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 ...
Attributes:
    Conventions:  COARDS
    title:        mean daily NMC reanalysis (2012)
    history:      created 2011/12 by Hoop (netCDF2.3)
    description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
    platform:     Model
    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...

In [7]:
a = [time1]

In [8]:
a

[<xarray.Dataset>
 Dimensions:  (lat: 73, lon: 144, time: 366)
 Coordinates:
   * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
   * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
   * time     (time) datetime64[ns] 2012-01-01 2012-01-02 2012-01-03 ...
 Data variables:
     air      (time, lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 ...
 Attributes:
     Conventions:  COARDS
     title:        mean daily NMC reanalysis (2012)
     history:      created 2011/12 by Hoop (netCDF2.3)
     description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
     platform:     Model
     references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...]

# Get Times

In [10]:
times = dset.time

In [11]:
times

<xarray.DataArray 'time' (time: 366)>
array(['2012-01-01T00:00:00.000000000', '2012-01-02T00:00:00.000000000',
       '2012-01-03T00:00:00.000000000', ..., '2012-12-29T00:00:00.000000000',
       '2012-12-30T00:00:00.000000000', '2012-12-31T00:00:00.000000000'], dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 2012-01-01 2012-01-02 2012-01-03 ...
Attributes:
    long_name:      Time
    actual_range:   [ 17628096.  17636856.]
    delta_t:        0000-00-01 00:00:00
    standard_name:  time
    axis:           T
    avg_period:     0000-00-01 00:00:00

In [13]:
times.values[0]

numpy.datetime64('2012-01-01T00:00:00.000000000')

In [14]:
dset.sel(time=times.values[0])

<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144)
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
    time     datetime64[ns] 2012-01-01
Data variables:
    air      (lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 234.5 ...
Attributes:
    Conventions:  COARDS
    title:        mean daily NMC reanalysis (2012)
    history:      created 2011/12 by Hoop (netCDF2.3)
    description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
    platform:     Model
    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...

In [19]:
rows = times.values

In [21]:
rows.size

366

In [16]:
partitions = 20

In [22]:
# Floor division keeps `step` an integer on both Python 2 and Python 3; it is
# used as the stride of `sc.range` further down, which needs a whole number.
# 366 is not a multiple of 20, so that range yields 21 start offsets rather
# than exactly `partitions` of them.
step = rows.size // partitions

In [23]:
step

18

In [27]:
a = sc.range(0, 10, 2)

In [28]:
a.collect()

[0, 2, 4, 6, 8]

In [29]:
step

18

In [30]:
rows.size

366

In [31]:
rdd = sc.range(0, rows.size, step)

In [32]:
# Sort on the element itself (identity key) while asking for `partitions`
# output partitions; the offsets come out in the same ascending order.
rdd = rdd.sortBy(lambda offset: offset, numPartitions=partitions)

In [33]:
rdd.count()

21

In [34]:
rdd.collect()

[0,
 18,
 36,
 54,
 72,
 90,
 108,
 126,
 144,
 162,
 180,
 198,
 216,
 234,
 252,
 270,
 288,
 306,
 324,
 342,
 360]

In [35]:
times

<xarray.DataArray 'time' (time: 366)>
array(['2012-01-01T00:00:00.000000000', '2012-01-02T00:00:00.000000000',
       '2012-01-03T00:00:00.000000000', ..., '2012-12-29T00:00:00.000000000',
       '2012-12-30T00:00:00.000000000', '2012-12-31T00:00:00.000000000'], dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 2012-01-01 2012-01-02 2012-01-03 ...
Attributes:
    long_name:      Time
    actual_range:   [ 17628096.  17636856.]
    delta_t:        0000-00-01 00:00:00
    standard_name:  time
    axis:           T
    avg_period:     0000-00-01 00:00:00

In [37]:
times.values.size

366

In [50]:
def readonep(dset, timestep):
    """Return the part of ``dset`` whose ``time`` label equals ``timestep``.

    Parameters
    ----------
    dset
        Object exposing ``sel`` (an ``xarray.Dataset`` in this notebook).
    timestep
        Label forwarded unchanged as the ``time`` keyword of ``dset.sel``.

    Returns
    -------
    The value produced by ``dset.sel(time=timestep)``.
    """
    return dset.sel(time=timestep)

In [51]:
readonep(dset, times.values[0])

<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144)
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
    time     datetime64[ns] 2012-01-01
Data variables:
    air      (lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 234.5 ...
Attributes:
    Conventions:  COARDS
    title:        mean daily NMC reanalysis (2012)
    history:      created 2011/12 by Hoop (netCDF2.3)
    description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
    platform:     Model
    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...

In [52]:
rdd = sc.parallelize(times.values)

In [53]:
rdd.count()

366

In [54]:
rdd.first()

numpy.datetime64('2012-01-01T00:00:00.000000000')

In [55]:
# Turn each timestamp in `rdd` into the Dataset slice for that time step.
# NOTE(review): the lambda closes over the driver-side `dset`; presumably Spark
# ships it along with the tasks -- confirm this is acceptable for larger files.
rdd1 = rdd.map(lambda stamp: readonep(dset, stamp))

In [56]:
rdd1.count()

366

In [57]:
rdd1.first()

<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144)
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
    time     datetime64[ns] 2012-01-01
Data variables:
    air      (lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 234.5 ...
Attributes:
    Conventions:  COARDS
    title:        mean daily NMC reanalysis (2012)
    history:      created 2011/12 by Hoop (netCDF2.3)
    description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
    platform:     Model
    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...

In [58]:
rdd1.take(3)

[<xarray.Dataset>
 Dimensions:  (lat: 73, lon: 144)
 Coordinates:
   * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
   * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
     time     datetime64[ns] 2012-01-01
 Data variables:
     air      (lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 234.5 ...
 Attributes:
     Conventions:  COARDS
     title:        mean daily NMC reanalysis (2012)
     history:      created 2011/12 by Hoop (netCDF2.3)
     description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
     platform:     Model
     references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...,
 <xarray.Dataset>
 Dimensions:  (lat: 73, lon: 144)
 Coordinates:
   * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
   * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
     time     datetime64[ns] 2012-01-02
 Data variables:
     air      (lat,

In [59]:
a = rdd1.first()

In [69]:
# Reduce every per-timestep Dataset to its mean; each element of the result is
# a dimensionless Dataset holding a single `air` value.
mean = rdd1.map(lambda day: day.mean())

In [70]:
mean.count()

366

In [73]:
mean.collect()

[<xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.0, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 276.8, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 276.9, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 276.9, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.4, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.6, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.4, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.0, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.1, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.1, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 277.0, <xarray.Dataset>
 Dimensions:  ()
 Data variables:
     air      float64 276.7, <xarray.Dataset>
 Dimensions:  ()
 Data

In [75]:
a.air.mean()

<xarray.DataArray 'air' ()>
array(277.0142912613065)
Coordinates:
    time     datetime64[ns] 2012-01-01

In [80]:
dset.lat.values

array([ 90. ,  87.5,  85. ,  82.5,  80. ,  77.5,  75. ,  72.5,  70. ,
        67.5,  65. ,  62.5,  60. ,  57.5,  55. ,  52.5,  50. ,  47.5,
        45. ,  42.5,  40. ,  37.5,  35. ,  32.5,  30. ,  27.5,  25. ,
        22.5,  20. ,  17.5,  15. ,  12.5,  10. ,   7.5,   5. ,   2.5,
         0. ,  -2.5,  -5. ,  -7.5, -10. , -12.5, -15. , -17.5, -20. ,
       -22.5, -25. , -27.5, -30. , -32.5, -35. , -37.5, -40. , -42.5,
       -45. , -47.5, -50. , -52.5, -55. , -57.5, -60. , -62.5, -65. ,
       -67.5, -70. , -72.5, -75. , -77.5, -80. , -82.5, -85. , -87.5, -90. ], dtype=float32)

In [81]:
dset.lon.values

array([   0. ,    2.5,    5. ,    7.5,   10. ,   12.5,   15. ,   17.5,
         20. ,   22.5,   25. ,   27.5,   30. ,   32.5,   35. ,   37.5,
         40. ,   42.5,   45. ,   47.5,   50. ,   52.5,   55. ,   57.5,
         60. ,   62.5,   65. ,   67.5,   70. ,   72.5,   75. ,   77.5,
         80. ,   82.5,   85. ,   87.5,   90. ,   92.5,   95. ,   97.5,
        100. ,  102.5,  105. ,  107.5,  110. ,  112.5,  115. ,  117.5,
        120. ,  122.5,  125. ,  127.5,  130. ,  132.5,  135. ,  137.5,
        140. ,  142.5,  145. ,  147.5,  150. ,  152.5,  155. ,  157.5,
        160. ,  162.5,  165. ,  167.5,  170. ,  172.5,  175. ,  177.5,
        180. ,  182.5,  185. ,  187.5,  190. ,  192.5,  195. ,  197.5,
        200. ,  202.5,  205. ,  207.5,  210. ,  212.5,  215. ,  217.5,
        220. ,  222.5,  225. ,  227.5,  230. ,  232.5,  235. ,  237.5,
        240. ,  242.5,  245. ,  247.5,  250. ,  252.5,  255. ,  257.5,
        260. ,  262.5,  265. ,  267.5,  270. ,  272.5,  275. ,  277.5,
      

In [82]:
dset.sel(lat=0, lon=0)

<xarray.Dataset>
Dimensions:  (time: 366)
Coordinates:
    lat      float32 0.0
    lon      float32 0.0
  * time     (time) datetime64[ns] 2012-01-01 2012-01-02 2012-01-03 ...
Data variables:
    air      (time) float64 298.5 298.9 299.3 298.8 299.4 299.6 299.4 300.3 ...
Attributes:
    Conventions:  COARDS
    title:        mean daily NMC reanalysis (2012)
    history:      created 2011/12 by Hoop (netCDF2.3)
    description:  Data is from NMC initialized reanalysis\n(4x/day).  These a...
    platform:     Model
    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...

In [83]:
rdd1.getNumPartitions()

8