In [1]:
%autosave 120

Autosaving every 120 seconds


In [2]:
# Hide warnings if there are any
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark import SparkConf
import read
import numpy as np
from __future__ import print_function
import h5py
from glob import glob
import pandas as pd

%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt
import os
import sys
import cf_units as cf

In [3]:
# Build the SparkSession for this notebook, or reuse one that is already active.
spark = SparkSession.builder.appName("storm-tracker-preprocess").getOrCreate()

In [4]:
# SparkContext behind the session; used below by sc.addPyFile and read.h5read.
sc = spark.sparkContext

In [5]:
# Partition count handed to read.h5read when the PSL variable is loaded as an RDD.
partitions = 12

In [6]:
# Register h5spark's read.py with the SparkContext so it is shipped with the tasks
# this context runs from now on.
sc.addPyFile("/glade/p/work/abanihi/h5spark/src/main/python/h5spark/read.py")

# Data

In [7]:
filepath = '/glade/scratch/abanihi/data/stormtracker/b.e11.B20TRC5CNBDRD.f09_g16.031.cam.h2.PSL.1990010100Z-2005123118Z.nc'

!ncdump -h /glade/scratch/abanihi/data/stormtracker/b.e11.B20TRC5CNBDRD.f09_g16.031.cam.h2.PSL.1990010100Z-2005123118Z.nc

In [8]:
from netCDF4 import Dataset

In [9]:
# Open the NetCDF file with netCDF4; the handle is closed explicitly in a later cell.
dset = Dataset(filepath)

In [10]:
from __future__ import print_function
from netCDF4 import Dataset



def ncdump(dset, verbose=True):
    """
    Summarise a NetCDF dataset in the spirit of the ``ncdump`` command-line utility.

    Adapted from http://schubert.atmos.colostate.edu/~cslocum/netcdf_example.html

    Parameters
    ----------
    dset : netCDF4.Dataset
        The dataset to inspect.
    verbose : bool
        When true, print the global attributes, every dimension (with its size) and
        every variable that is not named after a dimension, each followed by its
        dtype and attributes. When false, nothing is printed.

    Returns
    -------
    nc_attrs : list
        Names of the dataset's global attributes.
    nc_dims : list
        Names of the dataset's dimensions.
    nc_vars : list
        Names of the dataset's variables.
    """

    def print_ncattr(key):
        """
        Print the dtype and the attributes of the variable called ``key``.

        If a ``KeyError`` is raised (for instance because no variable has that
        name), a warning line is printed instead.
        """
        try:
            variable = dset.variables[key]
            print("\t\ttype: {}".format(repr(variable.dtype)))
            for attr_name in variable.ncattrs():
                attr_value = variable.getncattr(attr_name)
                print("\t\t{} {}".format(attr_name, repr(attr_value)))
        except KeyError:
            print("\t\tWARNING: {} does not contain variable attributes".format(key))

    # --- Global attributes -------------------------------------------------
    nc_attrs = dset.ncattrs()
    if verbose:
        print("NetCDF Global Attributes:")
        for attr_name in nc_attrs:
            attr_value = dset.getncattr(attr_name)
            print("\t{}: {}".format(attr_name, repr(attr_value)))

    # --- Dimensions --------------------------------------------------------
    nc_dims = list(dset.dimensions)
    if verbose:
        print("NetCDF dimension information:")
        for dim_name in nc_dims:
            print("\tName: {}".format(dim_name))
            dim_size = len(dset.dimensions[dim_name])
            print("\t\tsize: {}".format(dim_size))
            print_ncattr(dim_name)

    # --- Variables ---------------------------------------------------------
    nc_vars = list(dset.variables)
    if verbose:
        print("NetCDF variable information:")
        for var_name in nc_vars:
            if var_name in nc_dims:
                # Already reported together with the dimension of the same name.
                continue
            print("\tName: {}".format(var_name))
            variable = dset.variables[var_name]
            print("\t\tdimensions: {}".format(variable.dimensions))
            print("\t\tsize: {}".format(variable.size))
            print_ncattr(var_name)

    return nc_attrs, nc_dims, nc_vars


In [11]:
# Print the dataset summary (verbose by default) and keep the three name lists:
# global attributes, dimensions and variables.
nc_attrs, nc_dims, nc_vars = ncdump(dset)

NetCDF Global Attributes:
	Conventions: u'CF-1.0'
	source: u'CAM'
	case: u'b.e11.B20TRC5CNBDRD.f09_g16.031'
	title: u'UNSET'
	logname: u'mickelso'
	host: u'ys1325'
	Version: u'$Name$'
	revision_Id: u'$Id$'
	initial_file: u'/glade/p/cesmdata/cseg/inputdata/atm/cam/inic/fv/cami-mam3_0000-01-01_0.9x1.25_L30_c100618.nc'
	topography_file: u'/glade/p/cesmdata/cseg/inputdata/atm/cam/topo/USGS-gtopo30_0.9x1.25_remap_c051027.nc'
	important_note: u"This data is part of the project 'Blind Evaluation of Lossy Data-Compression in LENS'.  Please exercise caution before using this data for other purposes."
NetCDF dimension information:
	Name: lat
		size: 192
		type: dtype('float64')
		long_name u'latitude'
		units u'degrees_north'
	Name: lon
		size: 288
		type: dtype('float64')
		long_name u'longitude'
		units u'degrees_east'
	Name: slat
		size: 191
		type: dtype('float64')
		long_name u'staggered latitude'
		units u'degrees_north'
	Name: slon
		size: 288
		type: dtype('float64')
		long_name u'stagge

In [12]:
# Read the coordinate variables and fields out of the open dataset with `[:]`.
lats = dset.variables['lat'][:]
lons = dset.variables['lon'][:]
# Bound to `times`, not `time`: a later cell runs `from time import time`, which
# would silently replace an array stored under the name `time`.
times = dset.variables['time'][:]
slats = dset.variables['slat'][:]
slons = dset.variables['slon'][:]
P0 = dset.variables['P0'][:]
# NOTE(review): `[:]` reads the entire PSL variable into the driver's memory.
PSL = dset.variables['PSL'][:]

In [13]:
# Release the file handle now that the needed variables have been read out with `[:]`.
dset.close()

In [14]:
# Global attribute names returned by ncdump.
nc_attrs

[u'Conventions',
 u'source',
 u'case',
 u'title',
 u'logname',
 u'host',
 u'Version',
 u'revision_Id',
 u'initial_file',
 u'topography_file',
 u'important_note']

In [15]:
# Dimension names returned by ncdump.
nc_dims

[u'lat', u'lon', u'slat', u'slon', u'time', u'nbnd', u'chars', u'lev', u'ilev']

In [16]:
# Shape of the in-memory PSL array; its last two axes match the lat (192) and
# lon (288) dimension sizes printed by ncdump.
PSL.shape

(23360, 192, 288)

In [17]:
from time import time  # wall-clock timer used by the timing cell further down
# Load the '/PSL' dataset of the file as an RDD through h5spark's reader, then pair
# every element with its position via zipWithIndex, giving (element, index) tuples.
# NOTE(review): the exact meaning of mode='single' is defined in h5spark's read.py -- confirm there.
psl_rdd = read.h5read(sc, (filepath, '/PSL'), mode='single', partitions=partitions).zipWithIndex()

In [18]:
# Mark the RDD to be kept in memory once computed; caching is lazy, so nothing is
# read until an action runs.
psl_rdd.cache()

PythonRDD[8] at RDD at PythonRDD.scala:48

In [19]:
# Time an action on the RDD: count() forces the read to execute (and populates
# the cache requested earlier), so the elapsed time covers loading the data.
t0 = time()
total_size = psl_rdd.count()
tt = time() - t0
print("The total size is {}".format(total_size))
print("Count done in {} seconds".format(round(tt,3)))

The total size is 23360
Count done in 16.263 seconds


In [20]:
# Inspect the first record: an (array, index) tuple produced by zipWithIndex.
print(psl_rdd.first())

(array([[  99190.,   99190.,   99190., ...,   99190.,   99190.,   99190.],
       [  99256.,   99254.,   99252., ...,   99260.,   99260.,   99258.],
       [  99264.,   99264.,   99262., ...,   99266.,   99266.,   99264.],
       ..., 
       [ 101326.,  101332.,  101336., ...,  101314.,  101318.,  101322.],
       [ 101556.,  101560.,  101564., ...,  101548.,  101552.,  101554.],
       [ 101854.,  101854.,  101854., ...,  101854.,  101854.,  101854.]], dtype=float32), 0)
