# Download derived data

This downloads the derived-data archive and extracts it into the existing `data` directory in the base package directory (`../data` relative to this notebook).

In [1]:
import tarfile
import urllib.request as urllib2

# Download the derived-data archive and unpack it into ../data.
#
# Both the HTTP response and the tar archive are used as context managers so
# the connection and the archive handle are closed even if extraction fails.
# The archive is read in stream mode ("r|gz") because an HTTP response cannot
# seek, and stream mode reads strictly front to back.
# Where the running Python supports extraction filters, the "data" filter is
# applied so archive members cannot be written outside the target directory.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

with urllib2.urlopen("https://svrimg.niu.edu/grl_data/grl_data_22.tar.gz") as tgz:
    with tarfile.open(fileobj=tgz, mode='r|gz') as data:
        data.extractall(path="../data", **extract_kwargs)

# Boxplot data

### These are the data used to generate the boxplots
### They are organized into three simulations

```
1. *_historical_*.csv (HIST)
2. *_future_4p5_*.csv (FUTR 4.5)
3. *_future_8p5_*.csv (FUTR 8.5)
```

### three dBZ thresholds

```
1. *_40dbz_*.csv (>= 40 dBZ)
2. *_50dbz_*.csv (>= 50 dBZ)
3. *_60dbz_*.csv (>= 60 dBZ)
```

### and four regions:

```
1. econus_*.csv (Eastern CONUS)
2. ama_*.csv (Amarillo)
3. mnp_*.csv (Minneapolis)
4. mph_*.csv (Memphis)
```


# Reading econus data for 40 dBZ / historical

### index column is the julian date
### other columns represent 'simulation years'

#### values are the mean count of 40 dBZ days in each region (max 1 for any given day)

In [2]:
import pandas as pd

# Load the Eastern CONUS / historical / >= 40 dBZ table. The first CSV column
# becomes the index and is parsed as dates.
df = pd.read_csv(
    "../data/boxplot/econus_historical_40dbz_days.csv",
    index_col=0,
    parse_dates=True,
)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,1990-1991,1991-1992,1992-1993,1993-1994,1994-1995,1995-1996,1996-1997,1997-1998,1998-1999,1999-2000,2000-2001,2001-2002,2002-2003,2003-2004,2004-2005
1992-01-01,0.151925,0.015609,0.140479,0.273673,0.054110,0.047867,0.166493,0.299688,0.020812,0.043704,0.026015,0.040583,0.342352,0.240375,0.007284
1992-01-02,0.156087,0.137357,0.012487,0.072841,0.031217,0.026015,0.124870,0.337149,0.015609,0.158169,0.044745,0.207076,0.109261,0.218522,0.035380
1992-01-03,0.105099,0.363163,0.026015,0.068678,0.024974,0.016649,0.095734,0.197711,0.048907,0.299688,0.062435,0.315297,0.065557,0.280957,0.178980
1992-01-04,0.002081,0.424558,0.059313,0.084287,0.020812,0.006243,0.130073,0.186264,0.010406,0.057232,0.084287,0.143600,0.008325,0.106139,0.053070
1992-01-05,0.021852,0.181061,0.157128,0.143600,0.037461,0.030177,0.080125,0.141519,0.002081,0.167534,0.039542,0.120708,0.044745,0.144641,0.065557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992-12-27,0.026015,0.065557,0.127992,0.038502,0.041623,0.045786,0.395421,0.152966,0.097815,0.171696,0.013528,0.015609,0.097815,0.080125,0.109261
1992-12-28,0.000000,0.033299,0.110302,0.116545,0.120708,0.061394,0.276795,0.019771,0.018730,0.156087,0.002081,0.020812,0.048907,0.029136,0.098855
1992-12-29,0.038502,0.014568,0.152966,0.111342,0.200832,0.078044,0.251821,0.116545,0.038502,0.235172,0.014568,0.059313,0.233091,0.024974,0.042664
1992-12-30,0.142560,0.010406,0.187305,0.112383,0.264308,0.152966,0.181061,0.160250,0.087409,0.517170,0.005203,0.048907,0.343392,0.098855,0.112383


# Cumulative data

### Exact same setup as the boxplot data, but each year holds a running count of days. The differences are:

### three dBZ thresholds

```
1. *_40_cumu_dbz_*.csv (>= 40 dBZ)
2. *_50_cumu_dbz_*.csv (>= 50 dBZ)
3. *_60_cumu_dbz_*.csv (>= 60 dBZ)
```

### and the value is the sum of mean regional grid days up until that particular day of the year

In [3]:
import pandas as pd

# Load the Eastern CONUS / historical / >= 40 dBZ cumulative table. The first
# CSV column becomes the index and is parsed as dates.
df = pd.read_csv(
    "../data/cumul/econus_historical_40_cumu_dbz_days.csv",
    index_col=0,
    parse_dates=True,
)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,1990-1991,1991-1992,1992-1993,1993-1994,1994-1995,1995-1996,1996-1997,1997-1998,1998-1999,1999-2000,2000-2001,2001-2002,2002-2003,2003-2004,2004-2005,mean,median,p25,p75
1992-01-01,0.151925,0.015609,0.140479,0.273673,0.054110,0.047867,0.166493,0.299688,0.020812,0.043704,0.026015,0.040583,0.342352,0.240375,0.007284,0.124731,0.054110,0.033299,0.203434
1992-01-02,0.308012,0.152966,0.152966,0.346514,0.085328,0.073881,0.291363,0.636837,0.036420,0.201873,0.070760,0.247659,0.451613,0.458897,0.042664,0.237183,0.201873,0.079605,0.327263
1992-01-03,0.413111,0.516129,0.178980,0.415193,0.110302,0.090531,0.387097,0.834547,0.085328,0.501561,0.133195,0.562955,0.517170,0.739854,0.221644,0.380506,0.413111,0.156087,0.516649
1992-01-04,0.415193,0.940687,0.238293,0.499480,0.131113,0.096774,0.517170,1.020812,0.095734,0.558793,0.217482,0.706556,0.525494,0.845994,0.274714,0.472286,0.499480,0.227888,0.632674
1992-01-05,0.437045,1.121748,0.395421,0.643080,0.168574,0.126951,0.597294,1.162331,0.097815,0.726327,0.257024,0.827263,0.570239,0.990635,0.340271,0.564135,0.570239,0.298647,0.776795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992-12-27,94.283039,99.340271,102.962539,96.881374,99.693028,105.947971,102.700312,100.857440,99.411030,105.077003,96.142560,98.802289,98.115505,100.800208,94.002081,99.667777,99.411030,97.498439,101.778876
1992-12-28,94.283039,99.373569,103.072841,96.997919,99.813736,106.009365,102.977107,100.877211,99.429761,105.233091,96.144641,98.823101,98.164412,100.829344,94.100937,99.742005,99.429761,97.581165,101.927159
1992-12-29,94.321540,99.388137,103.225806,97.109261,100.014568,106.087409,103.228928,100.993757,99.468262,105.468262,96.159209,98.882414,98.397503,100.854318,94.143600,99.849532,99.468262,97.753382,102.109781
1992-12-30,94.464100,99.398543,103.413111,97.221644,100.278876,106.240375,103.409990,101.154006,99.555671,105.985432,96.164412,98.931322,98.740895,100.953174,94.255983,100.011169,99.555671,97.981270,102.281998


# Grid days

### These are the netCDF files representing grid day counts for each season in each simulation year

### They are organized into three simulations

```
1. HIST_*.nc (HIST)
2. FUTR45_*.nc (FUTR 4.5)
3. FUTR85_*.nc (FUTR 8.5)
```

### three dBZ thresholds (which are saved as variables in the netCDF file)

```
1. '40_dbz_count' (>= 40 dBZ)
2. '50_dbz_count' (>= 50 dBZ)
3. '60_dbz_count' (>= 60 dBZ)
```

### the time dimension represents the start of the valid season

In [4]:
import xarray as xr

# Open the grid-day counts for a single file: HIST simulation, 2000, JJA.
hist_jja_path = "../data/days/HIST_2000_JJA_grid_days.nc"
ds = xr.open_dataset(hist_jja_path)

# Bare last expression so the notebook renders the dataset summary.
ds

# Often we will want to read in several files associated with each simulation:

#### Example: read in all HIST data using dask

In [5]:
# Open every HIST grid-day file matched by the glob as one dataset,
# combined by their coordinates.
ds = xr.open_mfdataset(
    "../data/days/HIST_*_*_grid_days.nc",
    combine='by_coords',
)

# Bare last expression so the notebook renders the dataset summary.
ds

Unnamed: 0,Array,Chunk
Bytes,1.03 MiB,17.52 kiB
Shape,"(60, 38, 59)","(1, 38, 59)"
Count,180 Tasks,60 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.03 MiB 17.52 kiB Shape (60, 38, 59) (1, 38, 59) Count 180 Tasks 60 Chunks Type float64 numpy.ndarray",59  38  60,

Unnamed: 0,Array,Chunk
Bytes,1.03 MiB,17.52 kiB
Shape,"(60, 38, 59)","(1, 38, 59)"
Count,180 Tasks,60 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.03 MiB,17.52 kiB
Shape,"(60, 38, 59)","(1, 38, 59)"
Count,180 Tasks,60 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.03 MiB 17.52 kiB Shape (60, 38, 59) (1, 38, 59) Count 180 Tasks 60 Chunks Type float64 numpy.ndarray",59  38  60,

Unnamed: 0,Array,Chunk
Bytes,1.03 MiB,17.52 kiB
Shape,"(60, 38, 59)","(1, 38, 59)"
Count,180 Tasks,60 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.03 MiB,17.52 kiB
Shape,"(60, 38, 59)","(1, 38, 59)"
Count,180 Tasks,60 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.03 MiB 17.52 kiB Shape (60, 38, 59) (1, 38, 59) Count 180 Tasks 60 Chunks Type float64 numpy.ndarray",59  38  60,

Unnamed: 0,Array,Chunk
Bytes,1.03 MiB,17.52 kiB
Shape,"(60, 38, 59)","(1, 38, 59)"
Count,180 Tasks,60 Chunks
Type,float64,numpy.ndarray


# We can then calculate seasonal statistics:

In [6]:
# Group the time steps by season, then average each group over time.
season_groups = ds.groupby('time.season')
dseason = season_groups.mean('time')

# Bare last expression so the notebook renders the seasonal means.
dseason

Unnamed: 0,Array,Chunk
Bytes,70.06 kiB,17.52 kiB
Shape,"(4, 38, 59)","(1, 38, 59)"
Count,328 Tasks,4 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 70.06 kiB 17.52 kiB Shape (4, 38, 59) (1, 38, 59) Count 328 Tasks 4 Chunks Type float64 numpy.ndarray",59  38  4,

Unnamed: 0,Array,Chunk
Bytes,70.06 kiB,17.52 kiB
Shape,"(4, 38, 59)","(1, 38, 59)"
Count,328 Tasks,4 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,70.06 kiB,17.52 kiB
Shape,"(4, 38, 59)","(1, 38, 59)"
Count,328 Tasks,4 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 70.06 kiB 17.52 kiB Shape (4, 38, 59) (1, 38, 59) Count 328 Tasks 4 Chunks Type float64 numpy.ndarray",59  38  4,

Unnamed: 0,Array,Chunk
Bytes,70.06 kiB,17.52 kiB
Shape,"(4, 38, 59)","(1, 38, 59)"
Count,328 Tasks,4 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,70.06 kiB,17.52 kiB
Shape,"(4, 38, 59)","(1, 38, 59)"
Count,328 Tasks,4 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 70.06 kiB 17.52 kiB Shape (4, 38, 59) (1, 38, 59) Count 328 Tasks 4 Chunks Type float64 numpy.ndarray",59  38  4,

Unnamed: 0,Array,Chunk
Bytes,70.06 kiB,17.52 kiB
Shape,"(4, 38, 59)","(1, 38, 59)"
Count,328 Tasks,4 Chunks
Type,float64,numpy.ndarray


# Geographic data

### These data define the grids used for CAPE/CIN

In [7]:
import xarray as xr

# Open the geographic file for the simulation grid.
geog_path = "../data/geog/geog_sim.nc"
ds = xr.open_dataset(geog_path)

# Bare last expression so the notebook renders the dataset summary.
ds

# Seasonal mean MU CAPE / MU CIN

### They are organized into three simulations

```
1. HIST_*.nc (HIST)
2. FUTR45_*.nc (FUTR 4.5)
3. FUTR85_*.nc (FUTR 8.5)
```

### Two variables are stored in each netCDF file

```
1. 'AFWA_CAPE_MU' (MU CAPE)
2. 'AFWA_CIN_MU' (MU CIN)
```

### the season dimension represents data for each season

In [8]:
# Open the HIST seasonal-mean MU CAPE / MU CIN file.
thermo_path = "../data/thermo/HIST_seasonal_MUCAPE_MUCIN.nc"
ds = xr.open_dataset(thermo_path)

# Bare last expression so the notebook renders the dataset summary.
ds