In [1]:
import matplotlib.pyplot as plt
from glob import glob
import pandas as pd
import numpy as np
import matplotlib
import cartopy.crs as ccrs
import cartopy.feature as cf 
import xarray as xr
import cmocean
import cartopy
import cartopy.util
import tensorflow.keras
import sklearn.cluster
import scipy.cluster
import sklearn.decomposition
from minisom import MiniSom
from numba import jit
from tensorflow.keras.models import Sequential
from xarray.backends import file_manager as fm

In [2]:
# Pull all data files

# Single place to change where the pickled data lives. Every pattern below
# searches three wildcard directory levels beneath this root.
PICKLE_ROOT = "E:/FrontsProjectData/pickle_files"
PICKLE_GLOB = PICKLE_ROOT + "/*/*/*/"

# Glob patterns: "conus" files vs. files whose names carry lon/lat tokens.
frontobject_conus_path = PICKLE_GLOB + "FrontObjects*conus.pkl"
frontobject_window_path = PICKLE_GLOB + "FrontObjects*lon*lat*.pkl"
surfacedata_conus_path = PICKLE_GLOB + "SurfaceData*conus.pkl"
surfacedata_window_path = PICKLE_GLOB + "SurfaceData*lon*lat*.pkl"

# Sort so the file order is deterministic across runs/platforms
# (glob itself returns paths in arbitrary order).
frontobject_conus_files = sorted(glob(frontobject_conus_path))
frontobject_window_files = sorted(glob(frontobject_window_path))
surfacedata_conus_files = sorted(glob(surfacedata_conus_path))
surfacedata_window_files = sorted(glob(surfacedata_window_path))

# Report how many files matched each pattern; a count of 0 means the pattern
# (or PICKLE_ROOT) does not match anything on this machine.
print("FrontObject CONUS files: %d" % len(frontobject_conus_files))
print("FrontObject window files: %d" % len(frontobject_window_files))
print("SurfaceData CONUS files: %d" % len(surfacedata_conus_files))
print("SurfaceData window files: %d" % len(surfacedata_window_files))

FrontObject CONUS files: 5690
FrontObject window files: 256050
SurfaceData CONUS files: 5792
SurfaceData window files: 260640


In [3]:
# Unpickle every CONUS file and join the resulting objects along 'time'.
# NOTE(review): unpickling can execute arbitrary code, so these paths must
# only ever point at trusted, locally produced files.
frontobject_conus_dss = xr.concat(
    [pd.read_pickle(pickle_path) for pickle_path in frontobject_conus_files],
    dim='time',
)
surfacedata_conus_dss = xr.concat(
    [pd.read_pickle(pickle_path) for pickle_path in surfacedata_conus_files],
    dim='time',
)

# Show both combined datasets (dimensions, coordinates, data variables).
print(frontobject_conus_dss)
print(surfacedata_conus_dss)

<xarray.Dataset>
Dimensions:     (latitude: 101, longitude: 181, time: 5690)
Coordinates:
  * latitude    (latitude) float64 50.0 49.75 49.5 49.25 ... 25.5 25.25 25.0
  * longitude   (longitude) float64 238.0 238.2 238.5 ... 282.5 282.8 283.0
  * time        (time) datetime64[ns] 2008-01-01 ... 2009-12-31T21:00:00
Data variables:
    identifier  (time, latitude, longitude) float64 0.0 0.0 0.0 ... 0.0 0.0 0.0
<xarray.Dataset>
Dimensions:    (latitude: 101, longitude: 181, time: 5792)
Coordinates:
  * time       (time) datetime64[ns] 2008-01-01 ... 2009-12-31T21:00:00
  * longitude  (longitude) float32 238.0 238.2 238.5 238.8 ... 282.5 282.8 283.0
  * latitude   (latitude) float32 50.0 49.75 49.5 49.25 ... 25.5 25.25 25.0
Data variables:
    d2m        (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    sp         (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    t2m        (time, latitude, longitude) flo

In [4]:
# Build one dataset holding only the timestamps present in BOTH inputs.
# The two datasets do not have the same number of timesteps (see the printed
# 'time' sizes above), so each is restricted to the shared timestamps before
# the merge.

frontobject_time = frontobject_conus_dss.time.values
sfc_time = surfacedata_conus_dss.time.values

# Boolean mask over frontobject_time: True where the timestamp also occurs in
# sfc_time. One vectorized np.isin pass replaces a per-timestamp np.where scan.
in_both = np.isin(frontobject_time, sfc_time)

# Positions (into frontobject_time) of the shared timestamps, kept as a plain
# list of ints under the original name.
indices = np.flatnonzero(in_both).tolist()
common_times = frontobject_time[in_both]

# Select the shared timestamps from *both* datasets. xr.merge aligns with an
# outer join by default, so merging an unfiltered dataset would keep any of its
# timestamps that the other lacks and pad the other's variables with NaN.
conus_dss = xr.merge([surfacedata_conus_dss.sel(time=common_times),
                      frontobject_conus_dss.sel(time=common_times)])
print(conus_dss)

<xarray.Dataset>
Dimensions:     (latitude: 101, longitude: 181, time: 5690)
Coordinates:
  * time        (time) datetime64[ns] 2008-01-01 ... 2009-12-31T21:00:00
  * longitude   (longitude) float32 238.0 238.2 238.5 ... 282.5 282.8 283.0
  * latitude    (latitude) float32 50.0 49.75 49.5 49.25 ... 25.5 25.25 25.0
Data variables:
    d2m         (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    sp          (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    t2m         (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    theta_w     (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    u10         (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    v10         (time, latitude, longitude) float32 dask.array<chunksize=(1, 101, 181), meta=np.ndarray>
    identifier  (time, latitude, longi

In [5]:
# Clear the attrs on t2m, then show the array summary.
conus_dss.t2m.attrs = {}
print(conus_dss.t2m)

# KMeans.fit needs a 2-D (n_samples, n_features) array, but t2m is 3-D
# (time, latitude, longitude). Treat every timestep as one sample and flatten
# its latitude x longitude grid into the feature axis. The explicit transpose
# pins the axis order so the reshape does not depend on how the dataset was built.
# NOTE(review): .values materializes the dask-backed array in memory. The saved
# output of this cell shows a FileNotFoundError for an ERA5 NetCDF path, so the
# arrays presumably still read lazily from their source files; confirm those
# files are reachable from this machine before fitting.
t2m = conus_dss.t2m.transpose('time', 'latitude', 'longitude')
t2m_samples = t2m.values.reshape(t2m.shape[0], -1)

# Two clusters; fixed random_state keeps the centroid initialization reproducible.
kmeans_model = sklearn.cluster.KMeans(n_clusters=2, random_state=10).fit(t2m_samples)

<xarray.DataArray 't2m' (time: 5690, latitude: 101, longitude: 181)>
dask.array<getitem, shape=(5690, 101, 181), dtype=float32, chunksize=(1, 101, 181), chunktype=numpy.ndarray>
Coordinates:
  * time       (time) datetime64[ns] 2008-01-01 ... 2009-12-31T21:00:00
  * longitude  (longitude) float32 238.0 238.2 238.5 238.8 ... 282.5 282.8 283.0
  * latitude   (latitude) float32 50.0 49.75 49.5 49.25 ... 25.5 25.25 25.0


FileNotFoundError: [Errno 2] No such file or directory: b'/ourdisk/hpc/ai2es/era5/Surface/ERA5Global_2009_3hrly_2mT.nc'

AttributeError: 'DataArray' object has no attribute '_cache'