In [1]:
import time
import pandas as pd
from multiprocessing import Pool
import math
import os
import shutil
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from scipy.io import readsav
import glob
import statistics
from aetherpy.io import read_routines
from math import cos, radians, sin, sqrt
from scipy import spatial, signal
from glob import glob
from datetime import datetime
from datetime import timedelta
from struct import unpack
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from pylab import cm
import fnmatch

from aetherpy.io import read_routines as rr


import sys

from scipy.interpolate import LinearNDInterpolator, interp1d

from utility_programs.read_routines import SAMI
from utility_programs.filters import make_fits

import xarray as xr

import gc

import cartopy.crs as ccrs

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
sami_data_path = "/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/"

sami_og_vars = {
    'deneu.dat': 'edens', }

geo_grid_files = {
    'lat': 'glatu.dat', 'lon': 'glonu.dat', 'alt': 'zaltu.dat',
}

In [5]:
nz, nf, nlt, nt = SAMI.get_grid_elems_from_parammod(sami_data_path)
times = SAMI.make_times(nt, sami_data_path, datetime(2011, 5, 20))

In [6]:
grid = {}

for f in geo_grid_files:
    file = open(os.path.join(sami_data_path, geo_grid_files[f]), 'rb')
    raw = np.fromfile(file, dtype='float32')[1:-1].copy()
    file.close()

    grid[f] = raw.reshape(nlt, nf, nz).copy()
    
ds_in = xr.Dataset(coords=
    {
        "latitude": (['loc'], grid['lat'].flatten(), {'units':'degrees_north'}),
        "longitude": (["loc"], grid['lon'].flatten(), {'units':'degrees_east'}),
        "height":(["loc"], grid['alt'].flatten()*1000, {'units': "meters",
                                                  "standard_name": "height_above_reference_ellipsoid" ,
                "long_name": "Elevation relative to sea level"}),
                 
    },
                   
                       
)

In [7]:
ds_in = ds_in.where((ds_in.height < 2200000) & (ds_in.height > 200000), drop = True)

In [8]:
ds_in

In [12]:
(len(ds_in.latitude)*8)**2/1e9

23652.2704896

In [71]:
ds_in.height

In [79]:
ds_in.sizes['loc']**2/1024**3

1607.4304819107056

In [15]:
ds_in.nbytes**2/1024**3

49562.76025772095

In [72]:
ds_in.to_netcdf(sami_data_path+'in.nc')

In [80]:
ds_out = xr.Dataset(
    {
        "latitude": (["lat"], np.arange(-80, 80, 2.0), {'units':'degrees_north'}),
        "longitude": (["lon"], np.arange(0, 360, 5), {'units':'degrees_east'}),
        "height":(["elevation"], np.arange(200,2000, 100)*1000, {
            'units': "meters",
            "standard_name": "height_above_reference_ellipsoid" ,
            "long_name": "Elevation relative to sea level"}),
    }
)

In [81]:
ds_out

In [82]:
ds_out.to_netcdf(sami_data_path+'out.nc')

In [53]:
ds_in.sizes['loc'] **2 / 1024 **3

2025.0

In [None]:
# 36 cores, 256 GB mem / node
# -> 1.5-2 TiB mem? -> 3-4 nodes. Use 4 to be safe

In [25]:
4*36 # cores

144

In [2]:
gitmdir = "/glade/u/home/abukowski/scratch/GITM-simstorm-run1/run2/UA/data"
files = np.sort(glob(os.path.join(gitmdir,'3DALL*')))

So currently GITM files are read (from fortran, formatted, binary files) into a python dict.

Is it faster to go straight to xarray from the binary, or to read the dict and then conver that?


In [3]:
# EX FILE:
%time f = rr.read_gitm_file(files[0])

f.keys()

CPU times: user 2.75 s, sys: 641 ms, total: 3.39 s
Wall time: 3.46 s


dict_keys(['vars', 'version', 'nlons', 'nlats', 'nalts', 'time', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39])

In [4]:
# Takes ~3.5 seconds...

In [5]:
file_to_read=files[0]

In [6]:
def read_gitm_file(filename, file_vars=None):
    """Read list of variables from one GITM file.

    Parameters
    ----------
    filename : str
        GITM file to read
    file_vars : list or NoneType
        List of desired variable names to read or None to read all
        (default=None)

    Returns
    -------
    data : dict
        Dict with keys 'time', which contains a datetime object specifying the
        time of the file and zero-offset indices, corresponding to the
        variable names in `file_vars` that holds arrays of the specified data.
        Also contains version number, dimensions, and a list of the variable
        names.

    """

    data = {"vars": []}

    if not os.path.isfile(filename):
        raise IOError('input file does not exist')

    with open(filename, 'rb') as fin:
        # Determine the correct endian
        end_char = '>'
        raw_rec_len = fin.read(4)
        rec_len = (unpack(end_char + 'l', raw_rec_len))[0]
        if rec_len > 10000 or rec_len < 0:
            # Ridiculous record length implies wrong endian.
            end_char = '<'
            rec_len = (unpack(end_char + 'l', raw_rec_len))[0]

        # Read version; read fortran footer+data.
        data["version"] = unpack(end_char + 'd', fin.read(rec_len))[0]

        _, rec_len = unpack(end_char + '2l', fin.read(8))

        # Read grid size information.
        data["nlons"], data["nlats"], data["nalts"] = unpack(
            end_char + 'lll', fin.read(rec_len))
        _, rec_len = unpack(end_char + '2l', fin.read(8))

        # Read number of variables.
        num_vars = unpack(end_char + 'l', fin.read(rec_len))[0]
        _, rec_len = unpack(end_char + '2l', fin.read(8))

        if file_vars is None:
            file_vars = np.arange(0, num_vars, 1)

        # Collect variable names in a list
        for ivar in range(num_vars):
            vcode = unpack(end_char + '%is' % (rec_len),
                           fin.read(rec_len))[0]
            var = vcode.decode('utf-8').replace(" ", "")
            data['vars'].append(var)
            dummy, rec_lec = unpack(end_char + '2l', fin.read(8))

        # Extract time
        rec_time = np.array(unpack(end_char + 'lllllll', fin.read(28)))
        rec_time[-1] *= 1000  # convert from millisec to microsec
        data["time"] = datetime(*rec_time)

        # Header is this length:
        # Version + start/stop byte
        # nlons, nlats, nalts + start/stop byte
        # num_vars + start/stop byte
        # variable names + start/stop byte
        # time + start/stop byte

        iheader_length = 84 + num_vars * 48

        ntotal = data["nlons"] * data["nlats"] * data["nalts"]
        idata_length = ntotal * 8 + 8

        # Save the data for the desired variables
        for ivar in file_vars:
            fin.seek(iheader_length + ivar * idata_length)
            sdata = unpack(end_char + 'l', fin.read(4))[0]
            data[ivar] = np.array(
                unpack(end_char + '%id' % (ntotal), fin.read(sdata))).reshape(
                    (data["nlons"], data["nlats"], data["nalts"]), order="F")

    return data

In [7]:
%time data = read_gitm_file(files[0])

CPU times: user 2.71 s, sys: 634 ms, total: 3.34 s
Wall time: 3.36 s


In [8]:
# Also 3.5 seconds...

In [73]:

def read_gitm_bin_xarray(filename, 
                        add_time=True,
                        drop_ghost_cells=True,
                        cols=None):

    if not os.path.isfile(filename):
        raise IOError('input file does not exist')

    with open(filename, 'rb') as fin:
        # Determine the correct endian
        end_char = '>'
        raw_rec_len = fin.read(4)
        rec_len = (unpack(end_char + 'l', raw_rec_len))[0]
        if rec_len > 10000 or rec_len < 0:
            # Ridiculous record length implies wrong endian.
            end_char = '<'
            rec_len = (unpack(end_char + 'l', raw_rec_len))[0]



        # Read version; read fortran footer+data.
        version = unpack(end_char + 'd', fin.read(rec_len))[0]

        _, rec_len = unpack(end_char + '2l', fin.read(8))

        # Read grid size information.
        nlons, nlats, nalts = unpack(
            end_char + 'lll', fin.read(rec_len))
        _, rec_len = unpack(end_char + '2l', fin.read(8))

        # Read number of variables.
        num_vars = unpack(end_char + 'l', fin.read(rec_len))[0]
        _, rec_len = unpack(end_char + '2l', fin.read(8))

        file_vars = np.arange(0, num_vars, 1)

        varnames = []

        # Collect variable names in a list
        for ivar in range(num_vars):
            vcode = unpack(end_char + '%is' % (rec_len),
                           fin.read(rec_len))[0]
            var = vcode.decode('utf-8').replace(" ", "")
            var=var.replace('!N','').replace('!U','').replace('!D','')\
                .replace('[','').replace('[','').replace(']','')
            varnames.append(var)
            dummy, rec_lec = unpack(end_char + '2l', fin.read(8))

        # Extract time
        rec_time = np.array(unpack(end_char + 'lllllll', fin.read(28)))
        rec_time[-1] *= 1000  # convert from millisec to microsec
        time_here = datetime(*rec_time)

        # Header is this length:
        # Version + start/stop byte
        # nlons, nlats, nalts + start/stop byte
        # num_vars + start/stop byte
        # variable names + start/stop byte
        # time + start/stop byte

        iheader_length = 84 + num_vars * 48

        ntotal = nlons * nlats * nalts
        idata_length = ntotal * 8 + 8

        data_vars={}

        # Save the data for the desired variables
        dimnames=['lon','lat','alt']
        
        for ivar in file_vars:
            fin.seek(iheader_length + ivar * idata_length)
            sdata = unpack(end_char + 'l', fin.read(4))[0]

            if ivar == 0:
                lons =  np.rad2deg(np.unique(np.array(
                        unpack(end_char + '%id' % (ntotal), fin.read(sdata))).reshape(
                        (nlons, nlats, nalts), order="F")))

            elif ivar == 1:
                lats =  np.rad2deg(np.unique(np.array(
                        unpack(end_char + '%id' % (ntotal), fin.read(sdata))).reshape(
                        (nlons, nlats, nalts), order="F")))

            elif ivar == 2:
                alts =  np.unique(np.array(
                        unpack(end_char + '%id' % (ntotal), fin.read(sdata))).reshape(
                        (nlons, nlats, nalts), order="F"))/1000


            else:
                data_vars[varnames[ivar]] = dimnames,np.array(
                    unpack(end_char + '%id' % (ntotal), fin.read(sdata))).reshape(
                        (nlons, nlats, nalts), order="F")
                # break
    ds = xr.Dataset(coords={'time':[time_here],'lon':lons,'lat':lats,'alt':alts},
                    data_vars=data_vars,
                    attrs={'version':version,
                          'dropped-ghost-cells':str(drop_ghost_cells),
                          'with_time':str(add_time)})

    if drop_ghost_cells:
        if nalts > 1:
            ds = ds.drop_isel(lat=[0,1,-2,-1],lon=[0,1,-1,-2],alt=[0,1,-1,-2])
        else:
            ds = ds.drop_isel(lat=[0,1,-2,-1],lon=[0,1,-1,-2])
    if not add_time:
        ds = ds.drop_vars('time')

    if cols is not None:
        ds = ds.get(cols)
                
        
    return ds
    # fin.close()


In [74]:
def gitm_times_from_filelist(file_list, century_prefix='20'):
    gitm_dtimes = []
    for i in file_list:
        yy, MM, dd = i[-17:-15], i[-15:-13], i[-13:-11]
        hr, mm, sec = i[-10:-8], i[-8:-6], i[-6:-4]
        try:
            gitm_dtimes.append(
                datetime(
                    int(century_prefix + yy), int(MM), int(dd),
                    int(hr), int(mm), int(sec)))
        except ValueError:
            raise ValueError(
                "GITM file name does not match expected format,",
                "filename %s cannot be parsed" % i)
    return gitm_dtimes

In [75]:
def read_gitm_multiple_bins(file_list,
                            start_dtime=None,
                            end_dtime=None,
                            start_idx=0,
                            end_idx=-1,
                            drop_ghost_cells=False,
                            cols=None):
    
    # Check inputs! Cannot specify start time & idx:
    if start_dtime is not None and start_idx is not None:
        raise ValueError("Cannot specify both Start idx & dtime")
    if end_dtime is not None and end_idx is not None:
        raise ValueError("Cannot specify both End idx & dtime")

    file_list=file_list[start_idx:end_idx]
    
    if start_dtime is not None or end_dtime is not None:
        times = gitm_times_from_filelist(file_list)
    if start_dtime is not None:
        time_mask = np.where(times>=start_dtime)
        file_list = file_list[time_mask]
        times = times[time_mask]
    if end_dtime is not None:
        time_mask = np.where(times<=end_dtime)
        file_list = file_list[time_mask]
    
    ds=[]
    for file in file_list:
        ds.append(read_gitm_bin_xarray(file,
                                       drop_ghost_cells=drop_ghost_cells,
                                       cols=cols))
        
    ds = xr.concat(ds,'time')
    
    return ds

In [76]:
ds = read_gitm_multiple_bins(file_list=files,start_idx=69,end_idx=72, cols=['Rho','NO+'])

In [77]:
ds

In [79]:
# Still need a post-processing (rewrite as xarray) script & scipts to read those


## SICK

now do SAMI. Will be weird cux we have some post-processed files and some not-processed files

In [1]:
sami_path = "/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/"

In [10]:
data_files = glob(os.path.join(sami_path,'*.dat'))

In [11]:
data_files

['/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/gsryu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/hidpgu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/u1pu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/ypu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/zsu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/vsi1u.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/xphigu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/tecuB.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/phiu.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/denn5u.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/time.dat',
 '/glade/u/home/abukowski/scratch/GITM-simstorm-run1/sami-gitm-coupled/nuin7u.dat',
 '/glad

In [29]:
files2=['deneu.dat','deni1u.dat deni2u.dat deni3u.dat deni4u.dat deni5u.dat deni6u.dat deni7u.dat denn1u.dat denn2u.dat denn3u.dat denn4u.dat denn5u.dat denn6u.dat denn7u.dat teu.dat ti1u.dat ti2u.dat ti5u.dat vsi1u.dat vsi2u.dat u1pu.dat u3hu.dat',
       'u1u.dat u2u.dat baltu.dat blatu.dat blonu.dat zaltu.dat glatu.dat glonu.dat phiu.dat']
files=[]
for i in range(len(files2)):
    for j in files2[i].split(' '):
        files.append(j)

In [37]:
sami_vars = {'deneu.dat':'edens',
 'deni1u.dat',
 'deni2u.dat',
 'deni3u.dat',
 'deni4u.dat',
 'deni5u.dat',
 'deni6u.dat',
 'deni7u.dat',
 'denn1u.dat',
 'denn2u.dat',
 'denn3u.dat',
 'denn4u.dat',
 'denn5u.dat',
 'denn6u.dat',
 'denn7u.dat',
 'teu.dat',
 'ti1u.dat',
 'ti2u.dat',
 'ti5u.dat',
 'vsi1u.dat',
 'vsi2u.dat',
 'u1pu.dat',
 'u3hu.dat',
 'u1u.dat',
 'u2u.dat',
 'baltu.dat',
 'blatu.dat',
 'blonu.dat',
 'zaltu.dat',
 'glatu.dat',
 'glonu.dat',
 'phiu.dat'}


['deneu.dat',
 'deni1u.dat',
 'deni2u.dat',
 'deni3u.dat',
 'deni4u.dat',
 'deni5u.dat',
 'deni6u.dat',
 'deni7u.dat',
 'denn1u.dat',
 'denn2u.dat',
 'denn3u.dat',
 'denn4u.dat',
 'denn5u.dat',
 'denn6u.dat',
 'denn7u.dat',
 'teu.dat',
 'ti1u.dat',
 'ti2u.dat',
 'ti5u.dat',
 'vsi1u.dat',
 'vsi2u.dat',
 'u1pu.dat',
 'u3hu.dat',
 'u1u.dat',
 'u2u.dat',
 'baltu.dat',
 'blatu.dat',
 'blonu.dat',
 'zaltu.dat',
 'glatu.dat',
 'glonu.dat',
 'phiu.dat']

In [50]:
nz, nf, nlt, nt = SAMI.get_grid_elems_from_parammod(sami_path)

In [None]:
with open(sami_path + 'deneu.dat','rb') as fin:
    e = unpack(end_char + '%id' % (ntotal), fin.read(sdata)).reshape(
                        (nlons, nlats, nalts), order="F")

In [None]:
fnames = {}
for f in data_files:
    if f.split('/')[-1] in files:
        dirs.append(f)
    else:
        print(f.split('/')[-1])

gsryu.dat
hidpgu.dat
ypu.dat
zsu.dat
xphigu.dat
tecuB.dat
time.dat
nuin7u.dat
gsphizu.dat
loss2u.dat
gsthetaxu.dat
xpu.dat
dvec13.dat
ysu.dat
gsphiyu.dat
sigmahu.dat
dvec23.dat
dvec32.dat
nuin3u.dat
xsu.dat
loss3u.dat
dvec33.dat
sigmapu.dat
hipcpu.dat
nuin5u.dat
vnphiu.dat
vsi5u.dat
xrgu.dat
t2u.dat
u5u.dat
sigmahicu.dat
blat0p.dat
blatpu.dat
hihcu.dat
vhsnxu.dat
balt0p.dat
vpsnyu.dat
vnpu.dat
dvec21.dat
jpu.dat
hidphivu.dat
loss7u.dat
loss6u.dat
weimer_grid.dat
xthgu.dat
gsphixu.dat
nuin2u.dat
vsi3u.dat
loss5u.dat
dene0B.dat
rhsegv.dat
dvec12.dat
glon0B.dat
gsrzu.dat
vnqu.dat
blonpu.dat
sigmapicu.dat
hipcphiu.dat
u3u.dat
baltpu.dat
bdirszu.dat
hidpvu.dat
t3u.dat
ti3u.dat
gsthetazu.dat
loss1u.dat
nmf2uB.dat
dvec22.dat
vsi4u.dat
hidphigu.dat
nuin1u.dat
zalt0B.dat
gsthetayu.dat
gsrxu.dat
dvec11.dat
vsi6u.dat
dvec31.dat
vhsnzu.dat
bdirsxu.dat
loss4u.dat
blat0.dat
ti6u.dat
balt0.dat
hihcmu.dat
jphiu.dat
blon0.dat
ti4u.dat
vsi7u.dat
vpsnzu.dat
hmf2uB.dat
vhsnyu.dat
u2su.dat
hipcu.dat
nuin4u

In [None]:
dirs

['deneu.dat',
 'deni1u.dat',
 'deni2u.dat',
 'deni3u.dat',
 'deni4u.dat',
 'deni5u.dat',
 'deni6u.dat',
 'deni7u.dat',
 'denn1u.dat',
 'denn2u.dat',
 'denn3u.dat',
 'denn4u.dat',
 'denn5u.dat',
 'denn6u.dat',
 'denn7u.dat',
 'teu.dat',
 'ti1u.dat',
 'ti2u.dat',
 'ti5u.dat',
 'vsi1u.dat',
 'vsi2u.dat',
 'u1pu.dat',
 'u3hu.dat',
 'u1u.dat',
 'u2u.dat',
 'baltu.dat',
 'blatu.dat',
 'blonu.dat',
 'zaltu.dat',
 'glatu.dat',
 'glonu.dat',
 'phiu.dat']