# Development Notebook
Constantly changing

In [None]:
import xarray as xr
import pandas as pd
import os
import pyproj
import numpy as np
import xesmf as xe
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import sys
sys.path.append('..')

import noaa_csl_funcs as ncf
from regrid_data import RegridInputs
%load_ext autoreload
%autoreload 2

# Post Regrid Data Handling

Goal is to have a streamlined way to access the files needed given a set of parameters. Envisioning something where a user could enter:
* Sectors of interest
* Timerange of interest
* Spatial bounding box
* Gas species of interest    

The user would then get back either an xarray object with dask parallelization, or a new nc file (or set of nc files) that can be loaded later.


In [4]:
class Regridded_CSL_Handler:
    '''Builds and lists file paths inside a regridded NOAA CSL data directory.

    Paths are built as <regridded_path>/<sector>/<year string>/<month string>/<day_type>,
    where the year and month strings come from noaa_csl_funcs helpers.
    '''

    def __init__(self,regridded_path,bau_or_covid='COVID'):
        '''
        Args:
        regridded_path (str) : path to the top level of the regridded data storage
        bau_or_covid (str, optional) : passed through to ncf.yr_to_yrstr when building year strings
        '''
        self.regridded_path = regridded_path
        self.sectors = self.get_sectors()
        self.bau_or_covid = bau_or_covid

    def get_sectors(self):
        '''Lists the sectors in the regridded data storage path

        Returns:
        sectors (dict) : {'area':[...],'point':[...]} with each visible entry of the
                         regridded path filed under the type its name contains

        Raises:
        ValueError : if an entry name contains neither 'area' nor 'point'
        '''

        sectors = {'area':[],'point':[]}
        for sector in ncf.listdir_visible(self.regridded_path):
            if 'area' in sector:
                sectors['area'].append(sector)
            elif 'point' in sector:
                sectors['point'].append(sector)
            else:
                raise ValueError(f"Unexpected sector type {sector}, not point or area.")
        return sectors

    def get_sector_subset_list(self,sector_subset):
        '''Resolves a sector selection into a list of sector names

        Args:
        sector_subset (str or list) : 'all' for every known sector, a sector type key
                                      ('area' or 'point'), the name of a single known
                                      sector, or an explicit list of sector names

        Returns:
        sector_subset_list (list) : sector names. Lists passed in are returned unchanged.

        Raises:
        KeyError : if a string is given that is not 'all', a sector type or a known sector
        '''

        if not isinstance(sector_subset,str):
            return sector_subset
        if sector_subset == 'all':
            return [sector for group in self.sectors.values() for sector in group]
        if sector_subset in self.sectors:
            # copy so callers cannot mutate the handler's own sector lists
            return list(self.sectors[sector_subset])
        if any(sector_subset in group for group in self.sectors.values()):
            # a single sector name given as a plain string
            return [sector_subset]
        raise KeyError(f"Unknown sector subset {sector_subset!r}: expected 'all', one of {list(self.sectors)}, or a known sector name")

    @staticmethod
    def _months_in_range(dt1,dt2):
        '''Returns [(year, month), ...] for every calendar month from dt1's month through dt2's month'''

        # Month periods ignore the day and time of day, so a start such as 2019-01-15
        # still yields January (date_range with freq='MS' would begin in February).
        return [(period.year,period.month) for period in pd.period_range(dt1,dt2,freq = 'M')]

    def get_days_in_range(self,dt1,dt2,day_types,sector_subset = 'all',add_path=True):
        '''Gets all filepaths to the day_type level that are within a datetime range
        
        Args:
        dt1 (datetime.date) : a date, datetime, etc to start the range (will only use year and month)
        dt2 (datetime.date) : a date, datetime, etc to end the range (will only use year and month)
        day_types (list) : list of day types to include in the list
        sector_subset (str or list, optional) : sectors to include, see get_sector_subset_list. Default 'all'
        add_path (bool, optional) : if true (default) it will add the regridded path to each element

        Returns:
        days_in_range (list) : list of paths to the day_type directories that are within the date range
                               for the requested sectors and day_types, ordered by month, then sector,
                               then day type
        '''

        sector_subset_list = self.get_sector_subset_list(sector_subset)
        days_in_range = []
        for year,month in self._months_in_range(dt1,dt2):
            month_str = ncf.month_int_to_str(month)
            for sector in sector_subset_list:
                year_str = ncf.yr_to_yrstr(sector,year,self.bau_or_covid)
                for day_type in day_types:
                    day_path = f'{sector}/{year_str}/{month_str}/{day_type}'
                    if add_path:
                        days_in_range.append(os.path.join(self.regridded_path,day_path))
                    else:
                        days_in_range.append(day_path)
        return days_in_range
    
    def get_files_in_days(self,days_paths):
        '''Lists the visible files inside each of the given day_type directories

        Args:
        days_paths (list) : directory paths, such as those returned by get_days_in_range

        Returns:
        files (list) : entries of every directory as returned by ncf.listdir_visible(..., add_path=True)
        '''

        files = []
        for day_path in days_paths:
            files.extend(ncf.listdir_visible(day_path,add_path=True))
        return files
    
# Top level of the regridded data storage
regridded_path = '/uufs/chpc.utah.edu/common/home/lin-group9/agm/NOAA_CSL_Data/regridded'
RCH = Regridded_CSL_Handler(regridded_path=regridded_path)

# Single month for now; set dt2 to something like '2021-05' for a longer range
dt1 = dt2 = '2019-01'
days_paths = RCH.get_days_in_range(
    dt1,
    dt2,
    day_types=['weekdy'],
    sector_subset=['area_offroad','area_onroad_gasoline'],
)
files = RCH.get_files_in_days(days_paths=days_paths)
files
#ds = xr.open_dataset(files[0])

['/uufs/chpc.utah.edu/common/home/lin-group9/agm/NOAA_CSL_Data/regridded/area_offroad/2019/Month01/weekdy/area_offroad_regridded.nc',
 '/uufs/chpc.utah.edu/common/home/lin-group9/agm/NOAA_CSL_Data/regridded/area_onroad_gasoline/2019/Month01/weekdy/area_onroad_gasoline_regridded.nc']

In [7]:
#Takes forever, need to figure out dask loading
first_file, second_file = files[0], files[1]
ds1 = xr.open_dataset(first_file)
ds2 = xr.open_dataset(second_file)

def preprocess(ds):
    '''Labels a dataset with its sector so several sectors can be combined

    Args:
    ds : object with an attrs mapping containing 'sector_id' (presumably an
         xarray Dataset, as returned by xr.open_dataset)

    Returns:
    the result of assigning attrs['sector_id'] as a 'sector' coordinate and then
    expanding along a 'sector' dimension
    '''
    sector_id = ds.attrs['sector_id']
    return ds.assign_coords(sector = sector_id).expand_dims(dim='sector')

# Tag each dataset with its sector before combining
ds1_sec, ds2_sec = preprocess(ds1), preprocess(ds2)

#xr.open_mfdataset(files,concat_dim = 'sector',combine='nested',preprocess=preprocess)
co2_by_sector = [ds1_sec['CO2'], ds2_sec['CO2']]
ds_comb = xr.combine_by_coords(co2_by_sector, combine_attrs='override')

In [None]:
# Draw one hour of the NOX field on a lon/lat map with coastlines, borders and states.
proj = ccrs.PlateCarree()
fig = plt.figure(figsize=(10,5))
ax = plt.axes(projection = proj)
#ax.set_extent([map_extent['lon_low'],map_extent['lon_high'],map_extent['lat_low'],map_extent['lat_high']],crs=proj)
# NOTE(review): `ds` is only assigned in a commented-out line of an earlier cell
# (the loaded datasets are named ds1/ds2/ds_comb) -- confirm which dataset is
# meant here, otherwise this raises NameError on a fresh kernel.
ds['NOX'].sel({'utc_hour':1}).plot.pcolormesh('lon','lat',ax = ax,cmap = 'viridis')
ax.coastlines()
ax.add_feature(cfeature.BORDERS)
ax.add_feature(cfeature.STATES)
plt.show()

## Unit conversion

Ongoing work about unit conversion. The official NOAA CSL documentation has some information, but there are some assumptions to be made about molecular weights (SAPRC versioning). This is an attempt to create a package that will:
* Choose and standardize the speciation values
* Allow for easy unit conversion, particularly chemical g/mol type conversions
* Correctly change attributes, labels, etc so things don't get wonky   

Ongoing development with noaa_csl_funcs.Unit_Converter

In [11]:
class CSLSpeciesDetails:
    '''Loads the NOAA CSL species details csv and builds molecular weight lookups from it'''

    # Columns checked, in this order, for a usable molecular weight
    _MW_SOURCE_PRIORITY = ('Known_MW','SAPRC99_MW','SAPRC22_MW')

    def __init__(self,csv_fullpath):
        '''
        Args:
        csv_fullpath (str) : full path to the species details csv
        '''
        self.csv_fullpath = csv_fullpath
        self.df = self.load_details_csv()

    def load_details_csv(self,csv_fullpath = None,header = 17):
        '''Reads the species details csv into a dataframe

        Args:
        csv_fullpath (str, optional) : csv to read. Defaults to the path given at construction
        header (int, optional) : passed to pd.read_csv as the header row. Default 17, the
                                 value this loader has always used

        Returns:
        species_details_df (pd.DataFrame) : the loaded csv
        '''
        if csv_fullpath is None:
            csv_fullpath = self.csv_fullpath
        
        species_details_df = pd.read_csv(csv_fullpath,header = header)
        return species_details_df
    
    def make_mw_dict(self,saprc_version = 'either'):
        '''Builds a per-species molecular weight dictionary from the details dataframe

        For each row the first non-missing value among Known_MW, SAPRC99_MW and SAPRC22_MW
        (in that order) is used, and the column it came from is recorded.

        Args:
        saprc_version (str, optional) : only 'either' is implemented

        Returns:
        mw_dict (dict) : {NC_name : {'mw(g/mol)': value, 'mw_source': column name}}. Both
                         entries are np.nan when no molecular weight column has a value.

        Raises:
        ValueError : if saprc_version is anything other than 'either'
        '''
        if saprc_version != 'either':
            raise ValueError('Havent made anything other that "either"')  #could do SAPRC22 only, 99 only, mixed etc TODO

        mw_dict = {}
        for _,row in self.df.iterrows():
            # fallback used when none of the candidate columns has a value
            entry = {'mw(g/mol)':np.nan,'mw_source':np.nan}
            for source in self._MW_SOURCE_PRIORITY:
                if pd.notna(row[source]):
                    entry = {'mw(g/mol)':row[source],'mw_source':source}
                    break
            mw_dict[row['NC_name']] = entry
        return mw_dict
    
# Species details table used for the molecular weight lookup
species_csv_path = '/uufs/chpc.utah.edu/common/home/u0890904/NOAA_CSL/noaa_csl/NOAA_CSL_species.csv'
csd = CSLSpeciesDetails(csv_fullpath=species_csv_path)
csl_mw_dict = csd.make_mw_dict()
csl_mw_dict


{'CO': {'mw(g/mol)': 28.01, 'mw_source': 'Known_MW'},
 'CO2': {'mw(g/mol)': 44.01, 'mw_source': 'Known_MW'},
 'HC01': {'mw(g/mol)': 16.04, 'mw_source': 'Known_MW'},
 'HC02': {'mw(g/mol)': 30.07, 'mw_source': 'SAPRC99_MW'},
 'HC03': {'mw(g/mol)': 36.73, 'mw_source': 'SAPRC99_MW'},
 'HC04': {'mw(g/mol)': 58.61, 'mw_source': 'SAPRC99_MW'},
 'HC05': {'mw(g/mol)': 77.6, 'mw_source': 'SAPRC99_MW'},
 'HC06': {'mw(g/mol)': 118.89, 'mw_source': 'SAPRC99_MW'},
 'HC07': {'mw(g/mol)': 28.05, 'mw_source': 'SAPRC99_MW'},
 'HC08': {'mw(g/mol)': 72.34, 'mw_source': 'SAPRC99_MW'},
 'HC09': {'mw(g/mol)': 75.78, 'mw_source': 'SAPRC99_MW'},
 'HC10': {'mw(g/mol)': 68.12, 'mw_source': 'SAPRC99_MW'},
 'HC11': {'mw(g/mol)': 136.24, 'mw_source': 'SAPRC99_MW'},
 'HC12': {'mw(g/mol)': 95.16, 'mw_source': 'SAPRC99_MW'},
 'HC13': {'mw(g/mol)': 118.72, 'mw_source': 'SAPRC99_MW'},
 'HC14': {'mw(g/mol)': 30.03, 'mw_source': 'SAPRC99_MW'},
 'HC15': {'mw(g/mol)': 44.05, 'mw_source': 'SAPRC99_MW'},
 'HC16': {'mw(g/mol)'

# Other random stuff

In [None]:
import subprocess
def create_symlink_of_tree(source_fullpath,dest_path):
    '''Creates symlinks for an entire directory tree

    Runs "cp -as <source_fullpath>/ <dest_path>" and waits for it to finish, so the
    tree is complete when this returns and a failed copy is reported instead of ignored.
    
    Args:
    source_fullpath (str) : full path of the source of symlinked data. A trailing / is added if missing
    dest_path (str) : path to where the symlinked copy will go

    Raises:
    subprocess.CalledProcessError : if cp exits with a non-zero status
    '''
    if not source_fullpath.endswith('/'):
        source_fullpath = source_fullpath+'/'
    command = ["cp","-as", source_fullpath,dest_path]
    print(command)
    # run() waits for and reaps the child; the previous bare Popen never did
    subprocess.run(command,check=True)

def symlink_all_pointsectors(base_path,dest_path):
    '''Symlinks every point sector directory tree from one location into another

    Args:
    base_path (str) : directory whose entries starting with 'point' are linked
    dest_path (str) : directory that receives one symlinked tree per point sector
    '''
    point_sectors = (name for name in os.listdir(base_path) if name.startswith('point'))
    for name in point_sectors:
        create_symlink_of_tree(os.path.join(base_path,name),os.path.join(dest_path,name))

# Source tree holding the sector directories to link from
base_path = '/uufs/chpc.utah.edu/common/home/lin-group9/agm/NOAA_CSL_Data/base'
# Destination tree that receives the symlinked point sectors
dest_path = '/uufs/chpc.utah.edu/common/home/lin-group9/agm/NOAA_CSL_Data/regridded'
# Left disabled so re-running the notebook does not touch the filesystem
#symlink_all_pointsectors(base_path,dest_path)
