In [4]:
import pandas as pd
import numpy as np
import os
import re
import string
import datetime
import pytz
import sys 
sys.path.append('..')
import funcs.ac_funcs as ac

# OOF

In [5]:
class oof_manager:
    '''Class to manage getting data from oof files

    An instance is tied to one folder of oof files and one timezone. Datetime strings passed to the
    instance are interpreted in that timezone, and dataframes formatted by the instance have their
    index converted to it.
    '''

    def __init__(self,oof_data_folder,timezone):
        '''
        Args: 
        oof_data_folder (str) : path to the folder where oof data is stored
        timezone (str) : timezone for the measurements
        '''
        self.oof_data_folder = oof_data_folder
        self.timezone = timezone

    def load_oof_df_inrange(self,dt1,dt2,filter_flag_0=False,print_out=False,cols_to_load=None):
        '''Loads a dataframe from the oof files in the data folder for datetimes between the input values
        
        Args:
        dt1 (str or datetime.datetime) : start of the desired range. Strings must be of form "YYYY-mm-dd HH:MM:SS" and are interpreted in the class timezone
        dt2 (str or datetime.datetime) : end of the desired range, same rules as dt1
        filter_flag_0 (bool) : True will filter the dataframe to rows where the flag column is 0 (good data), false returns all the data
        print_out (bool) : Will print a message telling the user that they are loading a certain oof file. Default False. 
        cols_to_load (list) : List of strings that are the names of the oof data columns to load. Default is None, which loads all of the columns. 

        Returns:
        df (pd.DataFrame) : pandas dataframe loaded from the oof files, formatted date, and column names. Empty if no oof file matches the range.
        '''
        #convert each bound on its own so a string and a datetime can be mixed
        if isinstance(dt1,str):
            dt1 = self.tzdt_from_str(dt1)
        if isinstance(dt2,str):
            dt2 = self.tzdt_from_str(dt2)

        frames = [] #collect the per-file frames and concatenate once at the end
        for oof_filename in self.get_oof_in_range(dt1,dt2):
            if print_out:
                print(f'Loading {oof_filename}')
            df = self.df_from_oof(oof_filename,fullformat = True, filter_flag_0 = filter_flag_0, cols_to_load=cols_to_load) #load the oof file to a dataframe
            frames.append(df.loc[(df.index>=dt1)&(df.index<=dt2)]) #keep only the rows between the input datetimes (inclusive)

        if not frames: #no files in range
            return pd.DataFrame()
        return pd.concat(frames)

    def df_from_oof(self,filename,fullformat = False,filter_flag_0 = False,cols_to_load=None):
        '''Load a dataframe from an oof file
        
        Args:
        filename (str) : name of the oof file (not the full path)
        fullformat (bool) : if True, run df_dt_formatter() so the index is a timezone aware datetime
        filter_flag_0 (bool) : if you want to only get the 0 flags (good data), set to True
        cols_to_load (list) : list of strings of the oof columns you want to load. Default None which loads all of the columns
        
        Returns:
        df (pd.DataFrame) : a pandas dataframe loaded from the em27 oof file with inst_zasl, inst_lat and inst_lon columns added
        '''

        oof_full_filepath = os.path.join(self.oof_data_folder,filename) #get the full filepath using the class' folder path
        header = self.read_oof_header_line(oof_full_filepath)

        #sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
        read_kwargs = {'header':header,'sep':r'\s+','skip_blank_lines':False}
        if cols_to_load is not None:
            must_have_cols = ['flag','year','day','hour','lat(deg)','long(deg)','zobs(km)'] #we basically always need these columns
            usecols = list(cols_to_load) #copy so the caller's list (often "specs" or similar) is not altered
            usecols.extend(mhc for mhc in must_have_cols if mhc not in usecols) #add any must haves that were not requested
            read_kwargs['usecols'] = usecols
        df = pd.read_csv(oof_full_filepath,**read_kwargs) #read it as a csv, parse the header

        df['inst_zasl'] = df['zobs(km)']*1000 #add the instrument z elevation in meters above sea level (instead of km)
        df['inst_lat'] = df['lat(deg)'] #copy the inst lat column under a friendlier name
        df['inst_lon'] = df['long(deg)'] #copy the inst lon column under a friendlier name
        if fullformat:
            df = self.df_dt_formatter(df)
        if filter_flag_0:
            df = df.loc[df['flag']==0]
        return df

    def tzdt_from_str(self,dt_str):
        '''Apply the inherent timezone of the class to an input datetime string
        
        Args:
        dt_str (str) : datetime string of form "YYYY-mm-dd HH:MM:SS" 
        
        Returns:
        dt (datetime.datetime) : timezone aware datetime object, with timezone determined by the class
        '''

        dt = datetime.datetime.strptime(dt_str,'%Y-%m-%d %H:%M:%S') #create the datetime
        dt = pytz.timezone(self.timezone).localize(dt) #apply the timezone
        return dt

    def read_oof_header_line(self,full_file_path):
        '''Reads the first line of an oof file to find where the column header row is
        
        Args: 
        full_file_path (str) : full path to an oof file we want to read
        
        Returns:
        header (int) : zero-based row number of the column header line, i.e. the first number in the file minus one
        '''

        with open(full_file_path) as f: #open the file
            line1 = f.readline() #read the first line
        header = int(line1.split()[0])-1 #split the line; the first value is the 1-based header row
        return header

    def parse_oof_dt(self,year,doy,hr_dec):
        '''Get a real datetime from an oof style datetime definition
        
        Args:
        year (int) : year
        doy (int) : day of the year 
        hr_dec (float) : decimal hour of the day
        
        Returns:
        dt (pd.Timestamp) : timezone naive pandas timestamp gleaned from the inputs
        '''

        dt = pd.to_datetime(f'{int(year)} {int(doy)}',format='%Y %j') + datetime.timedelta(seconds = hr_dec*3600)
        return dt

    def df_dt_formatter(self,df):
        '''Format a loaded oof dataframe to have the correct datetime as an index

        Assumes that the oof timestamps are in UTC
        
        Args: 
        df (pd.DataFrame) : dataframe loaded using df_from_oof() 

        Returns:
        df (pd.DataFrame) : reformatted dataframe with the datetime as the index (named "dt"), sorted, and converted to the class timezone. 
        '''

        #build the index row by row from the year, day and hour columns. A DatetimeIndex (rather than np.vectorize)
        #keeps this working for a file that has a header but no data rows
        dt_index = pd.DatetimeIndex(
            [self.parse_oof_dt(year,doy,hr_dec) for year,doy,hr_dec in zip(df['year'],df['day'],df['hour'])],
            name = 'dt')
        df = df.set_index(dt_index).sort_index() #set dt as the index
        df.index = df.index.tz_localize('UTC').tz_convert(self.timezone) #localize and convert the timezone
        return df

    def get_sorted_oof(self):
        '''Get a list of oof files in the oof data folder, sorted by filename
        
        Returns:
        files (list) : sorted list of filenames ending in oof in the data folder
        '''

        return [file for file in sorted(os.listdir(self.oof_data_folder)) if file.endswith('oof')]

    def get_oof_in_range(self,dt1,dt2):
        '''Finds the oof files in the data folder that fall between two input datetimes

        The search starts one day before dt1 because data for a local day can sit in the previous (UTC) day's file.
        
        Args:
        dt1 (str or datetime.datetime) : start datetime of the interval we want to find files within
        dt2 (str or datetime.datetime) : end datetime of the interval we want to find files within
        
        Returns:
        files in range (list) : list of oof filenames whose name contains a YYYYmmdd date within the range
        '''
        #accept the string form promised by the docstring
        if isinstance(dt1,str):
            dt1 = self.tzdt_from_str(dt1)
        if isinstance(dt2,str):
            dt2 = self.tzdt_from_str(dt2)

        dt1 = dt1 - datetime.timedelta(days=1) #sometimes with UTC there are values in the previous day's oof file, so start one behind to check
        delta_days = dt2.date()-dt1.date() #get the number of days delta between the end and the start
        daystrings_in_range = [
            (dt1.date() + datetime.timedelta(days=i)).strftime('%Y%m%d') #string of the date (YYYYmmdd) to match with filenames
            for i in range(delta_days.days + 1)]

        #any() lists each file at most once, even if its name happens to contain more than one daystring
        return [file for file in self.get_sorted_oof()
                if any(daystring in file for daystring in daystrings_in_range)]

    def date_from_oof(self,oof_filename):
        '''Strips the date from an oof filename
        
        Args: 
        oof_filename (str) : oof filename that starts with a two letter identifier followed by YYYYmmdd, then a "."

        Returns:
        date (datetime.date) : date gained from the oof filename

        Raises:
        Exception : if the date cannot be parsed from the filename
        '''

        try:
            datestring = oof_filename.split('.')[0][2:] #split the oof_filename on . and remove the two letter identifier 
            return datetime.datetime.strptime(datestring,"%Y%m%d").date() #convert to a date
        except Exception as err: #not a bare except, so KeyboardInterrupt/SystemExit still propagate
            raise Exception(f'Error in getting datestring from {oof_filename}') from err

    def get_inrange_dates(self,dt1,dt2):
        '''Gets the dates of the oof files that fall within an input datetime range
        
        Args:
        dt1 (datetime.datetime) : start datetime
        dt2 (datetime.datetime) : end datetime
        
        Returns:
        dates_in_range (list) : list of dates (one per oof file found by get_oof_in_range) within the datetime range
        '''

        return [self.date_from_oof(oof_filename) for oof_filename in self.get_oof_in_range(dt1,dt2)]

    def check_get_loc(self,oof_df):
        '''Checks and gets the location of the instrument from the oof file
        TODO: This will break if the location moves during data collection or between days. This could become an issue if data was collected
        during one day and went past midnight UTC, then moved to a different location the next day. The oof_df in this case for the second day
        would include some data from the first data collection session in the early UTC hours, before moving. 

        Args: 
        oof_df (pd.DataFrame) : dataframe of oof values with inst_lat, inst_lon and inst_zasl columns
        
        Returns: 
        inst_lat (float) : instrument latitude
        inst_lon (float) : instrument longitude
        inst_zasl (float) : instrument elevation above sea level in meters        

        Raises:
        ValueError : if oof_df has no rows
        Exception : if any of the location columns is not constant over the dataframe
        '''

        if len(oof_df) == 0:
            raise ValueError('oof_df is empty, so there is no instrument location to get.')

        cols_to_check = ['inst_lat','inst_lon','inst_zasl']
        for col in cols_to_check:
            values = oof_df[col].to_numpy()
            if not (values == values[0]).all(): #every row must match the first one
                raise Exception(f'{col} is not the same for the entire oof_df. This is an edge case.')
        #If we make it through the above, we can pull the values from the dataframe at the 0th index because they are all the same
        inst_lat = oof_df['inst_lat'].iloc[0]
        inst_lon = oof_df['inst_lon'].iloc[0]
        inst_zasl = oof_df['inst_zasl'].iloc[0]
        return inst_lat,inst_lon,inst_zasl

In [18]:
#Side by side data for ua and ha, loaded so that the two instruments can be corrected to one another
data_folder = '/uufs/chpc.utah.edu/common/home/u0890904/LAIR_1/Data/EM27_oof/SLC_EM27_ha_2022_2023_oof_v2_nasrin_correct'
timezone = 'UTC'  #timezone within which to load the dataframes
filter_flag_0 = True #True asks the loader to drop bad spectra (rows whose flag is not 0)
specs = ['xch4(ppm)','xco2(ppm)','xco(ppb)'] #the species we want to correct

#range during which the ua EM27 and ha EM27 were side by side on the roof of wbb
dt1_str = '2023-07-08 11:00:00'
dt2_str = '2023-07-09 21:59:59'

#parse both range strings and mark them as US/Mountain local times
dt1, dt2 = (
    pytz.timezone('US/Mountain').localize(datetime.datetime.strptime(range_str,'%Y-%m-%d %H:%M:%S'))
    for range_str in (dt1_str,dt2_str)
)

my_oof_manager = ac.oof_manager(data_folder,timezone) #oof manager for that instrument's folder
#my_oof_manager.get_oof_in_range(dt1,dt2)
df = my_oof_manager.load_oof_df_inrange(
    dt1,
    dt2,
    filter_flag_0=filter_flag_0,
    cols_to_load=specs,
) #load the rows that fall inside the range
df.index.tz_convert('US/Mountain')

DatetimeIndex(['2023-07-08 11:03:10.800000-06:00',
                      '2023-07-08 11:03:18-06:00',
               '2023-07-08 11:03:28.800000-06:00',
               '2023-07-08 11:03:32.400000-06:00',
               '2023-07-08 11:03:43.200000-06:00',
               '2023-07-08 11:03:46.800000-06:00',
               '2023-07-08 11:03:57.600000-06:00',
               '2023-07-08 11:04:01.200000-06:00',
                      '2023-07-08 11:04:12-06:00',
               '2023-07-08 11:04:19.200000-06:00',
               ...
               '2023-07-09 16:18:28.800000-06:00',
                      '2023-07-09 16:18:36-06:00',
               '2023-07-09 16:18:43.200000-06:00',
               '2023-07-09 16:18:50.400000-06:00',
               '2023-07-09 16:19:01.200000-06:00',
               '2023-07-09 16:19:08.400000-06:00',
               '2023-07-09 16:19:15.600000-06:00',
               '2023-07-09 16:19:22.800000-06:00',
               '2023-07-09 16:19:33.600000-06:00',
            

# Met data from GGG format

In [None]:
class met_loader_ggg:
    '''Loads daily met csv files from a single folder

    NOTE(review): the notebook section is titled "Met data from GGG format", so the files are
    presumably GGG-style met files -- confirm against the data.
    '''

    def __init__(self,daily_met_path):
        '''
        Args:
        daily_met_path (str) : path to the folder holding the daily met files
        '''
        self.daily_met_path = daily_met_path

    def load_single_file(self,fname):
        '''Load one daily met file into a dataframe

        Args:
        fname (str) : name of the file inside daily_met_path (not the full path)

        Returns:
        df (pd.DataFrame) : the file read with pd.read_csv using default settings
        '''
        fullpath = os.path.join(self.daily_met_path,fname)
        df = pd.read_csv(fullpath)
        return df
    
    def create_dt_index(self):
        '''Placeholder for building a datetime index; not implemented yet

        The original definition was an unfinished stub (no body), which made the whole cell a
        SyntaxError. It is kept as an explicit stub so the class can be defined and used.

        Raises:
        NotImplementedError : always
        '''
        raise NotImplementedError('create_dt_index has not been implemented yet')
    

#Folder of daily met csv files to load from
daily_met_path = '/uufs/chpc.utah.edu/common/home/u0890904/WBB_met/daily_csvs'
mlg = met_loader_ggg(daily_met_path)

#met_loader_ggg defines load_single_file (there is no read_single_file method)
df = mlg.load_single_file('20231118_HA.WBB.txt')
df.dtypes

# WBB

In [None]:

def read_all_wbb_to_df(wbb_data_path,wbb_resample_interval):
    '''Read every file in a folder of WBB met csv files into one dataframe

    Each file is read with row 6 as the header and row 7 skipped (presumably a units row -- TODO confirm).
    The index is built from the "Date_Time" column and converted to the module-level `timezone`
    variable, which must be defined before this is called (it is not a parameter).

    Args:
    wbb_data_path (str) : path to the folder of WBB files; every entry in the folder is read
    wbb_resample_interval (str or None) : pandas resample rule (e.g. "5min"). None keeps the native rows.

    Returns:
    wbb_met_df (pd.DataFrame) : all files combined and sorted by time, with u/v wind components from
        ac.wdws_to_uv, "ws"/"wd" recomputed from u/v with ac.uv_to_wdws, "pressure_set_1"/"air_temp_set_1"
        renamed to "pressure"/"temp", and pressure divided by 100 (presumably Pa to hPa -- TODO confirm).
        An empty dataframe is returned if the folder has no files.
    '''
    frames = [] #collect per-file frames and concatenate once at the end
    for file in os.listdir(wbb_data_path):
        fullpath = os.path.join(wbb_data_path,file)
        df = pd.read_csv(fullpath,header = 6,skiprows=[7])
        df.index = pd.to_datetime(df['Date_Time']).dt.tz_convert(timezone)
        df[['u','v']] = df.apply(lambda row: ac.wdws_to_uv(row['wind_speed_set_1'],row['wind_direction_set_1']),axis = 1,result_type='expand')

        if wbb_resample_interval is None:
            df_resampled = df.copy()
        else:
            #average the numeric columns (including u and v) per interval and drop intervals with no data at all
            df_resampled = df.resample(wbb_resample_interval).mean(numeric_only=True).dropna(how='all')
        #wind speed/direction come from the (possibly averaged) u and v components
        df_resampled['ws'],df_resampled['wd'] = np.vectorize(ac.uv_to_wdws)(df_resampled['u'],df_resampled['v'])
        frames.append(df_resampled)

    if not frames: #empty folder: nothing to rename or rescale
        return pd.DataFrame()

    wbb_met_df = pd.concat(frames).sort_index()
    wbb_met_df = wbb_met_df.rename(columns={'pressure_set_1':'pressure','air_temp_set_1':'temp'})
    wbb_met_df['pressure'] = wbb_met_df['pressure']/100
    return wbb_met_df

# Trisonica

In [None]:
def read_all_trisonica_to_df(trisonica_data_path,trisonica_resample_interval):
    '''Read every file in a folder of Trisonica logs into one dataframe

    Each line is cleaned into whitespace separated values: "=" and all ASCII letters except lowercase
    "n" and "a" are removed (so a literal "nan" survives), and commas are treated as whitespace. Lines
    that do not end up with exactly one value per expected column are skipped. The "ET" column is read
    as seconds since the epoch (UTC) and becomes the index, converted to the module-level `timezone`
    variable, which must be defined before this is called (it is not a parameter).

    Args:
    trisonica_data_path (str) : path to the folder of Trisonica files; every entry in the folder is read
    trisonica_resample_interval (str or None) : pandas resample rule (e.g. "5min"). None keeps the native rows.

    Returns:
    trisonica_df (pd.DataFrame) : all files combined and sorted by time, without the "S" and "D" columns and
        without columns that are entirely non-numeric in a file, plus "ws"/"wd" computed from U and V with
        ac.uv_to_wdws. Files with no usable lines are skipped; an empty dataframe is returned if nothing was read.
    '''
    headers_list = ['ET','Date','Time','S','D','U','V','W','T','H','DP','P','AD','PI','RO','MD','TD']

    #one translation table, built once: delete '=' and the unwanted letters, turn ',' into ' '
    letters_to_drop = ''.join(let for let in string.ascii_letters if let not in 'na')
    cleanup_table = str.maketrans(',',' ','=' + letters_to_drop)

    frames = [] #collect per-file frames and concatenate once at the end
    for file in os.listdir(trisonica_data_path):
        fullpath = os.path.join(trisonica_data_path,file)
        rows_list = []
        with open(fullpath,'r',errors='ignore') as f: #undecodable bytes are dropped rather than raising
            for line in f:
                newline = line.strip()
                if len(newline) < 5:
                    continue
                newline = newline.translate(cleanup_table)
                #startswith is safe when the cleanup left an empty string (indexing [0] raised IndexError)
                if newline.startswith(' '):
                    newline = newline[1:]
                line_to_append = re.sub(r"\s+",",",newline).split(',')
                if len(line_to_append) == len(headers_list):
                    rows_list.append(line_to_append)

        if not rows_list: #no usable lines in this file
            continue

        df = pd.DataFrame(rows_list,columns=headers_list)
        df = df.apply(pd.to_numeric,errors='coerce') #anything that is not a number becomes NaN
        df = df.dropna(axis = 1,how = 'all') #drop the columns that held no numbers at all
        df['DT'] = pd.to_datetime(df['ET'],unit='s')
        df = df.set_index('DT')
        df.index = df.index.tz_localize('UTC').tz_convert(timezone)
        df = df.drop(['S','D'],axis = 1,errors='ignore') #may already be gone if they were all NaN
        if trisonica_resample_interval is not None:
            df = df.resample(trisonica_resample_interval).mean(numeric_only=True)
        frames.append(df)

    if not frames: #nothing usable in the folder
        return pd.DataFrame()

    trisonica_df = pd.concat(frames).sort_index()
    trisonica_df['ws'],trisonica_df['wd'] = np.vectorize(ac.uv_to_wdws)(trisonica_df['U'],trisonica_df['V'])
    return trisonica_df