In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from datetime import datetime

In [2]:
# A class to store and parse filename 
class filenameClass:
    """Store a CTD data filename and split it into its named components.

    Typical use: construct with a filename, call ``fn_parser()`` once, then
    read the parsed attributes through ``listUpAttr()`` or the getters.
    The getters and ``listUpAttr()`` read attributes that only exist after
    ``fn_parser()`` has succeeded.

    Parameters
    ----------
    fn : str
        Filename (without directory) to parse.
    pattern : str, optional
        Regular expression used to parse ``fn``. It must define the named
        groups ``date``, ``time``, ``sonde_name``, ``sonde_number``,
        ``dummy`` and ``filetype``. Defaults to ``DEFAULT_PATTERN``.
    """

    # Default regex for CTD RINKO data: date (8 digits), time (4 digits),
    # sonde name, sonde number (4 digits), dummy (6 digits), file type.
    DEFAULT_PATTERN = r'^(?P<date>\d{8})(?P<time>\d{4})_(?P<sonde_name>[A-Z\d-]+)_(?P<sonde_number>\d{4})_(?P<dummy>\d{6})\.(?P<filetype>[a-zA-Z]+)$'

    def __init__(self, fn, pattern=None):
        self.fn = fn
        # Other CTD folders can pass their own pattern; the default is kept
        # when no pattern is given so existing callers are unaffected.
        self.pattern = self.DEFAULT_PATTERN if pattern is None else pattern

    def fn_parser(self):
        """Match ``self.fn`` against ``self.pattern`` and store the parts.

        Sets ``date_org``/``date``, ``time_org``/``time``, ``yyyy``, ``mm``,
        ``dd``, ``hh``, ``min``, ``ss``, ``dummy``, ``sonde_name``,
        ``sonde_number`` and ``file_type`` on the instance.

        Raises
        ------
        ValueError
            If the filename does not match the pattern.
        """
        match = re.match(self.pattern, self.fn)
        if match is None:
            # A bare string cannot be raised in Python 3; use a real exception.
            raise ValueError(f"Filename {self.fn} does not match the pattern!!")

        self.date_org = match.group('date')
        self.yyyy, self.mm, self.dd = self.date_org[0:4], self.date_org[4:6], self.date_org[6:]
        # save another date format (YYYY-MM-DD)
        self.date = f"{self.yyyy}-{self.mm}-{self.dd}"

        self.time_org = match.group('time')
        self.hh, self.min = self.time_org[0:2], self.time_org[2:]
        self.dummy = match.group('dummy')
        # the seconds are taken from the last two digits of the dummy field
        self.ss = self.dummy[-2:]
        # save another time format (HH:MM:SS)
        self.time = f"{self.hh}:{self.min}:{self.ss}"

        self.sonde_name = match.group('sonde_name')
        self.sonde_number = match.group('sonde_number')
        self.file_type = match.group('filetype')

    def listUpAttr(self, form="1"):
        """Return the parsed attributes as one row (object array of length 8).

        Row layout: sonde name, sonde number, date, time, raw flag, csv flag,
        followed by two ``None`` placeholders. The flag matching the file
        type is ``1`` and the other flag is ``None``.

        Parameters
        ----------
        form : str
            ``"1"`` stores the numeric date (``YYYYMMDD``); any other value
            stores the usual date format (``YYYY-MM-DD``).

        Raises
        ------
        ValueError
            If the file type is neither ``raw`` nor ``csv`` (case-insensitive).
        """
        # Compare the whole extension, ignoring case, rather than only its
        # last character, so '.CSV' is accepted and e.g. '.tsv' is rejected.
        file_type = self.file_type.lower()
        if file_type == 'raw':
            raw_flag, csv_flag = 1, None
        elif file_type == 'csv':
            raw_flag, csv_flag = None, 1
        else:
            raise ValueError("File type does not match 'raw' or 'csv'!!")

        date_value = self.date_org if form == "1" else self.date
        self.list = np.array([self.sonde_name, self.sonde_number, date_value,
                              self.time, raw_flag,
                              csv_flag, None, None], dtype=object)
        return self.list

    def getDate(self):
        """Return ``(numeric date, "or", formatted date)``."""
        return self.date_org, "or", self.date

    def getTime(self):
        """Return ``(numeric HHMM time, "or", formatted HH:MM:SS time)``."""
        return self.time_org, "or", self.time

    def getSondeName(self):
        """Return the sonde name parsed from the filename."""
        return self.sonde_name

    def getSondeNumber(self):
        """Return the sonde number parsed from the filename (as a string)."""
        return self.sonde_number

    def getFileType(self):
        """Return the file extension exactly as it appears in the filename."""
        return self.file_type



In [3]:
# Define functions: list-out filenames, print all attributes
def listOut_fn(path):
    """Return the names of all entries found directly inside ``path``.

    The names come straight from ``os.listdir``: no directory prefix and
    no filtering is applied.
    """
    return os.listdir(path)

def printAttr(file):
    """Print every parsed attribute of ``file``, one labelled line each.

    ``file`` must provide the getters ``getDate``, ``getTime``,
    ``getSondeName``, ``getSondeNumber`` and ``getFileType``.
    """
    labelled_getters = (
        ("Date:", "getDate"),
        ("Time:", "getTime"),
        ("Sonde Name:", "getSondeName"),
        ("Sonde ID:", "getSondeNumber"),
        ("Filetype:", "getFileType"),
    )
    # Look up and call each getter right before printing its line, so the
    # output order is exactly the order of the table above.
    for label, getter_name in labelled_getters:
        print(label, getattr(file, getter_name)())

def readFilename(fn, date_form="1"):
    """Parse one filename and return ``(parsed object, attribute row)``.

    ``date_form`` is forwarded to ``listUpAttr`` as its ``form`` argument.
    """
    parsed = filenameClass(fn)
    parsed.fn_parser()
    return parsed, parsed.listUpAttr(form=date_form)

def loopAndStack(fn_list, date_form="1"):
    """Parse every filename and stack the attribute rows into one 2-D array.

    Parameters
    ----------
    fn_list : sequence of str
        Filenames to parse; must contain at least one entry.
    date_form : str
        Forwarded to ``readFilename`` for each filename.

    Returns
    -------
    numpy.ndarray
        One row per filename, in the order of ``fn_list``. The result is
        always 2-D, even for a single filename, so it can be passed
        directly to ``pd.DataFrame(..., columns=...)``.

    Raises
    ------
    IndexError
        If ``fn_list`` is empty.
    """
    if len(fn_list) == 0:
        # Same exception type as indexing an empty list, but with a message
        # that says what is actually wrong.
        raise IndexError("fn_list is empty: there are no filenames to parse")

    # Collect all rows first and stack once; stacking inside the loop
    # re-copies the growing array on every iteration.
    rows = [readFilename(fn=fn, date_form=date_form)[1] for fn in fn_list]
    return np.vstack(rows)

def mergeStackedList(list_all, col_names):
    """Merge the stacked rows so each measurement appears once.

    Rows that share the same SondeName, SondeNumber, StartDate and
    StartTime are combined by summing their remaining columns, so the
    result shows whether a measurement has a raw file, a csv file or both.

    Parameters
    ----------
    list_all : array-like
        2-D array with one row per parsed filename.
    col_names : list of str
        Column names for ``list_all``; must include 'SondeName',
        'SondeNumber', 'StartDate' and 'StartTime'.

    Returns
    -------
    pandas.DataFrame
        Indexed by the four key columns, holding the summed remaining columns.
    """
    key_cols = ['SondeName', 'SondeNumber', 'StartDate', 'StartTime']
    df = pd.DataFrame(list_all, columns=col_names)
    # Use the string alias rather than np.sum: passing the NumPy callable to
    # agg is deprecated in recent pandas, and the alias keeps the group-wise
    # sum that skips the None placeholders.
    merged_df = df.groupby(key_cols).agg("sum")
    return merged_df


In [4]:
############## Configure data path of data below ################
# Folder that holds the RINKO CTD files
dt_path = "../data/RINKO_Data/"
# Year to process: a single four-digit year as a string (e.g. '2014'),
# or "all" to process every file
year = "all"
#########################################################

In [5]:
# Read every filename in the data folder, then keep the requested subset
RINKO_fn = listOut_fn(dt_path)
if year == "all":
    # Keep everything except names that start with "$"
    # TODO what happened with $2019?
    filename_list = [name for name in RINKO_fn if name[0] != "$"]
else:
    # Keep only names whose first four characters equal the chosen year
    filename_list = [name for name in RINKO_fn if name[0:4] == year]
# Quick check: first ten names and the total count
print(filename_list[:10])
print('Length of list: ', len(filename_list))

['201403031502_ASTD102-ALC-R02_0254_150205.Csv', '201403031502_ASTD102-ALC-R02_0254_150205.raw', '201403170853_ASTD102-ALC-R02_0254_085318.Csv', '201403170853_ASTD102-ALC-R02_0254_085318.raw', '201403180814_ASTD102-ALC-R02_0254_081455.Csv', '201403180814_ASTD102-ALC-R02_0254_081455.raw', '201403250855_ASTD102-ALC-R02_0254_085522.Csv', '201403250855_ASTD102-ALC-R02_0254_085522.raw', '201403280925_ASTD102-ALC-R02_0254_092557.Csv', '201403280925_ASTD102-ALC-R02_0254_092557.raw']
Length of list:  5215


In [6]:
##############testing pattern: uncomment below ##################
#filename = "201403170853_ASTD102-ALC-R02_0254_085318.csv"
#filename = "201403250855_ASTD102-ALC-R02_0254_085522.csv"
#filename = RINKO_fn[99]
# TODO: '$' added to datetime, for some cases
#########################################
print("Checking a sample case to test the code:")
# Use names that do not shadow the builtin `list`, so the builtin stays
# usable in every later cell of the notebook.
sample_file, sample_attrs = readFilename(fn=filename_list[0], date_form="2")
# Check
printAttr(sample_file)
print("The whole list is like: ", sample_attrs)
print("Some elements to check: ", sample_attrs[1:4])

Checking a sample case to test the code:
Date: ('20140303', 'or', '2014-03-03')
Time: ('1502', 'or', '15:02:05')
Sonde Name: ASTD102-ALC-R02
Sonde ID: 0254
Filetype: Csv
The whole list is like:  ['ASTD102-ALC-R02' '0254' '2014-03-03' '15:02:05' None 1 None None]
Some elements to check:  ['0254' '2014-03-03' '15:02:05']


In [7]:
# Parse every filename and stack the attribute rows into a single array
results = loopAndStack(filename_list, date_form="2")
print(results)

[['ASTD102-ALC-R02' '0254' '2014-03-03' ... 1 None None]
 ['ASTD102-ALC-R02' '0254' '2014-03-03' ... None None None]
 ['ASTD102-ALC-R02' '0254' '2014-03-17' ... 1 None None]
 ...
 ['ASTD102-ALC-R02' '0611' '2023-08-23' ... None None None]
 ['ASTD102-ALC-R02' '0611' '2023-08-24' ... 1 None None]
 ['ASTD102-ALC-R02' '0611' '2023-08-24' ... None None None]]


In [8]:
# Merge stacked list and change to pd.DataFrame
print("Note that the DF below is sorted by SondeNumber!")
# Lat lon are unknown, so both columns are filled with 'Unknown'.
# assign() sets the columns by name (creating them if the merge did not keep
# them), whereas attribute assignment would silently create a plain attribute
# when the column is missing.
results_df = mergeStackedList(
    results,
    col_names=['SondeName','SondeNumber','StartDate','StartTime','raw','csv','Latitude','Longitude'],
).assign(Latitude='Unknown', Longitude='Unknown')
print(results_df)

Note that the DF below is sorted by SondeNumber!
                                                 raw csv Latitude Longitude
SondeName       SondeNumber StartDate  StartTime                           
ASTD102-ALC-R02 0143        2016-03-30 09:28:09    1   1  Unknown   Unknown
                                       09:31:12    1   1  Unknown   Unknown
                                       09:36:36    1   1  Unknown   Unknown
                            2016-06-21 08:47:12    1   1  Unknown   Unknown
                            2016-11-21 09:00:52    1   1  Unknown   Unknown
...                                               ..  ..      ...       ...
                0611        2023-08-23 08:03:25    1   1  Unknown   Unknown
                                       08:34:39    1   1  Unknown   Unknown
                                       08:44:54    1   1  Unknown   Unknown
                                       08:56:21    1   1  Unknown   Unknown
                            2023-08-24 

In [9]:
print("Numer of files: ", len(filename_list))
print("Length of DF: ", len(results_df))
# Print every measurement that is missing its raw file or its csv file
print(results_df.loc[results_df['raw'].eq(0) | results_df['csv'].eq(0)])

Numer of files:  5215
Length of DF:  2608
                                                 raw csv Latitude Longitude
SondeName       SondeNumber StartDate  StartTime                           
ASTD102-ALC-R02 0254        2018-07-19 09:45:19    1   0  Unknown   Unknown


In [10]:
# Write the merged table to a CSV file named after the configured year
results_df.to_csv(path_or_buf=f'CTD_{year}.csv')

End of code.