In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re

In [2]:
# A class to store and parse filename 
class filenameClass:
    """Store a CTD RINKO data filename and split it into its components.

    The filename must have the layout enforced by ``_PATTERN``::

        <date: 8 digits><time: 4 digits>_<sonde name>_<sonde number: 4 digits>_<6 digits>.<extension>

    for example ``201403031502_ASTD102-ALC-R02_0254_150205.Csv``.

    ``fn_parser()`` must be called before ``listUpAttr()`` or any getter:
    the parsed attributes are only created by a successful parse.
    """

    # Regex for CTD RINKO data: date, time, sondeName, sondeNumber, dummy, filetype
    _PATTERN = r'^(?P<date>\d{8})(?P<time>\d{4})_(?P<sonde_name>[A-Z\d-]+)_(?P<sonde_number>\d{4})_(?P<dummy>\d{6})\.(?P<filetype>[a-zA-Z]+)$'

    def __init__(self, fn):
        """Remember the filename ``fn``; no parsing happens here."""
        self.fn = fn

    def fn_parser(self):
        """Match the stored filename against ``_PATTERN`` and store the parts.

        Sets ``date``, ``time``, ``sonde_name``, ``sonde_number``, ``dummy``
        and ``file_type`` (all strings, exactly as they appear in the name).

        Raises:
            ValueError: if the filename does not match the pattern.
        """
        match = re.match(self._PATTERN, self.fn)
        if match is None:
            # A bare string cannot be raised in Python 3; use a real exception
            # so the message actually reaches the user.
            raise ValueError(f"Filename {self.fn} does not match the pattern!!")
        self.date = match.group('date')
        self.time = match.group('time')
        self.sonde_name = match.group('sonde_name')
        self.sonde_number = match.group('sonde_number')
        self.dummy = match.group('dummy')
        self.file_type = match.group('filetype')

    def listUpAttr(self):
        """Build the attribute row for this file and keep it in ``self.list``.

        Row layout:
        ``[sonde_name, sonde_number, date, time, raw_flag, csv_flag, None, None]``
        where exactly one of ``raw_flag`` / ``csv_flag`` is ``1`` and the
        other is ``None``; the last two slots are always ``None``.

        Returns:
            numpy.ndarray: 1-D object array with the eight values above.

        Raises:
            ValueError: if the extension is neither 'raw' nor 'csv'
                (compared case-insensitively, so 'Csv' is accepted).
        """
        file_type = self.file_type.lower()
        if file_type == 'raw':
            raw_flag, csv_flag = 1, None
        elif file_type == 'csv':
            raw_flag, csv_flag = None, 1
        else:
            raise ValueError("File type does not match 'raw' or 'csv'!!")
        # dtype=object keeps the strings, the integer flag and None as-is.
        self.list = np.array(
            [self.sonde_name, self.sonde_number, self.date, self.time,
             raw_flag, csv_flag, None, None],
            dtype=object,
        )
        return self.list

    def getDate(self):
        """Return the 8-digit date string captured from the filename."""
        return self.date

    def getTime(self):
        """Return the 4-digit time string captured from the filename."""
        return self.time

    def getSondeName(self):
        """Return the sonde name captured from the filename."""
        return self.sonde_name

    def getSondeNumber(self):
        """Return the 4-digit sonde number string captured from the filename."""
        return self.sonde_number

    def getFileType(self):
        """Return the file extension exactly as written in the filename."""
        return self.file_type



In [3]:
# Define functions: list-out filenames, print all attributes
def listOut_fn(path):
    """Return the names of all entries in directory ``path``, sorted.

    ``os.listdir`` returns entries in arbitrary order, so the result is
    sorted to make downstream indexing (e.g. "the first file") and printed
    previews reproducible between runs and machines.

    Args:
        path: directory to list.

    Returns:
        list[str]: entry names (not full paths) in ascending order. Every
        entry that ``os.listdir`` reports is included; nothing is filtered.
    """
    return sorted(os.listdir(path))

def printAttr(file):
    """Print the parsed filename attributes of ``file``, one per line.

    ``file`` must provide ``getDate``, ``getTime``, ``getSondeName``,
    ``getSondeNumber`` and ``getFileType``. Returns ``None``.
    """
    label_and_getter = (
        ("Date:", "getDate"),
        ("Time:", "getTime"),
        ("Sonde Name:", "getSondeName"),
        ("Sonde ID:", "getSondeNumber"),
        ("Filetype:", "getFileType"),
    )
    # Look up and call each getter right before printing, in display order.
    for label, getter_name in label_and_getter:
        print(label, getattr(file, getter_name)())

def readFilename(fn):
    """Parse filename ``fn`` and return ``(parsed_object, attribute_row)``.

    Creates a ``filenameClass`` for ``fn``, runs ``fn_parser()`` on it and
    then ``listUpAttr()``. Any exception raised by those calls propagates.
    """
    parsed = filenameClass(fn)
    parsed.fn_parser()
    return parsed, parsed.listUpAttr()

def loopAndStack(fn_list):
    """Parse every filename in ``fn_list`` and stack the attribute rows.

    Args:
        fn_list: iterable of filenames accepted by ``readFilename``.

    Returns:
        numpy.ndarray: 2-D array with one row per filename, in input order.
        The result is 2-D even for a single filename, so it can always be
        passed to ``pd.DataFrame(..., columns=...)``.

    Raises:
        ValueError: if ``fn_list`` is empty.
    """
    # Collect all rows first and stack once; stacking inside the loop
    # re-copies the accumulated array on every iteration.
    rows = [readFilename(fn)[1] for fn in fn_list]
    if not rows:
        raise ValueError("fn_list is empty: there are no filenames to parse")
    return np.vstack(rows)

def mergeStackedList(list_all, col_names):
    """Turn the stacked attribute rows into one summed row per group.

    Args:
        list_all: 2-D array-like of attribute rows.
        col_names: column labels for ``list_all``; must include 'SondeName',
            'SondeNumber', 'StartDate' and 'StartTime', which are used as
            the grouping keys.

    Returns:
        pandas.DataFrame: indexed by the four grouping keys, with the
        remaining columns summed within each group (so the 'raw' and 'csv'
        flags of the same sonde/date/time end up on a single row).
    """
    df = pd.DataFrame(list_all, columns=col_names)
    # Use the built-in groupby sum rather than agg(np.sum): passing the NumPy
    # callable relies on pandas' deprecated callable-to-method aliasing.
    merged_df = df.groupby(['SondeName', 'SondeNumber', 'StartDate', 'StartTime']).sum()
    return merged_df


In [4]:
############## Configuration: edit the values below ################
# Directory whose entries are listed and parsed as RINKO filenames.
dt_path = "../data/RINKO_Data/"
# Only filenames whose first four characters equal this string are processed
# (restricted to 2014 for testing).
year = '2014'
#########################################################

In [5]:
# List every entry under dt_path, then keep the names whose first four
# characters equal `year`
RINKO_fn = listOut_fn(dt_path)
filename_list = [name for name in RINKO_fn if name[:4] == year]
# Sanity check: preview the first ten names and the total count
print(filename_list[:10])
print('Length of list: ', len(filename_list))

['201403031502_ASTD102-ALC-R02_0254_150205.Csv', '201403031502_ASTD102-ALC-R02_0254_150205.raw', '201403170853_ASTD102-ALC-R02_0254_085318.Csv', '201403170853_ASTD102-ALC-R02_0254_085318.raw', '201403180814_ASTD102-ALC-R02_0254_081455.Csv', '201403180814_ASTD102-ALC-R02_0254_081455.raw', '201403250855_ASTD102-ALC-R02_0254_085522.Csv', '201403250855_ASTD102-ALC-R02_0254_085522.raw', '201403280925_ASTD102-ALC-R02_0254_092557.Csv', '201403280925_ASTD102-ALC-R02_0254_092557.raw']
Length of list:  208


In [6]:
############## Testing the pattern: uncomment below ##################
#filename = "201403170853_ASTD102-ALC-R02_0254_085318.csv"
#filename = "201403250855_ASTD102-ALC-R02_0254_085522.csv"
#filename = RINKO_fn[99]
# TODO: '$' added to datetime, for some cases
#########################################
# Parse the first selected filename. The attribute row is deliberately NOT
# named `list`: binding that name in a notebook cell replaces the builtin
# for every cell executed afterwards in the same kernel.
file, attr_row = readFilename(fn=filename_list[0])
# Check: parsed fields, the full attribute row, then sonde number/date/time
printAttr(file)
print(attr_row)
print(attr_row[1:4])

Date: 20140303
Time: 1502
Sonde Name: ASTD102-ALC-R02
Sonde ID: 0254
Filetype: Csv
['ASTD102-ALC-R02' '0254' '20140303' '1502' None 1 None None]
['0254' '20140303' '1502']


In [7]:
# Parse every selected filename and stack the attribute rows into one array
results = loopAndStack(filename_list)
print(results)

[['ASTD102-ALC-R02' '0254' '20140303' ... 1 None None]
 ['ASTD102-ALC-R02' '0254' '20140303' ... None None None]
 ['ASTD102-ALC-R02' '0254' '20140317' ... 1 None None]
 ...
 ['ASTD102-ALC-R02' '0254' '20141210' ... None None None]
 ['ASTD102-ALC-R02' '0254' '20141212' ... 1 None None]
 ['ASTD102-ALC-R02' '0254' '20141212' ... None None None]]


In [8]:
# Merge the stacked rows into a pd.DataFrame with one row per
# (SondeName, SondeNumber, StartDate, StartTime)
results_df = mergeStackedList(results,col_names=['SondeName','SondeNumber','StartDate','StartTime','raw','csv','Latitude','Longitude'])
# Lat/lon are unknown. Use item assignment: attribute assignment
# (results_df.Latitude = ...) only updates a column that already exists and
# otherwise just sets a plain Python attribute without creating a column.
results_df['Latitude'] = 'Unknown'
results_df['Longitude'] = 'Unknown'
print(results_df)

                                                raw csv Latitude Longitude
SondeName       SondeNumber StartDate StartTime                           
ASTD102-ALC-R02 0254        20140303  1502        1   1  Unknown   Unknown
                            20140317  0853        1   1  Unknown   Unknown
                            20140318  0814        1   1  Unknown   Unknown
                            20140325  0855        1   1  Unknown   Unknown
                            20140328  0925        1   1  Unknown   Unknown
...                                              ..  ..      ...       ...
                            20141204  0748        1   1  Unknown   Unknown
                            20141205  1115        1   1  Unknown   Unknown
                            20141209  0859        1   1  Unknown   Unknown
                            20141210  0926        1   1  Unknown   Unknown
                            20141212  0900        1   1  Unknown   Unknown

[104 rows x 4 columns]


In [9]:
# Save the merged table to CSV. The file name follows the configured `year`,
# so changing the year cannot overwrite or mislabel another year's output.
results_df.to_csv(f'CTD_{year}.csv')

End of code.