# Step 0: define class and functions

Sub 0: imports

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from datetime import datetime

Sub 1: define class 

In [2]:
###############################################
# A class to store and parse filename:        #
#    - input:                                 #
#       fn: filename                          #
#       pattern: filename pattern to match    #
#    - output:                                #
#       several methods to call               #
###############################################

class filenameClass:
    """Parse a CTD data filename with a named-group regex and expose its fields.

    Parameters
    ----------
    fn : str
        Filename (without directory) to parse.
    pattern : str
        Regular expression passed to ``re.match``. Recognised group names are
        ``date``, ``time``, ``dummy``, ``sonde_name``, ``sonde_number``,
        ``filetype``, ``sno`` and ``blkno``. A group that the pattern does not
        define (or that did not take part in the match) leaves the
        corresponding attribute set to None.
    """

    def __init__(self, fn, pattern=None):
        self.fn = fn
        # TODO change pattern for different CTD folders: make it a variable
        self.pattern = pattern

    def fn_parser(self):
        """Match ``self.fn`` against ``self.pattern`` and fill the attributes.

        Sets date_org/yyyy/mm/dd/date, time_org/hh/min, dummy/ss/time,
        sonde_name, sonde_number, file_type, sno and blkno on the instance.

        Raises
        ------
        ValueError
            If the filename does not match the pattern.
        """
        match = re.match(self.pattern, self.fn)
        if not match:
            # Raising a plain string is itself a TypeError in Python 3, so a
            # real exception type is needed for the message to be reported.
            raise ValueError(f"Filename {self.fn} does not match the pattern!!")
        groups = match.groupdict()

        # date: keep the raw yyyymmdd string and a yyyy-mm-dd variant
        self.date_org = groups.get('date')
        if self.date_org is not None:
            self.yyyy, self.mm, self.dd = self.date_org[0:4], self.date_org[4:6], self.date_org[6:]
            self.date = f"{self.yyyy}-{self.mm}-{self.dd}"
        else:
            self.yyyy, self.mm, self.dd = None, None, None
            self.date = None

        # time: first two characters are the hour, the rest the minutes
        # (for 6-digit times this is corrected in the 'sno' step below)
        self.time_org = groups.get('time')
        if self.time_org is not None:
            self.hh, self.min = self.time_org[0:2], self.time_org[2:]
        else:
            self.hh, self.min = None, None

        # dummy: its last two characters are used as the seconds
        self.dummy = groups.get('dummy')
        if self.dummy is not None:
            self.ss = self.dummy[-2:]
            self.time = f"{self.hh}:{self.min}:{self.ss}"
        else:
            self.ss = None
            self.time = None

        self.sonde_name = groups.get('sonde_name')
        self.sonde_number = groups.get('sonde_number')
        self.file_type = groups.get('filetype')

        # for old CTD: sno==serial number, blkno==block number.
        # Must run after the steps above because it overrides sonde_name and time.
        self.sno = groups.get('sno')
        if self.sno is not None:
            # TCDKU is sonde name for compactCTD data
            if self.sno == '103':
                self.sonde_name = 'TCDKU'
            # also, if sno, hh mm ss is in time_org
            self.hh, self.min, self.ss = self.time_org[:2], self.time_org[2:4], self.time_org[4:]
            self.time = f"{self.hh}:{self.min}:{self.ss}"

        self.blkno = groups.get('blkno')

    def listUpAttr(self, form="1"):
        """Return the parsed attributes as one row (object-dtype numpy array).

        Parameters
        ----------
        form : str
            "1" puts the numeric date (``date_org``) in the row, anything else
            the ``yyyy-mm-dd`` date.

        Returns
        -------
        numpy.ndarray
            ``[sonde_name, sonde_number, date, time, raw, csv, None, None]``
            when neither sno nor blkno was parsed, otherwise
            ``[sonde_name, sno, blkno, date, time, raw, csv, None, None]``.
            ``raw`` is 1 for a file type ending in 'w'/'W' and ``csv`` is 1 for
            one ending in 'v'/'V'; the other flag is None.

        Raises
        ------
        ValueError
            If the file type ends in neither 'w'/'W' nor 'v'/'V'.
        """
        last_char = self.file_type[-1:]
        if last_char in ('w', 'W'):
            # if file type is raw
            raw_flag, csv_flag = 1, None
        elif last_char in ('v', 'V'):
            # if file type is csv
            raw_flag, csv_flag = None, 1
        else:
            print(f"File type: {self.file_type}")
            raise ValueError("File type does not match 'raw' or 'csv'!!")

        date_value = self.date_org if form == "1" else self.date
        if self.sno is None and self.blkno is None:
            # if no sno record
            id_fields = [self.sonde_name, self.sonde_number]
        else:
            # else, add block number to record: sonde number == sno i guess?
            id_fields = [self.sonde_name, self.sno, self.blkno]
        self.list = np.array(id_fields + [date_value, self.time, raw_flag, csv_flag, None, None])
        return self.list

    def getDate(self):
        """Return ``(date_org, "or", date)``."""
        return self.date_org, "or", self.date

    def getTime(self):
        """Return ``(time_org, "or", time)``."""
        return self.time_org, "or", self.time

    def getSondeName(self):
        """Return the parsed sonde name."""
        return self.sonde_name

    def getSondeNumber(self):
        """Return the parsed sonde number."""
        return self.sonde_number

    def getFileType(self):
        """Return the parsed file extension."""
        return self.file_type

    def getSno(self):
        """Return the parsed serial number (SNo)."""
        return self.sno

    def getBlkno(self):
        """Return the parsed block number (BlkNo)."""
        return self.blkno

    def printPattern(self):
        """Print the regex pattern this instance was created with."""
        print(f"Input pattern: {self.pattern}")


Sub 2: define functions

In [3]:
# Define functions: list-out filenames, print all attributes
def listOut_fn(path):
    """Return the names of the regular files directly inside ``path``.

    Sub-directories are skipped and the search is not recursive. The names
    are returned without the directory part, in ``os.listdir`` order.
    """
    # A single directory listing is enough; folders are filtered out here.
    return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

def printAttr(file):
    """Print every parsed attribute of a filenameClass-like object, one per line."""
    # (label, getter name) pairs, printed in this order
    fields = (
        ("Date:", "getDate"),
        ("Time:", "getTime"),
        ("Sonde Name:", "getSondeName"),
        ("Serial number:", "getSno"),
        ("Sonde ID:", "getSondeNumber"),
        ("Block number:", "getBlkno"),
        ("Filetype:", "getFileType"),
    )
    for label, getter_name in fields:
        print(label, getattr(file, getter_name)())
    return

def readFilename(fn, date_form="1", pattern=None):
    """Parse one filename and return ``(parser object, attribute row)``.

    ``date_form`` is forwarded to ``listUpAttr`` and ``pattern`` to
    ``filenameClass``.
    """
    parsed = filenameClass(fn=fn, pattern=pattern)
    parsed.fn_parser()
    attr_row = parsed.listUpAttr(form=date_form)
    return parsed, attr_row

def loopAndStack(fn_list,pattern,date_form="1"):
    """Parse every filename and stack the attribute rows into one array.

    Parameters
    ----------
    fn_list : sequence of str
        Filenames to parse; must not be empty.
    pattern : str
        Regex pattern forwarded to ``readFilename``.
    date_form : str
        Date format selector forwarded to ``readFilename``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per filename. A single filename also yields a
        2-D array (one row), so the result can always be loaded into a
        DataFrame with one column per attribute.

    Raises
    ------
    ValueError
        If ``fn_list`` is empty.
    """
    if len(fn_list) == 0:
        raise ValueError("fn_list is empty: there is no filename to parse.")
    # Collect all rows first and stack once; stacking inside the loop copies
    # the growing array on every iteration.
    rows = [readFilename(fn=fn, date_form=date_form, pattern=pattern)[1] for fn in fn_list]
    return np.vstack(rows)

def mergeStackedList(list_all, col_names, groupby):
    """Merge rows that describe the same cast and sum their flag columns.

    Parameters
    ----------
    list_all : array-like
        2-D data with one row per file.
    col_names : list of str
        Column names for ``list_all``.
    groupby : list of str
        Columns identifying one cast; rows equal on all of them are merged.

    Returns
    -------
    pandas.DataFrame
        Indexed by the ``groupby`` columns, with every remaining column summed
        per group (so 'raw'/'csv' become per-cast file counts).
    """
    # merge the array to check if it has csv and raw
    df = pd.DataFrame(list_all, columns=col_names)
    # dropna=False: by default groupby silently discards rows whose key is
    # None/NaN (e.g. a file whose sonde name could not be determined), which
    # makes files vanish from the inventory.
    # .sum() is the explicit form of the deprecated agg(np.sum).
    merged_df = df.groupby(groupby, dropna=False).sum()
    return merged_df

def findIncompleteData(results):
    """Return the rows of ``results`` that lack a raw file or a csv file.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain the count columns 'raw' and 'csv'.

    Returns
    -------
    pandas.DataFrame
        Rows where 'raw' == 0 or 'csv' == 0. The frame is empty when every row
        has both, so ``len()`` on the result is always valid.
    """
    # Filter the argument itself rather than a notebook-level global.
    incomplete = results[(results['raw'] == 0) | (results['csv'] == 0)]
    if len(incomplete):
        print(incomplete)
        print(f"Length of incomplete data: {len(incomplete)}")
    else:
        print("Results have equal raw and csv.")
    return incomplete

def checkFileLength(len_fn_raw,len_dummy,len_results,len_single):
    """Check that the file count is explained by the merged table.

    The expected count is ``len_dummy + 2 * (len_results - len_single) + len_single``:
    every complete row stands for two files (csv and raw), every incomplete
    row for one. Prints a summary and returns True when the expected count
    equals ``len_fn_raw``, otherwise False.
    """
    len_both = len_results - len_single  # files that have both csv and raw
    total = len_dummy + (len_both) * 2 + len_single
    matched = bool(total == len_fn_raw)

    # Only the headline differs between the two outcomes.
    if matched:
        print(f"Length matches.")
    else:
        print("Length does not match!!")
    summary = (
        f"Fn raw: {len_fn_raw}",
        f"Total: {total}",
        f"Dummy: {len_dummy}",
        f"Files [csv && raw]: {len_both}",
        f"Files [csv // raw]: {len_single}",
    )
    for line in summary:
        print(line)
    return matched
    
def countAndRemove99999(fn_list):
    """Split off dummy files, i.e. names whose characters [-9:-4] are '99999'.

    For a name with a 4-character extension such as '.Csv' this is the five
    characters right before the extension.

    Returns
    -------
    tuple
        ``(dummy_fn, count, output)``: the dummy names, how many there are,
        and the remaining names in their original order.
    """
    dummy_fn, output = [], []
    # One pass: the same test decides both lists, so there is no need to
    # search the dummy list again for every name.
    for item in fn_list:
        if item[-9:-4] == '99999':
            dummy_fn.append(item)
        else:
            output.append(item)
    count = len(dummy_fn)
    print(f"Dummy files: {dummy_fn}")
    print(f"Length of dummy file: {count}")
    return dummy_fn, count, output

Sub 3: Configure constants

In [4]:
############## Configure regex patterns below #################
# RINKO filenames: <date: 8 digits><time: 4 digits>_<sonde_name>_<sonde_number: 4 digits>_<dummy: 6 digits>.<filetype>
# e.g. 201403170853_ASTD102-ALC-R02_0254_085318.csv
pattern_RINNKO = r'^(?P<date>\d{8})(?P<time>\d{4})_(?P<sonde_name>[A-Z\d-]+)_(?P<sonde_number>\d{4})_(?P<dummy>\d{6})\.(?P<filetype>[a-zA-Z]+)$'
# compactCTD filenames: <date: 8 digits>_<time: 6 digits>_SNo_<sno: 3 digits>_BlkNo_<blkno: 4 digits>.<filetype>
# e.g. 20120306_083509_SNo_103_BlkNo_0002.Csv
pattern_afterEQ = r'^(?P<date>\d{8})_(?P<time>\d{6})_SNo_(?P<sno>\d{3})_BlkNo_(?P<blkno>\d{4})\.(?P<filetype>[a-zA-Z]+)$'

# Step 1: compile RINNKO CTD data

In [5]:
############## Configure data path of data below #################
# Folder whose files are listed and parsed in the following cells
dt_path = "../data/RINKO_Data/"
# config year below: single year as a 4-character string (compared with the
# first 4 characters of each filename) or "all"
#year = '2014'
year = "all"
#########################################################

In [6]:
# List every file in the data folder
RINKO_fn = listOut_fn(dt_path)
print(f"Length of raw list: {len(RINKO_fn)}")
# "all" keeps every name that does not start with "$";
# otherwise keep only the names that start with the chosen year
if year == "all":
    filename_list = [name for name in RINKO_fn if name[0] != "$"]
else:
    filename_list = [name for name in RINKO_fn if name[0:4] == year]
# check fns && remove dummy files (e.g., 99999 at last 5 digits)
dummy_fn, dummy_count, filename_list = countAndRemove99999(fn_list=filename_list)
print('Length of new list: ', len(filename_list))

Length of raw list: 5217
Dummy files: []
Length of dummy file: 0
Length of new list:  5217


In [7]:
##############testing pattern: uncomment below ##################
#filename = "201403170853_ASTD102-ALC-R02_0254_085318.csv"
#filename = "201403250855_ASTD102-ALC-R02_0254_085522.csv"
#filename = RINKO_fn[99]
# TODO: '$' added to datetime, for some cases
#########################################
print("Checking a sample case to test the code:")
# The attribute row is bound to `attr_list`, not `list`: a module-level `list`
# would hide the builtin for every cell that runs afterwards.
file, attr_list = readFilename(fn=filename_list[1], date_form="2",pattern=pattern_RINNKO)
file.printPattern()
# Check
printAttr(file)
print("The whole list is like: ", attr_list)
print("Some elements to check: ",attr_list[1:4])

Checking a sample case to test the code:
Input pattern: ^(?P<date>\d{8})(?P<time>\d{4})_(?P<sonde_name>[A-Z\d-]+)_(?P<sonde_number>\d{4})_(?P<dummy>\d{6})\.(?P<filetype>[a-zA-Z]+)$
Date: ('20140303', 'or', '2014-03-03')
Time: ('1502', 'or', '15:02:05')
Sonde Name: ASTD102-ALC-R02
Serial number: None
Sonde ID: 0254
Block number: None
Filetype: raw
The whole list is like:  ['ASTD102-ALC-R02' '0254' '2014-03-03' '15:02:05' 1 None None None]
Some elements to check:  ['0254' '2014-03-03' '15:02:05']


In [8]:
# Parse every filename and stack the attribute rows into one array
results = loopAndStack(
    fn_list=filename_list,
    pattern=pattern_RINNKO,
    date_form="2",
)
print(results)

[['ASTD102-ALC-R02' '0254' '2014-03-03' ... 1 None None]
 ['ASTD102-ALC-R02' '0254' '2014-03-03' ... None None None]
 ['ASTD102-ALC-R02' '0254' '2014-03-17' ... 1 None None]
 ...
 ['ASTD102-ALC-R02' '0611' '2023-08-23' ... None None None]
 ['ASTD102-ALC-R02' '0611' '2023-08-24' ... 1 None None]
 ['ASTD102-ALC-R02' '0611' '2023-08-24' ... None None None]]


In [9]:
# Merge rows of the same cast into one DataFrame row (raw/csv summed per cast)
print("Note that the DF below is sorted by SondeNumber!")
results_df = mergeStackedList(
    results,
    col_names=['SondeName','SondeNumber','StartDate','StartTime','raw','csv','Latitude','Longitude'],
    groupby=['SondeName','SondeNumber','StartDate','StartTime'],
)
# Lat lon are unknown
results_df['Latitude'] = 'Unknown'
results_df['Longitude'] = 'Unknown'
print(results_df)

Note that the DF below is sorted by SondeNumber!
                                                  raw  csv Latitude Longitude
SondeName       SondeNumber StartDate  StartTime                             
ASTD102-ALC-R02 0143        2016-03-30 09:28:09     1    1  Unknown   Unknown
                                       09:31:12     1    1  Unknown   Unknown
                                       09:36:36     1    1  Unknown   Unknown
                            2016-06-21 08:47:12     1    1  Unknown   Unknown
                            2016-11-21 09:00:52     1    1  Unknown   Unknown
...                                               ...  ...      ...       ...
                0611        2023-08-23 08:03:25     1    1  Unknown   Unknown
                                       08:34:39     1    1  Unknown   Unknown
                                       08:44:54     1    1  Unknown   Unknown
                                       08:56:21     1    1  Unknown   Unknown
               

In [10]:
# Compare the number of parsed files with the number of merged rows
print("Numer of files: ", len(filename_list))
print("Length of DF: ", results_df.shape[0])
# Check whether some rows have no raw or no csv file; if so, print them.
incomplete_data = findIncompleteData(results=results_df)

Numer of files:  5217
Length of DF:  2609
                                                  raw  csv Latitude Longitude
SondeName       SondeNumber StartDate  StartTime                             
ASTD102-ALC-R02 0254        2018-07-19 09:45:19     1    0  Unknown   Unknown
Length of incomplete data: 1


In [11]:
# check lengths
# findIncompleteData may return None when no row is incomplete, so len() is guarded.
# NOTE(review): len(RINKO_fn) counts every listed file, while the table is built from
# the year-/"$"-filtered list — confirm this is intended when year != "all".
checking = checkFileLength(len_fn_raw=len(RINKO_fn),len_dummy=dummy_count,len_results=len(results_df),
                           len_single=0 if incomplete_data is None else len(incomplete_data))

Length matches.
Fn raw: 5217
Total: 5217
Dummy: 0
Files [csv && raw]: 2608
Files [csv // raw]: 1


In [12]:
# save df to csv (only when the length check passed)
if checking:
    save_path = "./output/"
    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(save_path, exist_ok=True)
    results_df.to_csv(f'{save_path}CTD_RINNKO_{year}.csv')
    print(f"Saved, at {save_path}")
else:
    print("No save -> length did not match.")

Saved, at ./output/


# Step 2: compile 震災後 compactCTD data

Extra: dummy CTD files (SNo/BlkNo 99999) are counted and removed with countAndRemove99999() before parsing

In [13]:
############## Configure data path of data below #################
# Folder whose files are listed and parsed in the following cells
dt_path = "../data/CompactCTD Data（震災後）/"
# config year below: single year as a 4-character string (compared with the
# first 4 characters of each filename) or "all"
#year = '2014'
year = "all"
#########################################################

In [14]:
# List every file in the data folder
afterEQ_fn = listOut_fn(dt_path)
# "all" keeps every name that does not start with "$";
# otherwise keep only the names that start with the chosen year
if year == "all":
    filename_list = [name for name in afterEQ_fn if name[0] != "$"]
else:
    filename_list = [name for name in afterEQ_fn if name[0:4] == year]
# check fns && remove dummy files (e.g., 99999 at last 5 digits)
dummy_fn, dummy_count, filename_list = countAndRemove99999(fn_list=filename_list)
print('Length of new list: ', len(filename_list))

Dummy files: ['20200124_144542_SNo_99999_BlkNo_99999.Csv', '20200124_144542_SNo_99999_BlkNo_99999.RAW', '20200228_104715_SNo_99999_BlkNo_99999.Csv', '20200228_104715_SNo_99999_BlkNo_99999.RAW', '20200309_132906_SNo_99999_BlkNo_99999.Csv', '20200309_132906_SNo_99999_BlkNo_99999.RAW', '20200309_133221_SNo_99999_BlkNo_99999.Csv', '20200309_133221_SNo_99999_BlkNo_99999.RAW', '20200309_133603_SNo_99999_BlkNo_99999.Csv', '20200309_133603_SNo_99999_BlkNo_99999.RAW', '20200309_133819_SNo_99999_BlkNo_99999.Csv', '20200309_133819_SNo_99999_BlkNo_99999.RAW', '20200312_111954_SNo_99999_BlkNo_99999.Csv', '20200312_111954_SNo_99999_BlkNo_99999.RAW', '20200312_112351_SNo_99999_BlkNo_99999.Csv', '20200312_112351_SNo_99999_BlkNo_99999.RAW', '20200312_114450_SNo_99999_BlkNo_99999.Csv', '20200312_114450_SNo_99999_BlkNo_99999.RAW']
Length of dummy file: 18
Length of new list:  1661


In [15]:
# testing one single case
print("Checking a sample case to test the code:")
# The attribute row is bound to `attr_list`, not `list`: a module-level `list`
# would hide the builtin for every cell that runs afterwards.
file, attr_list = readFilename(fn=filename_list[1], date_form="2",pattern=pattern_afterEQ)
file.printPattern()
# Check
printAttr(file)
print("The whole list is like: ", attr_list)
print("Some elements to check: ",attr_list[1:4])

Checking a sample case to test the code:
Input pattern: ^(?P<date>\d{8})_(?P<time>\d{6})_SNo_(?P<sno>\d{3})_BlkNo_(?P<blkno>\d{4})\.(?P<filetype>[a-zA-Z]+)$
Date: ('20120306', 'or', '2012-03-06')
Time: ('083509', 'or', '08:35:09')
Sonde Name: TCDKU
Serial number: 103
Sonde ID: None
Block number: 0002
Filetype: Csv
The whole list is like:  ['TCDKU' '103' '0002' '2012-03-06' '08:35:09' None 1 None None]
Some elements to check:  ['103' '0002' '2012-03-06']


In [16]:
# Parse every filename and stack the attribute rows into one array
results = loopAndStack(
    fn_list=filename_list,
    pattern=pattern_afterEQ,
    date_form="2",
)
print(results)

[['TCDKU' '103' '0001' ... 1 None None]
 ['TCDKU' '103' '0002' ... 1 None None]
 ['TCDKU' '103' '0003' ... 1 None None]
 ...
 ['TCDKU' '103' '0145' ... None None None]
 ['TCDKU' '103' '0146' ... 1 None None]
 ['TCDKU' '103' '0146' ... None None None]]


In [17]:
# Merge rows of the same cast into one DataFrame row (raw/csv summed per cast)
print("Note that the DF below is sorted by SondeNumber!")
results_df = mergeStackedList(
    results,
    col_names=['SondeName','SerialNumber(Sno)','BlockNumber(Blkno)','StartDate','StartTime','raw','csv','Latitude','Longitude'],
    groupby=['SondeName','SerialNumber(Sno)','BlockNumber(Blkno)','StartDate','StartTime'],
)

# Lat lon are unknown
results_df['Latitude'] = 'Unknown'
results_df['Longitude'] = 'Unknown'
print(results_df)

Note that the DF below is sorted by SondeNumber!
                                                                     raw  csv  \
SondeName SerialNumber(Sno) BlockNumber(Blkno) StartDate  StartTime             
TCDKU     103               0001               2012-03-05 11:54:37     0    1   
                                               2013-12-02 10:05:04     1    1   
                                               2013-12-06 11:08:02     1    1   
                                               2013-12-13 13:27:48     1    1   
                                               2014-01-06 11:29:00     1    1   
...                                                                  ...  ...   
                            0248               2020-03-11 13:54:06     1    1   
                            0249               2020-03-11 13:54:43     1    1   
                            0250               2020-03-11 13:56:32     1    1   
                            0251               2020-03-11 13

In [18]:
# Compare the number of parsed files with the number of merged rows
print("Numer of files: ", len(filename_list))
print("Length of DF: ", results_df.shape[0])
# Check whether some rows have no raw or no csv file; if so, print them.
incomplete_data = findIncompleteData(results=results_df)

Numer of files:  1661
Length of DF:  840
                                                                     raw  csv  \
SondeName SerialNumber(Sno) BlockNumber(Blkno) StartDate  StartTime             
TCDKU     103               0001               2012-03-05 11:54:37     0    1   
                            0002               2012-03-06 08:35:09     0    1   
                            0003               2012-03-08 09:03:01     0    1   
                            0004               2012-03-09 11:42:31     0    1   
                            0005               2012-03-12 10:34:24     0    1   
                            0006               2012-03-13 08:45:29     0    1   
                            0007               2012-03-14 08:45:26     0    1   
                            0008               2012-03-16 08:56:43     0    1   
                            0009               2012-03-21 08:17:42     0    1   
                            0023               2012-03-22 10:34:40  

In [19]:
# check lengths
# findIncompleteData may return None when no row is incomplete, so len() is guarded.
checking = checkFileLength(len_fn_raw=len(afterEQ_fn),len_dummy=dummy_count,len_results=len(results_df),
                           len_single=0 if incomplete_data is None else len(incomplete_data))

Length matches.
Fn raw: 1679
Total: 1679
Dummy: 18
Files [csv && raw]: 821
Files [csv // raw]: 19


In [20]:
# save df to csv (only when the length check passed)
if checking:
    save_path = "./output/"
    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(save_path, exist_ok=True)
    results_df.to_csv(f'{save_path}CTD_震災後_{year}.csv')
    print(f"Saved, at {save_path}")
else:
    print("No save -> length did not match.")

Saved, at ./output/


# Step 3: (ing) compile 被災したPCより吸い出せたもの compactCTD data

In [21]:
# TODO: list all .csv / .raw files in the directory and its subdirectories
# make two outputs: 1) csv, 2) raw
# note: same pattern as afterEQ (because both are compactCTD)

In [22]:
import os
import numpy as np

def listupAllFiles(dir_name="CompactCTD data（被災したPCより吸い出せたもの）", data_root="../data"):
    """List every file below ``<data_root>/<dir_name>``, print stats, save the list.

    Parameters
    ----------
    dir_name : str
        Name of the data folder to search; also used in the output file name.
    data_root : str
        Folder that contains ``dir_name``.

    Returns
    -------
    list of str
        File names (without directory part) of all files found in the folder
        and its subfolders. The same name can appear more than once when it
        exists in several subfolders.

    Side effects: prints counts of csv / raw / other files and writes the list
    to ``all_files_<dir_name>.txt`` in the current working directory.
    """
    def list_files(directory):
        # Walk through all directories and subdirectories, keeping file names only
        return [name for _root, _dirs, names in os.walk(directory) for name in names]

    def showStats(all_files):
        # Lower-case once so that .csv/.CSV/.Csv (and .raw variants) count alike
        lowered = [name.lower() for name in all_files]
        # count all length
        print(f"Length: {len(all_files)}")
        # count csv length
        csv_count = sum(1 for name in lowered if name.endswith(".csv"))
        print(f"CSV length: {csv_count}")
        # count raw length
        raw_count = sum(1 for name in lowered if name.endswith(".raw"))
        print(f"RAW length: {raw_count}")
        # check not csv nor raw
        other_files = [name for name in all_files if not name.lower().endswith((".csv", ".raw"))]
        print(f"Other length: {len(other_files)}")
        print(f"Other files: {other_files}")

    # Directory to search in
    directory = f"{data_root}/{dir_name}"
    # Get list of all files in directory and subdirectories
    all_files = list_files(directory)
    showStats(all_files)
    # Save the list of files to a text file
    np.savetxt(f'all_files_{dir_name}.txt', all_files, fmt='%s')
    return all_files

# run: from here on filename_list holds the names found by listupAllFiles()
filename_list = listupAllFiles()

Length: 3419
CSV length: 726
RAW length: 2690
Other length: 3
Other files: ['20091002_082231_SNo_103_BlkNo_0072.RAW\u3000七戻', '20091002_085758_SNo_103_BlkNo_0073.RAW\u3000鵜住居', '20091002_091112_SNo_103_BlkNo_0074.RAW 大槌港']


In [23]:
import re

def checkFilesAndPattern(all_files,pattern,show_not_matched=False):
    """Return the names in ``all_files`` that match ``pattern``.

    Prints how many names matched and how many did not; with
    ``show_not_matched=True`` the non-matching names are printed as well.
    """
    regex = re.compile(pattern)
    matched_files, not_matched_files = [], []
    # Sort each name into one of the two buckets
    for file_name in all_files:
        if regex.match(file_name):
            matched_files.append(file_name)
        else:
            not_matched_files.append(file_name)
    # show length
    print(f"Number of files matched: {len(matched_files)}")
    # show not matched
    print(f"Number of files NOT matched: {len(not_matched_files)}")
    if show_not_matched:
        # show NOT matched files
        print(f"Files NOT matched: {not_matched_files}")
    return matched_files

# run: keep only the names that match the compactCTD pattern
matched_files = checkFilesAndPattern(all_files=filename_list,pattern=pattern_afterEQ,show_not_matched=False)
print(f"Pattern to be used: {pattern_afterEQ}")

Number of files matched: 3380
Number of files NOT matched: 39
Pattern to be used: ^(?P<date>\d{8})_(?P<time>\d{6})_SNo_(?P<sno>\d{3})_BlkNo_(?P<blkno>\d{4})\.(?P<filetype>[a-zA-Z]+)$


In [24]:
# check if files are redundant
from collections import Counter

def checkFilesRedundent(all_files):
    """Count file names that occur more than once in ``all_files``.

    Returns
    -------
    tuple
        ``(n, total_repeated)``: ``n`` is the number of distinct names that
        occur at least twice, ``total_repeated`` the number of extra copies
        (occurrences beyond the first) summed over those names.
    """
    file_name_counter = Counter(all_files)
    # Occurrence counts of the names that appear at least twice
    repeated_counts = [count for count in file_name_counter.values() if count >= 2]
    n = len(repeated_counts)
    total_repeated = sum(repeated_counts) - n
    total = sum(file_name_counter.values())
    # The condition is "two or more", i.e. more than once
    print(f"Total {n} files appeared more than once.")
    print(f"Extra {total_repeated} should be considered (aka dummy files)")
    print(f"Total {total} (including redundant files)")
    return n, total_repeated
# run: dummy_n receives the number of extra copies of repeated file names
repeated_file_n, dummy_n = checkFilesRedundent(all_files=matched_files)

Total 431 files appeared more than twice.
Extra 1082 should be considered (aka dummy files)
Total 3380 (including redundant files)


In [25]:
# only handle matched files: names that do not match the pattern are not parsed
filename_list = matched_files

In [26]:
# Parse every filename and stack the attribute rows into one array
results = loopAndStack(
    fn_list=filename_list,
    pattern=pattern_afterEQ,
    date_form="2",
)
print(results)
print(results.shape)

[['TCDKU' '103' '0002' ... None None None]
 ['TCDKU' '103' '0001' ... None None None]
 ['TCDKU' '103' '0001' ... None None None]
 ...
 ['TCDKU' '103' '0003' ... None None None]
 ['TCDKU' '103' '0013' ... None None None]
 ['TCDKU' '103' '0014' ... None None None]]
(3380, 9)


In [27]:
# Merge rows of the same cast into one DataFrame row (raw/csv summed per cast)
print("Note that the DF below is sorted by SondeNumber!")
results_df = mergeStackedList(
    results,
    col_names=['SondeName','SerialNumber(Sno)','BlockNumber(Blkno)','StartDate','StartTime','raw','csv','Latitude','Longitude'],
    groupby=['SondeName','SerialNumber(Sno)','BlockNumber(Blkno)','StartDate','StartTime'],
)

# Lat lon are unknown
results_df['Latitude'] = 'Unknown'
results_df['Longitude'] = 'Unknown'
print(results_df)
print(results_df.shape) # note: from 3380 to 1527 (830 are singular)

Note that the DF below is sorted by SondeNumber!
                                                                     raw  csv  \
SondeName SerialNumber(Sno) BlockNumber(Blkno) StartDate  StartTime             
TCDKU     103               0001               2006-11-19 23:50:24     1    0   
                                               2007-04-04 15:58:55     1    0   
                                               2007-05-08 08:49:26     1    1   
                                               2007-05-22 08:25:08     1    1   
                                               2007-05-29 08:38:36     2    1   
...                                                                  ...  ...   
                            0292               2009-12-03 09:58:32     1    0   
                            0293               2009-12-07 08:24:48     1    1   
                            0294               2009-12-07 08:48:30     1    1   
                            0295               2009-12-07 08

In [28]:
# Compare the number of parsed files with the number of merged rows
print("Numer of files: ", len(filename_list))
print("Length of DF: ", results_df.shape[0])
# Check whether some rows have no raw or no csv file; if so, print them.
incomplete_data = findIncompleteData(results=results_df)

Numer of files:  3380
Length of DF:  1527
                                                                     raw  csv  \
SondeName SerialNumber(Sno) BlockNumber(Blkno) StartDate  StartTime             
TCDKU     103               0001               2006-11-19 23:50:24     1    0   
                                               2007-04-04 15:58:55     1    0   
                                               2007-07-13 13:31:52     1    0   
                                               2007-08-19 09:25:30     1    0   
                                               2007-11-08 15:58:22     1    0   
...                                                                  ...  ...   
                            0289               2009-11-30 09:38:43     1    0   
                            0290               2009-12-01 11:32:44     1    0   
                            0291               2009-12-02 10:00:02     1    0   
                            0292               2009-12-03 09:58:32 

In [29]:
# check lengths
# Here the "dummy" count is the number of extra copies of repeated file names
dummy_count = dummy_n
# findIncompleteData may return None when no row is incomplete, so len() is guarded.
checking = checkFileLength(len_fn_raw=len(matched_files),len_dummy=dummy_count,len_results=len(results_df),
                           len_single=0 if incomplete_data is None else len(incomplete_data))
# TODO: raw matched files are 3380, but output is 3306, why? -> what happended in mergeStackedList() step?
# NOTE(review): possible cause — DataFrame.groupby drops rows whose key is None by default
# (dropna=True), and sonde_name stays None for files whose SNo is not '103'; verify.
print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
print("TODO: raw matched files are 3380, but output is 3306")

Length does not match!!
Fn raw: 3380
Total: 3306
Dummy: 1082
Files [csv && raw]: 697
Files [csv // raw]: 830
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
TODO: raw matched files are 3380, but output is 3306


In [30]:
# Cross-check the length: total number of raw and csv files counted in the table
raw_sum = np.sum(results_df["raw"])
csv_sum = np.sum(results_df["csv"])
print(raw_sum,csv_sum,raw_sum+csv_sum)
# Show the casts that have more than two raw files
print(results_df[results_df['raw']>2])

2603 703 3306
                                                                     raw  csv  \
SondeName SerialNumber(Sno) BlockNumber(Blkno) StartDate  StartTime             
TCDKU     103               0001               2009-07-15 08:46:26     6    0   
                            0002               2009-07-20 10:55:50     6    0   
                            0003               2009-07-20 11:10:25     6    0   
                            0004               2009-07-20 14:51:01     6    0   
                            0005               2009-07-21 14:08:00     6    0   
...                                                                  ...  ...   
                            0261               2009-11-10 09:21:45     4    1   
                            0262               2009-11-10 09:36:30     4    1   
                            0263               2009-11-10 09:45:48     4    0   
                            0264               2009-11-11 09:45:03     3    1   
              

In [31]:
# save df to csv
# The guarded version below is disabled, so the table is saved even though the
# length check above did not pass.
#if checking:
#    save_path = "./output/"
#    results_df.to_csv(f'{save_path}CTD_被災したPCより吸い出せたもの_{year}.csv')
#    print(f"Saved, at {save_path}")
#else:
#    print("No save -> length did not match.")

save_path = "./output/"
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(save_path, exist_ok=True)
results_df.to_csv(f'{save_path}CTD_被災したPCより吸い出せたもの_all.csv')
print(f"Saved, at {save_path}")

Saved, at ./output/


End of code.