In [1]:
import numpy as np
import pandas as pd
import glob
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
#function inputs:
data_folder = '..\Data\Level_0\Falkor19\KT15'
output_path = '..\Data\Level_1\Falkor19\KT15'
sea_serial = 7417
sky_serial = 7409
experiment = 'Falkor19'

In [3]:
files = glob.glob(data_folder + '\*.txt')
files[1]

'..\\Data\\Level_0\\Falkor19\\KT15\\2019_Falkor_326_065848.txt'

In [6]:
#loop through filenames and import to pandas dataframes
import time 

start = time.time()
df = (pd.read_csv(f, delimiter='\s+', skiprows=1, header=None,                                                  #treat whitespace as the delimeter, ignore the header line
                  usecols=[0,1,2,3,4,5], names=['Date','Time','SeaRef','SeaTemp','SkyRef','SkyTemp'],           #use the first 6 columns, and name them as specified
                  parse_dates={'DateTime':[0,1]}, index_col=0,                                                  #parse the first two columns as a single DateTime, and make it the index column
                  na_values=['AMB','TIMEOUT','ERROR'],                                                          #list of other things the parser might encounter in these files, that should be treated like NaNs
                  dtype={'SeaRef':np.float64, 'SeaTemp':np.float64, 'SkyRef':np.float64, 'SkyTemp':np.float64}, #explicitly specify that data columns must be 64-bit floating point numbers
                  error_bad_lines=False, warn_bad_lines=True).dropna(axis='index',how='any',inplace=True)       #drop any bad lines or rows with NaN in them
      for f in glob.glob(data_folder + '\*.txt'))                                                               #iterate over all the text files found in the data_folder
end = time.time()
print(end-start)

0.0009961128234863281


In [7]:
start = time.time()
kt = df.concat()
end = time.time()
print(end-start)

AttributeError: 'generator' object has no attribute 'concat'

In [8]:
from tqdm import tqdm


### Doing it with a classic for loop
takes about 7 mins

In [10]:
from tqdm import tqdm
files = glob.glob(data_folder + '\*.txt')
li = []
start = time.time()
for filepath in tqdm(files):
    df = pd.read_csv(filepath,                                                                                     #filename to read in
                     delimiter='\s+', skiprows=1, header=None,                                                     #treat whitespace as the delimeter, ignore the header line
                     usecols=[0,1,2,3,4,5], names=['Date','Time','SeaRef','SeaTemp','SkyRef','SkyTemp'],           #use the first 6 columns, and name them as specified
                     parse_dates={'DateTime':[0,1]}, index_col=0,                                                  #parse the first two columns as a single DateTime, and make it the index column
                     na_values=['AMB','TIMEOUT','ERROR'],                                                                            #list of other things the parser might encounter in these files, that should be treated like NaNs
                     dtype={'SeaRef':np.float64, 'SeaTemp':np.float64, 'SkyRef':np.float64, 'SkyTemp':np.float64}, #explicitly specify that data columns must be 64-bit floating point numbers
                     error_bad_lines=False, warn_bad_lines=True)                                                   #if there is a bad line in the data file, drop it from the file and show a warning, but continue parsing
    df.dropna(axis='index',how='any',inplace=True)                                                                 #drop any rows that have a NaN value in them
    
    li.append(df)

end = time.time()
print(end-start)


  0%|                                                                                           | 0/22 [00:00<?, ?it/s]
  5%|███▊                                                                               | 1/22 [00:15<05:18, 15.18s/it]
  9%|███████▌                                                                           | 2/22 [00:27<04:48, 14.43s/it]
 14%|███████████▎                                                                       | 3/22 [00:42<04:34, 14.47s/it]
 18%|███████████████                                                                    | 4/22 [00:49<03:40, 12.24s/it]
 23%|██████████████████▊                                                                | 5/22 [00:55<02:57, 10.42s/it]
 27%|██████████████████████▋                                                            | 6/22 [01:10<03:05, 11.61s/it]
 32%|██████████████████████████▍                                                        | 7/22 [01:25<03:11, 12.75s/it]
 36%|██████████████████████████████▏   

403.7713885307312


In [13]:
start = time.time()
kt = pd.concat(li, axis=0)
end = time.time()
print(end-start)

0.10170221328735352


In [15]:
start = time.time()
kt.to_pickle(output_path+f'\{experiment}_KT15_{sea_serial}_{sky_serial}.p')
end = time.time()
print(end-start)

0.3690192699432373


In [16]:
import xarray
ds = xarray.Dataset.from_dataframe(kt)
ds.attrs['experiment'] = experiment
ds.attrs['sea_serial'] = sea_serial
ds.attrs['sky_serial'] = sky_serial
ds

<xarray.Dataset>
Dimensions:   (DateTime: 2972671)
Coordinates:
  * DateTime  (DateTime) datetime64[ns] 2019-11-21T04:34:36.153000 ... 2019-12-18T19:56:40.848000
Data variables:
    SeaRef    (DateTime) float64 31.51 31.51 31.51 31.51 ... 35.36 35.36 35.36
    SeaTemp   (DateTime) float64 27.94 27.98 27.9 27.92 ... 28.4 28.3 28.26
    SkyRef    (DateTime) float64 31.69 31.69 31.69 31.69 ... 35.2 35.2 35.19
    SkyTemp   (DateTime) float64 22.4 22.38 22.31 22.33 ... -2.39 -2.39 -2.5
Attributes:
    experiment:  Falkor19
    sea_serial:  7417
    sky_serial:  7409

In [18]:
ds.to_netcdf(output_path+f'/{experiment}_KT15_{sea_serial}_{sky_serial}.cdf')

In [None]:
ds = xarray.open_dataset('test.cdf')
ds