In [1]:
import numpy as np
import pandas as pd
import os
import csv

In [2]:
# Step 1: fixed-width layout of a data record.
# Record labels that are kept when the file is read.
FIELDS = (
    'VMAX', 'MSLP', 'TYPE', 'LAT', 'LON', 'CD20', 'COHC', 'DTL',
    'OAGE', 'RSST', 'U200', 'U20C', 'V20C', 'E000', 'RHLO', 'RHMD',
    'RHHI', 'PSLV', 'Z850', 'D200', 'REFC', 'PEFC', 'T000', 'R000',
    'TLAT', 'TLON', 'TWAC', 'G150', 'G200', 'G250', 'V000', 'V850',
    'V500', 'V300', 'TGRD', 'TADV', 'PENC', 'SHDC', 'SDDC', 'SHGC',
    'DIVC', 'T150', 'T200', 'T250', 'SHRD', 'SHTD', 'SHRS', 'SHTS',
    'SHRG', 'PENV', 'VMPI', 'VMFX', 'HE05', 'O500', 'O700', 'CFLX',
)

# A data record is cut into 25 consecutive columns, each 5 characters wide.
DELIMITER = (5,) * 25

# Running column offsets (0, 5, 10, ...) turned into one slice per column.
idx = np.cumsum([0, *DELIMITER])
slices = [slice(start, stop) for start, stop in zip(idx, idx[1:])]

def parse(line, col_index=2):
    """Cut a fixed-width data line and return ``[value, label]``.

    The line is split with the module-level ``slices``; every piece is
    stripped and passed through ``to_num``.

    Args:
        line: One data record as a string.
        col_index: Position of the column whose value is returned.

    Returns:
        A two-element list: the piece at ``col_index`` followed by the piece
        at position 23 (the record label).
    """
    columns = []
    for piece in slices:
        columns.append(to_num(line[piece].strip()))
    return [columns[col_index], columns[23]]

# Column widths of a header record, left to right.
DELIMITER_HEADER = (5, 7, 3, 5, 7, 7, 5, 9)

# Running offsets of the header columns turned into one slice per column.
idx_h = np.cumsum([0, *DELIMITER_HEADER])
slices_h = [slice(start, stop) for start, stop in zip(idx_h, idx_h[1:])]

def parse_header(line):
    """Split a header line into its fixed-width fields.

    The line is cut with the module-level ``slices_h`` and each piece is
    stripped of surrounding whitespace. Values are returned as strings.
    """
    fields = []
    for piece in slices_h:
        fields.append(line[piece].strip())
    return fields

def to_num(val):
    """Return ``val`` converted with ``int()`` when possible.

    If ``int(val)`` raises ``ValueError`` the input is handed back unchanged.
    """
    try:
        number = int(val)
    except ValueError:
        return val
    else:
        return number

In [3]:
#Step 2
# Load data from file.
# A line containing "HEAD" starts a new storm case, a line containing "LAST"
# closes it, and every other line is a data record. parse() returns each
# record as [value, label]; only records whose label is in FIELDS are kept.
storms = []
storm = {'header': None, 'data': []}
storm_set = []
bln_process = True

with open('./data/lsdiaga_1982_2017_sat_ts.dat', newline='') as csvfile:
    # '>' is used as the delimiter so that each physical line comes back as a
    # single-element row (assumes '>' never occurs in the file).
    datareader = csv.reader(csvfile, delimiter='>')
    for row in datareader:
        # csv.reader yields [] for a blank line; skip it rather than fail on row[0].
        if not row:
            continue
        line = row[0]

        if "HEAD" in line:
            # e.g. ['ALBE', '820602', '12', '20', '21.7', '87.1', '1005', 'AL011982']
            storm_header = parse_header(line)
            storm_id = storm_header[7]
            storm['header'] = storm_header
            # A fresh list per case, so the shallow copies stored in `storms`
            # never share their data with a later case.
            storm['data'] = []
            if storm_id not in storm_set:
                storm_set.append(storm_id)
        elif "LAST" in line:
            storms.append(storm.copy())
        else:
            row_val = parse(line)
            if row_val[1] == 'PSLV':
                # The PSLV record contributes three values: column 2 as 'PSLV',
                # column 3 as 'PSLV1' and column 4 as 'PSLV2'.
                storm['data'].append(row_val)
                for col_index, suffix in ((3, '1'), (4, '2')):
                    extra_val = parse(line, col_index)
                    extra_val[1] = extra_val[1] + suffix
                    storm['data'].append(extra_val)
            elif row_val[1] in FIELDS:
                storm['data'].append(row_val)

In [4]:
#Step 3: Create Pandas Dataframe
# Build one single-row DataFrame per storm case: the record labels become the
# columns, preceded by ID, NAME, DATETIME, YEAR and REGION.
storm_data = []
# Not referenced anywhere in this cell; kept in case another cell relies on it.
labels = ['0 - 9', '10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90 - 99']
for storm_item in storms:
    storm_name, dt, dt_time, storm_id = [storm_item['header'][index] for index in [0, 1, 2, 7]]

    # The header date starts with a two-digit year: values below 82 are read
    # as 20xx, everything else as 19xx.
    yr_val = dt[:2]
    yr = int(yr_val)
    century = '20' if yr < 82 else '19'
    yr_val = int(century + yr_val)

    dt = century + str(dt) + ' ' + str(dt_time)
    try:
        dt = pd.to_datetime(dt, format='%Y%m%d %H')
    except (ValueError, TypeError):
        # Same fallback the deprecated errors='ignore' provided: keep the
        # unparsed string instead of aborting the whole load.
        pass

    # storm_item['data'] is a list of [value, label] pairs. Transposing gives
    # row 0 = values and row 1 = labels; the labels are promoted to column
    # names and then dropped, leaving a single row of values.
    df_storm = pd.DataFrame(storm_item['data'])
    df_storm = df_storm.T
    df_storm.columns = df_storm.iloc[1]
    df_storm = df_storm.drop(1)
    df_storm.insert(0, 'ID', storm_id)
    df_storm.insert(1, 'NAME', storm_name)
    df_storm.reset_index(drop=True, inplace=True)
    df_storm.insert(2, 'DATETIME', dt)
    df_storm.insert(3, 'YEAR', yr_val)
    df_storm.insert(4, "REGION", 'Atlantic')
    storm_data.append(df_storm)



In [5]:
#Step 4
# Stack the per-storm frames row-wise into one table and display its dimensions.
df_storms = pd.concat(storm_data, axis=0)
df_storms.shape

(12526, 63)

In [6]:
#Step 5
# Replace the 9999 placeholder with NaN.
df_storms = df_storms.replace(to_replace=9999, value=np.nan)

# Drop every storm ID that has a NaN in any column of any of its rows.
df_nulls = df_storms[df_storms.isnull().any(axis=1)]
ids = df_nulls['ID'].unique()
# .copy() makes the filtered frame independent, so adding the CATEGORY column
# below is a plain assignment and not a write into a slice of the old frame.
df_storms = df_storms.loc[~ df_storms['ID'].isin(ids)].copy()

# Label each row with a category derived from VMAX.
# Earlier string labels for the same bins: 'TD', 'TS', 'Cat1' ... 'Cat5'.
# pd.cut bins are right-closed: (0, 33] -> -1, (33, 63] -> 0, (63, 82] -> 1,
# (82, 95] -> 2, (95, 112] -> 3, (112, 136] -> 4, (136, 3000] -> 5.
# A VMAX outside (0, 3000] gets NaN as its category.
cat_labels = [-1, 0, 1, 2, 3, 4, 5]
cat_bins = [0, 33, 63, 82, 95, 112, 136, 3000]
df_storms['CATEGORY'] = pd.cut(df_storms['VMAX'], bins=cat_bins, labels=cat_labels)
df_storms = df_storms.reset_index(drop=True)

# Spot check: show the rows of one storm.
df_storms.loc[df_storms.ID == 'AL272005']

1,ID,NAME,DATETIME,YEAR,REGION,VMAX,MSLP,TYPE,LAT,LON,...,SHTS,SHRG,PENV,VMPI,VMFX,HE05,O500,O700,CFLX,CATEGORY
6703,AL272005,BETA,2005-10-26 18:00:00,2005,Atlantic,30,1007.0,1,104,809,...,7,152,117,167,1078,30,-91,-109,-3,-1
6704,AL272005,BETA,2005-10-27 00:00:00,2005,Atlantic,30,1007.0,1,107,811,...,23,165,107,168,1278,35,-68,-74,25,-1
6705,AL272005,BETA,2005-10-27 06:00:00,2005,Atlantic,35,1005.0,1,110,813,...,25,173,121,168,1326,10,-78,-101,32,0
6706,AL272005,BETA,2005-10-27 12:00:00,2005,Atlantic,45,1000.0,1,113,813,...,29,101,121,170,1305,5,-96,-104,25,0
6707,AL272005,BETA,2005-10-27 18:00:00,2005,Atlantic,50,997.0,1,116,813,...,9,106,116,163,1097,10,-94,-108,-12,0
6708,AL272005,BETA,2005-10-28 00:00:00,2005,Atlantic,50,995.0,1,119,812,...,8,174,109,162,1234,17,-74,-86,55,0
6709,AL272005,BETA,2005-10-28 06:00:00,2005,Atlantic,50,994.0,1,123,811,...,15,175,113,163,1287,24,-80,-99,91,0
6710,AL272005,BETA,2005-10-28 12:00:00,2005,Atlantic,50,993.0,1,128,810,...,39,133,117,165,1204,9,-93,-108,78,0
6711,AL272005,BETA,2005-10-28 18:00:00,2005,Atlantic,50,991.0,1,132,811,...,26,147,114,154,939,7,-89,-100,-19,0
6712,AL272005,BETA,2005-10-29 00:00:00,2005,Atlantic,60,988.0,1,134,812,...,338,153,108,156,1066,15,-75,-81,68,0


In [8]:
# Write df_storms to a CSV file, leaving out the index column.
df_storms.to_csv(path_or_buf='./data/atlantic_out.csv', index=False)