In [3]:
import pandas as pd
import numpy as np

In [3]:
#combine multiple columns of wind speed/pressure from different agencies into one column with priority

def load_agency_order(df, target, order):
    df['COMBINE_%s'%target] = np.nan

    for agency in order:
        nan_before = df['COMBINE_%s'%target].isna().sum()
        df['COMBINE_%s'%target] = df['COMBINE_%s'%target].combine_first(df[agency])   #combine winds from different agencies with order if available
        nan_after = df['COMBINE_%s'%target].isna().sum()
    
    return df


In [4]:
def load_IBTrACS_202510(basin, dt):
    WP_file = pd.read_csv('data/IBTrACS_202510/ibtracs.WP.list.v04r01.csv')
    WP_file.drop(index=0, inplace=True)
    WP_file['YEAR'] = WP_file['SEASON'].astype(int)
    WP_file = WP_file[WP_file['YEAR']>1981]
    WP_file = WP_file[WP_file['YEAR']<2025]

    EP_file = pd.read_csv('data/IBTrACS_202510/ibtracs.EP.list.v04r01.csv')
    EP_file.drop(index=0, inplace=True)
    EP_file['YEAR'] = EP_file['SEASON'].astype(int)
    EP_file = EP_file[EP_file['YEAR']>1981]
    EP_file = EP_file[EP_file['YEAR']<2025]

    agency_winds_pres = ['USA_WIND', 'WMO_WIND', 'CMA_WIND', 'HKO_WIND', 'TOKYO_WIND', 'USA_PRES', 'WMO_PRES', 'CMA_PRES', 'HKO_PRES', 'TOKYO_PRES']

    IB_data = pd.concat([WP_file, EP_file], axis=0).drop_duplicates(subset=(['SID', 'USA_ATCF_ID', 'ISO_TIME', 'NATURE']+agency_winds_pres))    #remove duplicate rows
    basin_ID = basin


    IB_data['ISO_TIME'] = pd.to_datetime(IB_data['ISO_TIME'])

    if dt == 6:
        valid_hours = [6, 12, 18, 0]
        IB_data = IB_data[IB_data['ISO_TIME'].dt.hour.isin(valid_hours)]     #select only valid hours
        IB_data = IB_data[IB_data['ISO_TIME'].dt.minute == 0]
    elif dt == 3:
        valid_hours = [0, 3, 6, 9, 12, 15, 18, 21]
        IB_data = IB_data[IB_data['ISO_TIME'].dt.hour.isin(valid_hours)]     #select only valid hours
        IB_data = IB_data[IB_data['ISO_TIME'].dt.minute == 0]
    elif dt == 'all':
        pass
    else:
        print('%s Invalid dt: %s'%(basin, dt))
        return

    
    basin_TC_ID = {}         #fill in missing IDs
    for TC_SID in IB_data['SID'].unique():
        TC_ID_list = sorted(IB_data[IB_data['SID'] == TC_SID]['USA_ATCF_ID'].unique())
        TC_ID_list = [i for i in TC_ID_list if not i.isspace()]
        if len(TC_ID_list) == 0:          #this SID has no corresponding ID
            continue
        elif len(TC_ID_list) > 1:         #this SID has more than 1 matching IDs
            USA_ids = [id for id in TC_ID_list if basin_ID in id]
            if len(USA_ids) > 0:
                # Update all records for this SID to use the first USA_ATCF_ID
                IB_data.loc[IB_data['SID'] == TC_SID, 'USA_ATCF_ID'] = USA_ids[0]
                basin_TC_ID[TC_SID] = USA_ids[0]
        else:                             #this SID matches with 1 ID
            basin_TC_ID[TC_SID] = TC_ID_list[0]
    IB_data['USA_ATCF_ID'] = IB_data['SID'].apply(lambda x: basin_TC_ID[x] if (x in basin_TC_ID) else ' ')      #fill in missing IDs according to SID
    IB_data = IB_data[IB_data['USA_ATCF_ID'].str.contains(basin_ID)]              #select only TCs in the basin of interest
    IB_data.reset_index(drop=True, inplace=True)

    for TC_ID in IB_data['USA_ATCF_ID'].unique():     # separate multiple IDs for the same USA ID
        TC_SID_list = IB_data[IB_data['USA_ATCF_ID'] == TC_ID]['SID'].unique()
        if len(TC_SID_list) > 1:
            for sid in TC_SID_list:  # Add suffix to differentiate multiple IDs for same SID
                mask = (IB_data['SID'] == sid) & (IB_data['USA_ATCF_ID'] == TC_ID)
                IB_data.loc[mask, 'USA_ATCF_ID'] = TC_ID + '_' + str(TC_SID_list.tolist().index(sid) + 1)

    #combine winds from different agencies
    IB_data['WMO_WIND'] = IB_data['WMO_WIND'].replace(r'^\s*$', np.nan, regex=True)     #convert empty strings to nan
    IB_data['WMO_WIND'] = IB_data['WMO_WIND'].astype(float)
    IB_data['USA_WIND'] = IB_data['USA_WIND'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['USA_WIND'] = IB_data['USA_WIND'].astype(float)
    IB_data['TOKYO_WIND'] = IB_data['TOKYO_WIND'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['TOKYO_WIND'] = IB_data['TOKYO_WIND'].astype(float)
    IB_data['CMA_WIND'] = IB_data['CMA_WIND'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['CMA_WIND'] = IB_data['CMA_WIND'].astype(float)
    IB_data['HKO_WIND'] = IB_data['HKO_WIND'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['HKO_WIND'] = IB_data['HKO_WIND'].astype(float)
    
    agency_wind_NA = ['USA_WIND']         #sorted by number of available data from each agency in target basin, see check_VMAX function below in IBTrACS_TS_new.ipynb
    agency_wind_EP = ['USA_WIND', 'CMA_WIND', 'HKO_WIND']
    agency_wind_WP = ['USA_WIND', 'CMA_WIND', 'HKO_WIND', 'TOKYO_WIND', 'WMO_WIND']
    agency_wind_orders = {'NA': agency_wind_NA, 'EP': agency_wind_EP, 'WP': agency_wind_WP}
    IB_data = load_agency_order(IB_data, 'WIND', agency_wind_orders[basin])

    #combine pressure from different agencies
    IB_data['WMO_PRES'] = IB_data['WMO_PRES'].replace(r'^\s*$', np.nan, regex=True)     #convert empty strings to nan
    IB_data['WMO_PRES'] = IB_data['WMO_PRES'].astype(float)
    IB_data['USA_PRES'] = IB_data['USA_PRES'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['USA_PRES'] = IB_data['USA_PRES'].astype(float)
    IB_data['TOKYO_PRES'] = IB_data['TOKYO_PRES'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['TOKYO_PRES'] = IB_data['TOKYO_PRES'].astype(float)
    IB_data['CMA_PRES'] = IB_data['CMA_PRES'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['CMA_PRES'] = IB_data['CMA_PRES'].astype(float)
    IB_data['HKO_PRES'] = IB_data['HKO_PRES'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['HKO_PRES'] = IB_data['HKO_PRES'].astype(float)

    agency_pres_NA = ['USA_PRES']         #sorted by number of available data from each agency in target basin
    agency_pres_EP = ['USA_PRES', 'TOKYO_PRES', 'HKO_PRES', 'CMA_PRES', 'WMO_PRES']
    agency_pres_WP = ['TOKYO_PRES', 'CMA_PRES', 'USA_PRES', 'HKO_PRES', 'WMO_PRES']
    agency_pres_orders = {'NA': agency_pres_NA, 'EP': agency_pres_EP, 'WP': agency_pres_WP}
    IB_data = load_agency_order(IB_data, 'PRES', agency_pres_orders[basin])

    

    IB_data['LANDFALL'] = IB_data['LANDFALL'].replace(r'^\s*$', np.nan, regex=True)
    IB_data['LANDFALL'] = IB_data['LANDFALL'].astype(float)
    IB_data = IB_data.drop_duplicates(subset=(['SID', 'USA_ATCF_ID', 'ISO_TIME', 'NATURE', 'COMBINE_WIND', 'COMBINE_PRES']))
        

    return IB_data


#save the processed datasets

EP_data_202510 = load_IBTrACS_202510('EP', 3)
WP_data_202510 = load_IBTrACS_202510('WP', 3)

EP_data_202510.to_csv('processed_data/IBTrACS_EP_processed_dt3_202510.csv')
WP_data_202510.to_csv('processed_data/IBTrACS_WP_processed_dt3_202510.csv')


    

  WP_file = pd.read_csv('data/IBTrACS_202510/ibtracs.WP.list.v04r01.csv')
  EP_file = pd.read_csv('data/IBTrACS_202510/ibtracs.EP.list.v04r01.csv')
  WP_file = pd.read_csv('data/IBTrACS_202510/ibtracs.WP.list.v04r01.csv')
  EP_file = pd.read_csv('data/IBTrACS_202510/ibtracs.EP.list.v04r01.csv')


In [4]:
def TC_properties(basin):        
    IB_data = pd.read_csv('processed_data/IBTrACS_%s_processed_dt3_202510.csv'%basin)
    IB_data['ISO_TIME'] = pd.to_datetime(IB_data['ISO_TIME'])

    TC_properties_df = pd.DataFrame(columns=['year', 'ID', 'start_V', 'end_V', 'LMI', 'duration_before_LMI', 'duration_after_LMI', 'intensity_range_1', \
                                            'intensity_range_2', 'total_intensity_range', 'start_P', 'end_P', 'LMP', 'type', 'start_lat', 'LMI_lat', 'end_lat'])  #LMP is minimum pressure

    ID_unique = sorted(IB_data['USA_ATCF_ID'].unique())        

    all_nan_wind = 0
    missing_wind = 0
    all_nan_pressure = 0
    missing_pressure = 0
    less_than_3_VMAX = 0
    intensity_range_1_equals_0 = 0
    intensity_range_2_equals_0 = 0
    intensity_range_total_equals_0 = 0
    no_duration_before_LMI = 0
    no_duration_after_LMI = 0
    
    
    for TC_ID in ID_unique:
        TC_data = IB_data[IB_data['USA_ATCF_ID'] == TC_ID].sort_values(by='ISO_TIME').reset_index(drop=True)
        if len(TC_data) > 0:
            TC_year = TC_data['YEAR'].iloc[0]
            if len(TC_data[TC_data['COMBINE_WIND'] >= 34]) > 0:
                start_idx = TC_data[TC_data['COMBINE_WIND'] >= 34].index[0]
                if len(TC_data[TC_data['NATURE'] == 'TS']) > 0:    
                    end_idx = TC_data[TC_data['NATURE'] == 'TS'].index[-1]
                    if end_idx > start_idx:
                        TC_data = TC_data.iloc[start_idx:end_idx+1].reset_index(drop=True)
            
                        TC_duration = (TC_data['ISO_TIME'].iloc[-1] - TC_data['ISO_TIME'].iloc[0]).total_seconds()/3600 + 3

                        if TC_data['COMBINE_WIND'].count() == 0:         #if all wind speed data points are nan
                            all_nan_wind += 1
                        elif TC_data['COMBINE_WIND'].count() < len(TC_data):
                            missing_wind += 1
                        if TC_data['COMBINE_WIND'].count() < 3:                 
                            less_than_3_VMAX += 1
                        if TC_data['COMBINE_PRES'].count() == 0:         
                            all_nan_pressure += 1
                            TC_start_P = np.nan
                            TC_end_P = np.nan
                            TC_LMP = np.nan
                        elif TC_data['COMBINE_PRES'].count() < len(TC_data):
                            missing_pressure += 1
                            TC_start_P = TC_data['COMBINE_PRES'].iloc[TC_data['COMBINE_PRES'].first_valid_index()]
                            TC_end_P = TC_data['COMBINE_PRES'].iloc[TC_data['COMBINE_PRES'].last_valid_index()]
                            TC_LMP = TC_data['COMBINE_PRES'].min()
                        else:
                            TC_start_P = TC_data['COMBINE_PRES'].iloc[TC_data['COMBINE_PRES'].first_valid_index()]
                            TC_end_P = TC_data['COMBINE_PRES'].iloc[TC_data['COMBINE_PRES'].last_valid_index()]
                            TC_LMP = TC_data['COMBINE_PRES'].min()
                        if TC_duration < 100:
                            TC_type = 'short'
                        elif TC_duration < 200:
                            TC_type = 'medium'
                        else:
                            TC_type = 'long'


                        TC_start_V = TC_data['COMBINE_WIND'].iloc[TC_data['COMBINE_WIND'].first_valid_index()]
                        TC_end_V = TC_data['COMBINE_WIND'].iloc[TC_data['COMBINE_WIND'].last_valid_index()]
                        TC_start_lat = TC_data['LAT'].iloc[TC_data['LAT'].first_valid_index()]
                        TC_end_lat = TC_data['LAT'].iloc[TC_data['LAT'].last_valid_index()]
                        TC_LMI_idx = np.nanargmax(TC_data['COMBINE_WIND'].values)
                        TC_LMI = TC_data['COMBINE_WIND'].iloc[TC_LMI_idx]
                        TC_LMI_lat = TC_data['LAT'].iloc[TC_LMI_idx]
                        TC_duration_before_LMI = (TC_data['ISO_TIME'].iloc[TC_LMI_idx] - TC_data['ISO_TIME'].iloc[0]).total_seconds()/3600 + 1.5     
                        TC_duration_after_LMI = (TC_data['ISO_TIME'].iloc[-1] - TC_data['ISO_TIME'].iloc[TC_LMI_idx]).total_seconds()/3600 + 1.5
                        TC_intensity_range_1 = TC_LMI - TC_start_V
                        TC_intensity_range_2 = TC_LMI - TC_end_V
                        TC_intensity_range_total = TC_intensity_range_1 + TC_intensity_range_2

                        if TC_duration_before_LMI == 0:
                            no_duration_before_LMI += 1
                        if TC_duration_after_LMI == 0:
                            no_duration_after_LMI += 1
                        if TC_intensity_range_1 == 0:
                            intensity_range_1_equals_0 += 1
                        if TC_intensity_range_2 == 0:
                            intensity_range_2_equals_0 += 1
                        if TC_intensity_range_total == 0:
                            intensity_range_total_equals_0 += 1
                        
                        TC_properties_df = pd.concat([TC_properties_df, pd.DataFrame({'year':[TC_year], 'ID':[TC_ID], 'start_V':[TC_start_V], 'end_V':[TC_end_V], 'LMI':[TC_LMI], \
                                                                'duration_before_LMI':[TC_duration_before_LMI], 'duration_after_LMI':[TC_duration_after_LMI], 'intensity_range_1':[TC_intensity_range_1], \
                                                                'intensity_range_2':[TC_intensity_range_2], 'total_intensity_range':[TC_intensity_range_total], 'start_P':[TC_start_P], 'end_P':[TC_end_P], 'LMP':[TC_LMP], 'type':[TC_type], \
                                                                'start_lat':[TC_start_lat], 'LMI_lat':[TC_LMI_lat], 'end_lat':[TC_end_lat]})], ignore_index=True)

    TC_properties_df.to_csv('data/properties/properties_%s_202510.csv'%(basin), index=False)
    
    return TC_properties_df


TC_properties('EP')
TC_properties('WP')

  IB_data = pd.read_csv('processed_data/IBTrACS_%s_processed_dt3_202510.csv'%basin)
  TC_properties_df = pd.concat([TC_properties_df, pd.DataFrame({'year':[TC_year], 'ID':[TC_ID], 'start_V':[TC_start_V], 'end_V':[TC_end_V], 'LMI':[TC_LMI], \
  TC_properties_df = pd.concat([TC_properties_df, pd.DataFrame({'year':[TC_year], 'ID':[TC_ID], 'start_V':[TC_start_V], 'end_V':[TC_end_V], 'LMI':[TC_LMI], \


Unnamed: 0,year,ID,start_V,end_V,LMI,duration_before_LMI,duration_after_LMI,intensity_range_1,intensity_range_2,total_intensity_range,start_P,end_P,LMP,type,start_lat,LMI_lat,end_lat
0,1982,WP011982,35.0,15.0,60.0,85.5,142.5,25.0,45.0,70.0,1004.0,1006.0,990.0,long,7.7,8.2,12.1
1,1983,WP011983,35.0,15.0,35.0,1.5,34.5,0.0,20.0,20.0,998.0,1000.0,998.0,short,15.6,15.6,19.0
2,1984,WP011984,35.0,25.0,40.0,7.5,43.5,5.0,15.0,20.0,998.0,1002.0,994.0,short,14.2,14.8,17.3
3,1985,WP011985,35.0,25.0,40.0,7.5,97.5,5.0,15.0,20.0,1000.0,1006.0,1000.0,medium,7.1,8.3,23.0
4,1986,WP011986,35.0,30.0,85.0,79.5,25.5,50.0,55.0,105.0,996.0,1008.0,970.0,medium,5.7,18.6,18.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083,1996,WP371996_2,35.0,30.0,45.0,4.5,157.5,10.0,15.0,25.0,996.0,1008.0,992.0,medium,19.3,19.4,9.6
1084,1994,WP381994,35.0,29.0,115.0,97.5,163.5,80.0,86.0,166.0,1002.0,1010.0,950.0,long,7.9,11.2,17.4
1085,1994,WP391994,35.0,25.0,50.0,79.5,112.5,15.0,25.0,40.0,1000.0,1004.0,985.0,medium,6.1,13.3,20.8
1086,1996,WP421996_1,35.0,34.0,80.0,121.5,82.5,45.0,46.0,91.0,1002.0,996.0,975.0,long,8.4,11.8,19.5


In [6]:
def process_DELV(basin, dt, only_6):
    IB_data = pd.read_csv('processed_data/IBTrACS_%s_processed_dt3_202510.csv'%(basin))
    IB_data['ISO_TIME'] = pd.to_datetime(IB_data['ISO_TIME'])
    if only_6:
        IB_data = IB_data[IB_data['ISO_TIME'].dt.hour.isin([0, 6, 12, 18])]

    TC_DELV_df = pd.DataFrame(columns=['YEAR', 'USA_ATCF_ID', 'DELV', 'abs_DELV'])
    TC_before_DELV_df = pd.DataFrame(columns=['YEAR', 'USA_ATCF_ID', 'DELV'])
    TC_after_DELV_df = pd.DataFrame(columns=['YEAR', 'USA_ATCF_ID', 'DELV'])

    ID_unique = sorted(IB_data['USA_ATCF_ID'].unique())

    for TC_ID in ID_unique:
        TC_ID_data = IB_data[IB_data['USA_ATCF_ID'] == TC_ID].sort_values(by='ISO_TIME').reset_index(drop=True)
        if len(TC_ID_data) > 0:
            TC_year = TC_ID_data['YEAR'].iloc[0]
            if len(TC_ID_data[TC_ID_data['COMBINE_WIND'] >= 34]) > 0:
                start_idx = TC_ID_data[TC_ID_data['COMBINE_WIND'] >= 34].index[0]
                if len(TC_ID_data[TC_ID_data['NATURE'] == 'TS']) > 0:    
                    end_idx = TC_ID_data[TC_ID_data['NATURE'] == 'TS'].index[-1]
                    if end_idx > start_idx:
                        TC_ID_data = TC_ID_data.iloc[start_idx:end_idx+1].reset_index(drop=True)
                        before_len = len(TC_ID_data)
                        TC_ID_data = TC_ID_data.dropna(subset=['COMBINE_WIND']).reset_index(drop=True)
                        after_len = len(TC_ID_data)
                        if len(TC_ID_data) > 0:
                            LMI_idx = TC_ID_data['COMBINE_WIND'].idxmax()
                            LMI_time = TC_ID_data['ISO_TIME'].iloc[LMI_idx]
                            before_stop_time = LMI_time - pd.Timedelta(hours=dt)

                            TC_ID_DT = TC_ID_data[['ISO_TIME', 'COMBINE_WIND']]
                            TC_ID_DT['ISO_TIME'] = TC_ID_DT['ISO_TIME'] - pd.Timedelta(hours=dt)
                            TC_ID_DT.rename(columns={'COMBINE_WIND':'COMBINE_WIND_DT'}, inplace=True)
                            TC_ID_DELV = pd.merge(TC_ID_data, TC_ID_DT, on='ISO_TIME', how='inner')
                            TC_ID_DELV['DELV'] = TC_ID_DELV['COMBINE_WIND_DT'] - TC_ID_DELV['COMBINE_WIND']
                            TC_ID_DELV['abs_DELV'] = np.abs(TC_ID_DELV['DELV'])
                            TC_ID_DELV.reset_index(drop=True, inplace=True)
                            TC_ID_before = TC_ID_DELV[TC_ID_DELV['ISO_TIME'] <= before_stop_time]
                            TC_ID_after = TC_ID_DELV[TC_ID_DELV['ISO_TIME'] >= LMI_time]

                            TC_DELV_df = pd.concat([TC_DELV_df, TC_ID_DELV[['YEAR', 'USA_ATCF_ID', 'DELV', 'abs_DELV']]], ignore_index=True)
                            TC_before_DELV_df = pd.concat([TC_before_DELV_df, TC_ID_before[['YEAR', 'USA_ATCF_ID', 'DELV']]], ignore_index=True)
                            TC_after_DELV_df = pd.concat([TC_after_DELV_df, TC_ID_after[['YEAR', 'USA_ATCF_ID', 'DELV']]], ignore_index=True)

    #TC_DELV_df.to_csv('save_results/DELV/DELV_%s_dt%s_202510.csv'%(basin, dt), index=False)
    TC_before_DELV_df.to_csv('data/properties/DELV_before_LMI_%s_dt%s_202510.csv'%(basin, dt), index=False)
    TC_after_DELV_df.to_csv('data/properties/DELV_after_LMI_%s_dt%s_202510.csv'%(basin, dt), index=False)

process_DELV('EP', 6, False)
process_DELV('WP', 6, False)


  IB_data = pd.read_csv('processed_data/IBTrACS_%s_processed_dt3_202510.csv'%(basin))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TC_ID_DT['ISO_TIME'] = TC_ID_DT['ISO_TIME'] - pd.Timedelta(hours=dt)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TC_ID_DT.rename(columns={'COMBINE_WIND':'COMBINE_WIND_DT'}, inplace=True)
  TC_DELV_df = pd.concat([TC_DELV_df, TC_ID_DELV[['YEAR', 'USA_ATCF_ID', 'DELV', 'abs_DELV']]], ignore_index=True)
  TC_before_DELV_df = pd.concat([TC_before_DELV_df, TC_ID_before[['YEAR', 'USA_ATCF_ID', 'DELV']]], ignore_index=True)
  TC_after_DELV_df = pd.concat([TC_after_DELV_df, TC_ID_after[['