In [2]:
import pandas as pd
from func import cal_buoyancy,sounding_cal
import glob
import tropycal.tracks as tracks
import os
from haversine import haversine, Unit
import re
import datetime
import metpy.calc as mpcalc
from metpy.units import units
from metpy import constants
from metpy.cbook import get_test_data
from metpy.plots import add_metpy_logo, SkewT
from scipy.interpolate import interp1d
from scipy.optimize import bisect
import numpy as np
import pickle
import numpy as np
import math
import pint
import seaborn as sns
from pydoc import help
from sklearn.preprocessing import QuantileTransformer
import xarray as xr
import netCDF4 as nc
import re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from numpy import load
from numpy import asarray
from numpy import save
import pytz
from scipy import stats
from sklearn.linear_model import LinearRegression
from collections import Counter
import pymannkendall as mk
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [3]:
pd.set_option('display.max_rows', None)

pd.set_option('display.max_columns', None)

In [4]:
# Gaoyun's identified APE cases 

ape_cases = xr.open_dataset("APEs_time_data.nc")

ape_cases_date = pd.to_datetime(ape_cases['time'], unit='D', origin='1970-01-01')

ape_cases_type = ape_cases['LAC_idx']

ape_cases_df = pd.DataFrame({
    
    'date': ape_cases_date,
    
    'type': ape_cases_type
})

ape_cases_df = ape_cases_df.set_index("date")

ape_cases_df['type'] = ape_cases_df['type'].replace({0: 'dry_coupling', 1: 'wet_coupling', 2: 'neither'})

ape_cases_df['dry_coupling'] = ape_cases_df['type'] == 'dry_coupling'

ape_cases_df['wet_coupling'] = ape_cases_df['type'] == 'wet_coupling'

ape_cases_df['APE'] = True

ape_cases_df = ape_cases_df.drop('type', axis=1)

In [6]:
directory_path = "/data/rong4/gloria/LAC_temp_data/alt_relative_data"

files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith(".nc")]

gfiles = []

for ds in files:
    
    g = xr.open_dataset(ds)
    
    gfiles.append(g)

## precipitation data for classifying as APEs

In [9]:
#average hourly precipitation

ape = gfiles[6]

tuples = []

pshape = ape['precipitation'].data.shape

days = pshape[0]

hours = pshape[1]

for d in range(days):

    for h in range(hours):

        bt = ape['base_time'].data[d]

        to = ape['time_offset'].data[h]

        dt = datetime.utcfromtimestamp(bt + to)

        lst = dt - timedelta(hours=6)

        p = ape['precipitation'].data[d,h]*0.01

        tup = (lst, p)
        
        tuples.append(tup)


In [11]:
gpdf = pd.DataFrame(tuples, columns=['time','p'])

gpdf['timestamp'] = pd.to_datetime(gpdf['time'])

gpdf['date'] = gpdf['timestamp'].dt.date

#converting from individual rows for every hour to a list for each day 
gres = gpdf.groupby('date')['p'].agg(list).reset_index()

gres = gres.set_index('date')

gres.index = pd.to_datetime(gres.index)

filtered_gres = gres[(gres.index.year >= 2001) & (gres.index.year <= 2019) & (gres.index.month >= 5) & (gres.index.month <= 9)]

print(filtered_gres.shape)

(2752, 1)


In [223]:
# complicated version with weird indexing

# tuples = []

# for i in range(filtered_gres.shape[0]):
    
#     if i < filtered_gres.shape[0]-1:
    
#         date = filtered_gres.index[i]

#         precip_current = filtered_gres['p'][i]
        
#         #capturing the first hour of the next day since this corresponds to precip from last hour of the previous day
#         precip_nextday_first_hr = filtered_gres['p'][i+1][0]
        
#         precip = np.append(precip_current, precip_nextday_first_hr)

#         if len(precip) == 25:

#             #corresponds to the precip during hours 6-13, since captures precip from hour before, and +1 at the end based on python indexing rules  
#             precip_morning = sum(precip[7:15])

#             #corresponds to hours 14-20
#             precip_afternoon = sum(precip[15:22]) 

#             #corresponds to hours 21-24
#             ## 22--> all precip in hour 21, 23 --> all in hr 22, 24 --> all precip in hr 23
            
#             precip_evening = sum(precip[22:25])

#             peak_hr = np.argmax(precip)

#             peak_amount = precip[peak_hr]

#             all_hours = precip

#             tup = (date, precip_morning, precip_afternoon, precip_evening, peak_hr, peak_amount, all_hours)

#             tuples.append(tup)

# aperes = pd.DataFrame(tuples,columns=['date','morning','afternoon','evening','peak_hr', 'peak_amount', 'all_hours'])

# aperes.shape

(2735, 7)

In [30]:
tuples = []

for i in range(filtered_gres.shape[0]):
    
    date = filtered_gres.index[i]

    precip = np.array(filtered_gres['p'][i])
    
    if len(precip) == 24:

        #corresponds to the precip during hours 6-13, since captures precip from hour before, and +1 at the end based on python indexing rules  
        
        precip_morning = sum(precip[6:13])

        #corresponds to hours 14-20
        
        precip_afternoon = sum(precip[14:21]) 

        #corresponds to hours 21-24
        ## 22--> all precip in hour 21, 23 --> all in hr 22, 24 --> all precip in hr 23

        precip_evening = sum(precip[21:24])
        
        psubset = precip[6:]

        #choosing to select peak hr only within defined morning, aft, evening time periods
        
        # adding 6 since to make hour corresond to actual time of day
        peak_hr = np.argmax(psubset) + 6

        peak_amount = precip[peak_hr]

        all_hours = precip

        tup = (date, precip_morning, precip_afternoon, precip_evening, peak_hr, peak_amount, all_hours)

        tuples.append(tup)

aperes = pd.DataFrame(tuples,columns=['date','morning','afternoon','evening','peak_hr', 'peak_amount', 'all_hours'])

aperes.shape

[0.00020789 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.00275586 0.00382196 0.01212687]
[0.12695096 0.60058102 0.39022923 0.1690032  0.00972815 0.00986141
 0.01073028 0.         0.         0.         0.09886993 0.39496269
 0.58011192 1.15218018 0.61208424 0.55055969 0.63549572 0.54679104
 0.4332143  0.50238274 0.02805437 0.04254797 0.02878998 0.28729744]
[0.0294403  0.02628465 0.10301705 0.05039979 0.00688699 0.15504265
 0.27805437 0.34759064 0.1108049  0.03282516 0.01340085 0.01783582
 0.07401919 0.17433903 0.11845949 0.32328358 1.2460714  1.40281982
 1.27926437 0.98213219 1.23832092 1.9853

(2736, 7)

In [14]:
conditions = (aperes['afternoon'] > aperes['evening']) & \
             (aperes['afternoon'] >= 2 * aperes['morning']) & \
             (aperes['peak_hr'].isin([14, 15, 16, 17, 18, 19, 20]))

aperes['APE'] = conditions

gpdf = aperes[['date','APE','afternoon','morning','evening','peak_hr','all_hours','peak_amount']]

gpdf = gpdf.set_index('date')

print(gpdf.shape)

np.sum(gpdf['APE'])

(2736, 7)


508

In [552]:
# with open('gpdf.pdkl', 'wb') as file:
    
#     pickle.dump(gpdf, file)

In [15]:
#hurdat removal of TC events
#replacing with gpdf

hurdat = tracks.TrackDataset(basin='north_atlantic',source='hurdat', include_btk=False)

sgp = (36.6, -97.5)

tc_list = []

tc_dis_list = []

tc_time_list = []

for id_ in hurdat.keys:
    
    if int(id_[-4:]) < 2000:
        
        continue
    
    lat = np.array(hurdat.data[id_]['lat'][:])
    
    lon = np.array(hurdat.data[id_]['lon'][:])
    
    dis_list = []
    
    for j in range(len(lat)):
        
        pt = (lat[j],lon[j])
        
        dis_list = np.append(dis_list, haversine(sgp, pt, unit='km'))
    
    if np.any(dis_list < 800):
            
        tc_time_list = np.append(tc_time_list, np.array(hurdat.data[id_]['time'])[np.where(dis_list < 500)])
            
        tc_list.append(id_)


tc_time = []

for tt in tc_time_list:
    
    tc_time.append(np.datetime64(tt))   


ok_tc_time = []

for i in tc_time:
    
    ok_tc_time.append(i - pd.to_timedelta('6 hours'))
        
        
tc_dates = []

for dt in ok_tc_time:
    
    d = dt.date()
    
    tc_dates.append(d)
    
    
tc_dates = list(set(tc_dates))

--> Starting to read in HURDAT2 data
--> Completed reading in HURDAT2 data (3.0 seconds)


In [16]:
tc_dates = list(set(tc_dates))

gpdf_minus_hu = gpdf[~gpdf.index.isin(tc_dates)]

gpdf_only_hu = gpdf[gpdf.index.isin(tc_dates)]

print(gpdf_minus_hu.shape)

print("ape count", sum(gpdf_minus_hu['APE']==True))

(2718, 7)
ape count 506


In [None]:
with open('gpdf.pdkl', 'wb') as file:
    
    pickle.dump(gpdf_minus_hu, file)

## comparison between gloria/my identified APE cases

In [18]:
my_ape_cases = gpdf[gpdf['APE']==True]

my_ape_cases = set(np.array(my_ape_cases.index))

In [19]:
g_ape_cases = set(np.array(ape_cases_df.index))

In [20]:
# how many APEs am I missing compared to Gloria's list? 

# reasons for differences: 

# 1) 2001-08-18: peak hr is 0 LST (the first 4 hours of the morning have strongest precip)
# 2) 2001-08-31: peak hr = 1
# 3) 2001-09-04: aft < morning*2, but not if i use 6:15, 14:21 indexing
# 4) 2002-05-12: peak hr = 0, but if i exclude index 0, peak is 15
# 5) 2002-06-04: aft < morning*2, but not if i use 6:15, 14:21 indexing
# 6) 2002-06-08: just not in my data?
# 7) 2002-06-28: peak hr = 2, and quite a bit in other morning hours
# 8) 2002-07-07: peak hr = 0, but if i exclude index 0, peak is 15
# 9) 2002-07-13: aft < morning*2, but not if i use 6:15, 14:21 indexing
# 10) 2002-07-18: peak hr = 0, but if i exclude index 0, peak is 16
# 11) 2002-08-20: peak hr = 24, but if i exclude index 24, peak is 17
# 12) 2002-09-13: peak hr = 24, but if i exclude index 24, peak is 17
# 13) 2002-09-14: peak hr = 1, a lot of precip throughout the day
# 14) 2003-08-01: peak hr & morning/aft issues, both fixed by difference indexing
# 15) 2003-08-10: morn/aft, fixed w adjustment

my_missing_ape_cases = g_ape_cases - my_ape_cases

in_mine_not_gaoyuns = my_ape_cases - g_ape_cases

t = gpdf[gpdf.index == pd.Timestamp(2002, 9, 14)]

# morn = np.sum(t['all_hours'][0][6:15])

# aft = np.sum(t['all_hours'][0][14:21])

peak = np.argmax(t['all_hours'][0][6:])

# morn_aft_adjust = aft - 2*morn

# morn_aft_adjust

t

Unnamed: 0_level_0,APE,afternoon,morning,evening,peak_hr,all_hours,peak_amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002-09-14,False,3.37831,1.713348,0.805128,19,"[3.848150329589844, 4.5009222412109375, 3.5420...",0.88678


In [23]:
my_missing_ape_cases

{numpy.datetime64('2002-06-08T00:00:00.000000000'),
 numpy.datetime64('2002-09-14T00:00:00.000000000'),
 numpy.datetime64('2005-05-31T00:00:00.000000000')}

In [25]:
len(my_ape_cases), len(g_ape_cases)

(508, 368)

In [27]:
len(in_mine_not_gaoyuns)

143