In [1]:
%load_ext autoreload
%autoreload 2

In [93]:
import xarray as xr
import pandas as pd
import numpy as np
import tqdm as tqdm
import shapely
import cdsapi
import os
import glob
import pickle

import matplotlib.pyplot as plt
from geographiclib.geodesic import Geodesic
import vptree

import gtc_functions

In [122]:
grib_root = '/Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/datasets/EFs/weather_data/ecmwf/'

event_names = ['FLORENCE', 'HARVEY', 'MATTHEW', 'MICHAEL']
xa_dict = {}

# assign xarrays to labelled dictionary
for name in tqdm.tqdm(event_names,total=len(event_names)):
	grib_path = gtc_functions.get_path(grib_root + '.'.join((name,'grib')))
	xa_dict[name] = xr.load_dataset(grib_path, engine="cfgrib")

100%|██████████| 4/4 [00:44<00:00, 11.08s/it]


In [4]:
# TODO: better common path solution. Lisanne's use of Owen's functions? Cambridge-hosted data server?
google_drive_personal_key = '/Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/'

# xbd observation points
df_xbd_points_path = google_drive_personal_key + 'datasets/xBD_data/xbd_points_posthurr_reformatted.pkl'
df_xbd_points = pd.read_pickle(df_xbd_points_path)
df_xbd_points.head()

Unnamed: 0,geometry,damage_class,disaster_name,capture_date,lon,lat
0,POINT (-77.92443 34.78850),2,FLORENCE,2018-09-20 16:04:41+00:00,-77.924432,34.788502
1,POINT (-77.92459 34.78817),1,FLORENCE,2018-09-20 16:04:41+00:00,-77.924586,34.788174
2,POINT (-85.61007 30.20004),0,MICHAEL,2018-10-13 16:48:15+00:00,-85.610074,30.200042
3,POINT (-85.61057 30.20001),0,MICHAEL,2018-10-13 16:48:15+00:00,-85.610569,30.200012
4,POINT (-85.61055 30.20060),1,MICHAEL,2018-10-13 16:48:15+00:00,-85.610547,30.200601


In [5]:
xa_dict.keys()

dict_keys(['FLORENCE', 'HARVEY', 'MATTHEW', 'MICHAEL'])

In [6]:
xa_dict['MATTHEW']

In [7]:
df_xbd_points_grouped = df_xbd_points.groupby('disaster_name')

weather_headers = [str(el) for el in list(list((xa_dict.values()))[0].data_vars)]

for event_name, group in df_xbd_points_grouped:
	print(event_name)
	print(xa_dict[event_name])

FLORENCE
<xarray.Dataset>
Dimensions:     (time: 6, step: 24, latitude: 401, longitude: 601)
Coordinates:
    number      int64 0
  * time        (time) datetime64[ns] 2018-09-12 2018-09-13 ... 2018-09-17
  * step        (step) timedelta64[ns] 01:00:00 02:00:00 ... 1 days 00:00:00
    surface     float64 0.0
  * latitude    (latitude) float64 40.0 39.9 39.8 39.7 39.6 ... 0.3 0.2 0.1 0.0
  * longitude   (longitude) float64 -110.0 -109.9 -109.8 ... -50.2 -50.1 -50.0
    valid_time  (time, step) datetime64[ns] 2018-09-12T01:00:00 ... 2018-09-18
Data variables:
    u10         (time, step, latitude, longitude) float32 nan nan ... nan nan
    v10         (time, step, latitude, longitude) float32 nan nan ... nan nan
    d2m         (time, step, latitude, longitude) float32 nan nan ... nan nan
    t2m         (time, step, latitude, longitude) float32 nan nan ... nan nan
    sp          (time, step, latitude, longitude) float32 nan nan ... nan nan
    tp          (time, step, latitude, longitu

In [8]:
def restrict_coords(
	av_lon: float,
	av_lat: float,
	weather_lons: list[float],
	weather_lats: list[float],
	buffer: float
	) -> tuple[list]:
	"""TODO: docstring"""
	min_lon, max_lon = av_lon-buffer, av_lon+buffer
	min_lat, max_lat = av_lat-buffer, av_lat+buffer

	rest_weather_lons = [num for num in weather_lons if num >= min_lon and num <= max_lon]
	rest_weather_lats = [num for num in weather_lats if num >= min_lat and num <= max_lat]
	
	return rest_weather_lons, rest_weather_lats


def geoddist(p1, p2):
	# lat1, lon1, lat2, lon2
    return Geodesic.WGS84.Inverse(p1[1], p1[0], p2[1], p2[0])['s12']

In [22]:
df_xbd_points_grouped = df_xbd_points.groupby('disaster_name')
# TODO: update with more values
parameters = ['u10','v10','d2m','t2m','sp','tp']
dictionary_list = []

for event_name, group in df_xbd_points_grouped:
    xa = xa_dict[event_name]
    df = xa_dict[event_name].to_dataframe()

    # remove missing values (i.e. lat-lons in ocean)
    df_nonans = df.dropna(how='any')
    # assign latitude longitude multiindex to columns
    df_flat = df_nonans.reset_index()

    # coarse df spatial limitation for faster standardisation
    av_lon, av_lat = group.lon.mean(), group.lat.mean()
    df_flat = gtc_functions.limit_df_spatial_range(df_flat, [av_lat,av_lon], distance_buffer=2)
    df_flat = gtc_functions.standardise_dfs(df_flat)

    # iterate through each row in the xbd event df
    for i,row in tqdm.tqdm(group.iterrows(), total=len(group)):
        poi = shapely.geometry.point.Point(row.lon, row.lat)
        # further restrict df for specific point
        df_flat_specific = gtc_functions.limit_df_spatial_range(df_flat, [row.lat,row.lon], min_number=1)
        closest_ind = gtc_functions.find_index_closest_point_in_col(poi, df_flat_specific, 'geometry')
        # need to find lon, lat of closest point rather than the index
        # TODO: not sure that max is most relevant for all – parameterisation
        df_maxes = df_flat_specific[df_flat_specific['geometry'] == df_flat_specific.loc[closest_ind]['geometry']]
        maxs = df_maxes[parameters].abs().max()
            
        # generate dictionary of weather values
        dict_data = {k: maxs[k] for k in weather_headers}
        dict_data['xbd_index'] = row.name
        dict_data['name'] = event_name

        dictionary_list.append(dict_data)

weather_params_df = pd.DataFrame.from_dict(dictionary_list)

100%|██████████| 2834/2834 [01:58<00:00, 23.90it/s]
100%|██████████| 7226/7226 [05:28<00:00, 22.02it/s]
100%|██████████| 5836/5836 [03:19<00:00, 29.31it/s]
100%|██████████| 7158/7158 [03:22<00:00, 35.39it/s]


In [23]:
for n in out['name'].unique()[:]:
	print(n)
	print(out[out['name'] == n].isna().sum())

FLORENCE
u10          0
v10          0
d2m          0
t2m          0
sp           0
tp           0
xbd_index    0
name         0
dtype: int64
HARVEY
u10          0
v10          0
d2m          0
t2m          0
sp           0
tp           0
xbd_index    0
name         0
dtype: int64
MATTHEW
u10          0
v10          0
d2m          0
t2m          0
sp           0
tp           0
xbd_index    0
name         0
dtype: int64
MICHAEL
u10          0
v10          0
d2m          0
t2m          0
sp           0
tp           0
xbd_index    0
name         0
dtype: int64


## API Working

In [None]:
event_api_info = gtc_functions.return_relevant_event_info()

In [121]:
fetch_era5_data(
	weather_keys = ['t0','t1', 't2','w0','w1','tp','sp'], 
	start_end_dates = [[pd.Timestamp('2018-09-01'), pd.Timestamp('2018-09-05')]],
	area = [34, -90, 33, -88],
	download_dest_dir = '/Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/datasets/EFs/weather_data/ecmwf/api'
	)

2023-03-11 21:37:33,517 INFO Welcome to the CDS
2023-03-11 21:37:33,518 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/reanalysis-era5-land
2023-03-11 21:37:33,672 INFO Request is completed
2023-03-11 21:37:33,677 INFO Downloading https://download-0002-clone.copernicus-climate.eu/cache-compute-0002/cache/data4/adaptor.mars.internal-1678565959.0944207-6363-11-14eb3dfb-e05c-42df-bb16-4f4258f527ab.grib to /Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/datasets/EFs/weather_data/ecmwf/api/01-09-2018_05-09-2018/2m_dewpoint_temperature.grib (2.9K)
2023-03-11 21:37:33,944 INFO Download rate 11K/s
2023-03-11 21:37:34,047 INFO Welcome to the CDS
2023-03-11 21:37:34,048 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/reanalysis-era5-land
2023-03-11 21:37:34,148 INFO Downloading https://download-0011-clone.copernicus-climate.eu/cache-compute-0011/cache/data1/ad

01-09-2018_05-09-2018.nc saved successfully


  int_num = np.asarray(num, dtype=np.int64)


In [120]:
def fetch_era5_data(
	weather_keys: list[str],
	start_end_dates: list[tuple[pd.Timestamp]],
	area: list[tuple[float]],
	download_dest_dir: str,
	format: str = 'grib'
):
	"""Generate API call, download files, merge xarrays, save as new pkl file.
	
	Parameters
	----------
	weather_keys : list[str]
		list of weather parameter short names to be included in the call
	start_end_dates : list[tuple[pd.Timestamp]]
		list of start and end date/times for each event
	area : list[tuple[float]]
		list of max/min lat/lon values in format [north, west, south, east]
	download_dest_dir : str
		path to download destination
	format : str = 'grib'
		format of data file to be downloaded
	"""
	# initialise client
	c = cdsapi.Client()

	weather_params = return_parameter_strings(weather_keys)
	for i, dates in enumerate(start_end_dates):
		# create new folder for downloads
		destination_path = gtc_functions.get_path(download_dest_dir)
		dir_name = '_'.join((
			dates[0].strftime("%d-%m-%Y"), dates[1].strftime("%d-%m-%Y")
			))
		sub_dir = destination_path + '/' + dir_name
		if not os.path.exists(sub_dir):
			os.mkdir(sub_dir)

		time_info_dict = generate_times_from_start_end(dates)
		for param in weather_params:
			# generate api call info TODO: put into function
			api_call_dict = generate_api_dict(param, time_info_dict, area, format)
			file_name = f'{param}.{format}'
			dest = '/'.join((sub_dir, file_name))
			# make api call 
			# TODO: is there a nice way to overwrite files of same name, provided they are
			# different? e.g. different area	
			c.retrieve(
				'reanalysis-era5-land',
				api_call_dict,
				dest
			)
			
		# load in all files in folder
		file_paths = '/'.join((sub_dir, f'*.{format}'))

		xa_dict = {}
		for file_path in tqdm.tqdm(glob.glob(file_paths)):
			# get name of file
			file_name = file_path.split('/')[-1]
			# read into xarray
			xa_dict[file_name] = xr.load_dataset(file_path, engine="cfgrib")
			
		# merge TODO: apparently conflicting values of 'step'
		out = xr.merge([array for array in xa_dict.values()], compat='override')
		# save as new file
		nc_file_name = '.'.join((dir_name, 'nc'))
		save_file_path = '/'.join((destination_path, nc_file_name))
		out.to_netcdf(path=save_file_path)
		print(f'{nc_file_name} saved successfully')

		# TODO: maybe delete folder and all files – not for now
	

In [58]:
def generate_api_dict(
	weather_params: list[str],
	time_info_dict: dict,
	area: list[float],
	format: str
) -> dict:
	"""Generate api dictionary format for single month of event"""

	api_call_dict = {
		"variable": weather_params,
		"area": area,
		"format": format
	} | time_info_dict

	return api_call_dict

In [56]:
def return_parameter_strings(
	dict_keys: list[str]
):
	"""Look up weather parameters in a dictionary so they can be entered as short strings rather than typed out in full.
	Key:value pairs ordered in expected importance
	TODO: may change keys to acronyms
	"""

	weather_dict = {
		't0': '2m_dewpoint_temperature', 't1': '2m_temperature', 't2': 'skin_temperature',
		'tp': 'total_precipitation',
		's0': 'skin_reservoir_content', 's1': 'volumetric_soil_water_layer_1', 's2': 'volumetric_soil_water_layer_2', 
		's3': 'volumetric_soil_water_layer_3', 's4': 'volumetric_soil_water_layer_4',
		'h0': 'surface_latent_heat_flux', 'h1': 'surface_sensible_heat_flux', 
		'r0': 'surface_net_solar_radiation', 'r1': 'surface_net_thermal_radiation', 'r2': 'surface_solar_radiation_downwards',
		'r3': 'surface_thermal_radiation_downwards',
		'e0': 'total_evaporation', 'e1': 'evaporation_from_bare_soil', 'e2': 'potential_evaporation',
		'run0': 'runoff', 'run0': 'sub-surface_runoff','run0': 'surface_runoff',
		'w0': '10m_u_component_of_wind', 'w1': '10m_v_component_of_wind', 
		'sp': 'surface_pressure'
	}

	weather_params = []
	for key in dict_keys:
		weather_params.append(weather_dict.get(key))
	
	return weather_params


def generate_times_from_start_end(
	start_end_dates: list[tuple[pd.Timestamp]]
) -> dict:
	"""Generate dictionary containing ecmwf time values.
	
	TODO: update so can span multiple months accurately (will involve several api calls)
	TODO: nicer hour assigment"""

	dates = pd.date_range(start_end_dates[0],start_end_dates[1])
	years, months, days, hours = set(), set(), set(), []
	# extract years from time
	for date in dates:
		years.add(str(date.year))
		months.add(gtc_functions.pad_number_with_zeros(date.month))
		days.add(gtc_functions.pad_number_with_zeros(date.day))

	# generate hour strings
	for i in range(24):
		hours.append(f'{i:02d}:00')

	years, months, days = list(years), list(months), list(days)

	time_info = {"year": years,
				"month": months[0],
				"day": days,
				"hours": hours}

	return time_info

In [54]:
test_time = pd.Timestamp('2018-08-04')
generate_times_from_start_end([test_time,pd.Timestamp('2018-09-01')])


{'year': ['2018'],
 'month': '08',
 'day': ['01',
  '16',
  '25',
  '06',
  '30',
  '28',
  '19',
  '13',
  '31',
  '23',
  '08',
  '29',
  '10',
  '24',
  '07',
  '22',
  '05',
  '26',
  '18',
  '09',
  '14',
  '27',
  '15',
  '11',
  '12',
  '17',
  '21',
  '20',
  '04'],
 'hours': ['00:00',
  '01:00',
  '02:00',
  '03:00',
  '04:00',
  '05:00',
  '06:00',
  '07:00',
  '08:00',
  '09:00',
  '10:00',
  '11:00',
  '12:00',
  '13:00',
  '14:00',
  '15:00',
  '16:00',
  '17:00',
  '18:00',
  '19:00',
  '20:00',
  '21:00',
  '22:00',
  '23:00']}

In [None]:
# type(test_time.year)

In [None]:
df_final = pd.DataFrame.from_dict(dictionary_list)
len(df_final)

In [None]:
for n in df_final['name'].unique()[:]:
	print(n)
	print(df_final[df_final['name'] == n].isna().sum())

In [None]:
df = xa_dict['FLORENCE'].to_dataframe()
df
print(df.isna().sum())

In [None]:
df = xa_dict['FLORENCE'].to_dataframe()
df_nonans = df.dropna(how='any')
df_nonans.isna().sum()

In [None]:
# df_nonans[df_nonans['valid_time'] == '2018-09-13 03:00:00']
df_nonans


In [None]:
lons = list(df_nonans.index.get_level_values('longitude'))
lats = list(df_nonans.index.get_level_values('latitude'))
coords = np.array(list(zip(lons, lats)))

In [None]:
# my_points = np.random.rand(100, 2) * 100
# test_points = np.array([[40,110],[40,109.9]])

In [None]:



# for name in df_xbd_points.disaster_name.unique():
# 	print(name)


# can do a simple spatial limitation first using xbd datapoint average
# xbd_mean_lat = 

In [None]:
rest_weather_lons, rest_weather_lats = restrict_coords(df_xbd_points.)

In [None]:
# np.random.seed(42)
# test_points = np.random.rand(100, 2)
# test_points = np.array([[40,110],[40,-109.9],[34,90]])
test_points = coords[:1000]
query_point = np.array([-108,39.925])

my_tree = vptree.VPTree(test_points, geoddist)

[distance, closest_p] = my_tree.get_nearest_neighbor(query_point)
print(distance, closest_p)

fig,ax = plt.subplots(1)
ax.scatter(x=[el[0] for el in test_points], y=[el[1] for el in test_points], label='data')
ax.scatter(x=query_point[0], y=query_point[1], label='query')
ax.scatter(x=closest_p[0], y=closest_p[1], color='k', marker='x', s=100, label='closest')
ax.set_xlabel('latitude')
ax.set_ylabel('longitude')
ax.legend()
ax.set_aspect('auto')

In [None]:
points = np.array([[110,30],[120,31],[119,22],[123,43]])

test_tree = vptree.VPTree(points, geoddist)

In [None]:
query_point = np.array([122,27])
[distance, closest_p] = test_tree.get_nearest_neighbor(query_point)
print(distance, closest_p)

In [None]:
plt.scatter(x=[el[0] for el in points], y=[el[1] for el in points], label='data')
plt.scatter(x=query_point[0], y=query_point[1], label='query')
plt.scatter(x=closest_p[0], y=closest_p[1], color='k', marker='x', s=100, label='closest')

In [None]:
tree_new = vptree.VPTree(np.random.randn(200, 10),euclidean)
query = [.5] * 10

tree_new.get_nearest_neighbor(query = 1.2)

In [None]:
xs = np.arange(1,10000)
ys = xs * np.log(xs)

In [None]:
tree.get_nearest_neighbor(coords[0])

In [None]:
da = df_nonans.to_xarray()
da

In [None]:
xa_dict['FLORENCE']

In [None]:
a = xr.DataArray(np.arange(25).reshape(5, 5), dims=("x", "y"))


In [None]:
a.where(a.x + a.y < 4, drop=True)


In [None]:
ds = xa_dict['FLORENCE']

In [None]:
# xa_dict['FLORENCE'].where(np.isnan(xa_dict['FLORENCE'].latitude + xa_dict['FLORENCE'].longitude))

ds.where(np.isnan(ds.time), drop=True)
# ds.where(np.isnan(ds.latitude+ds.longitude), drop=True)

In [None]:
np.nan + 3

In [None]:
# write processed stations file to pickle
df_ecmwf_path = google_drive_personal_key + 'datasets/EFs/weather_data'
df_ecmwf_pkl_name = 'ecmwf_data'

gtc_functions.write_df_to_pkl(df_ecmwf_path,
                            df_ecmwf_pkl_name,
                            df_final)

In [None]:
sum(df_final['u10'].isna())

In [None]:
var = 'u10'
from matplotlib import pyplot as plt, animation

fig,ax = plt.subplots(6,1)

for vt in ds[var].valid_time[0]:
	u10_data = ds[var].sel(valid_time=vt)

# i=0
# for t in ds['time'][1:3]:
# 	for s in ds['step'][:2]:
		
# 		u10_data = ds[var].sel(time=t,step=s)
# 		u10_data.plot(ax=ax[i], x='longitude', y='latitude', cmap='coolwarm', vmin=-20, vmax=20)
# 		i+=1
# 	i+=1


In [None]:
# u10_data = xa_dict['HARVEY'].u10.sel(time='2017-08-26',step='8:00:00')
# u10_data = xa_dict['MATTHEW'].u10.sel(time='2016-10-05',step='09:00:00')
# u10_data = xa_dict['FLORENCE'].u10.sel(time='2018-09-13',step='03:00:00')

# u10_data = ds['u10'].sel(time='2018-09-14',step='6:00:00')
u10_data
u10_data.plot(x='longitude', y='latitude', cmap='coolwarm', vmin=-20, vmax=20)

In [None]:
from matplotlib import pyplot as plt, animation
%matplotlib inline

# This is needed to display graphics calculated outside of jupyter notebook
from IPython.display import HTML, display

In [None]:
xa_f = xa_dict['FLORENCE']
u10 = xa_f.u10
u10


In [None]:
u10[0][0].plot(x='longitude', y='latitude', cmap='coolwarm', vmin=-20, vmax=20, figsize=(6,4))

In [None]:
len(u10.step)

In [None]:
num_side = 2
fig,axs = plt.subplots(num_side, num_side)
axs = axs.ravel()

plt.rcParams.update({'font.size': 6})

for i in range(num_side**2):
	# iterate over day
	for d in u10.time:
		# iterate over step
		for s in u10.step:
			u10_data = u10.sel(time=d,step=s)
			u10_data.plot(ax=axs[i], x='longitude', y='latitude', cmap='coolwarm', vmin=-20, vmax=20)
			
			# u10[i][0].plot(ax=axs[i], x='longitude', y='latitude', cmap='coolwarm', vmin=-20, vmax=20)


In [None]:
for i in range(5):
    var_data[i,:,:].plot(x='longitude', y='latitude', figsize=(6,4))
    plt.show()
    plt.close()

In [None]:
# Get a handle on the figure and the axes
fig, ax = plt.subplots(figsize=(12,6))

var_data = ds['u10']


# Plot the initial frame. 
cax = var_data[0,:,:].plot(
    add_colorbar=True,
    cmap='coolwarm',
    vmin=-40, vmax=40,
    cbar_kwargs={
        'extend':'neither'
    }
)

# Next we need to create a function that updates the values for the colormesh, as well as the title.
def animate(frame):
    cax.set_array(var_data[frame,:,:].values.flatten())
    ax.set_title("Time = " + str(var_data.coords['time'].values[frame])[:13])

# Finally, we use the animation module to create the animation.
ani = animation.FuncAnimation(
    fig,             # figure
    animate,         # name of the function above
    frames=40,       # Could also be iterable or list
    interval=200     # ms between frames
)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import cartopy.crs as ccrs

var_data = ds['u10']

# Create a figure and axis using cartopy
fig = plt.figure(figsize=(8, 6))
ax = plt.axes(projection=ccrs.PlateCarree())

# Define a function to update the plot for each time step
def update_plot(i):
    # Clear the axis for the new plot
    ax.clear()

    # Plot the variable data for the current time step
    var_data.isel(time=i).plot(ax=ax, transform=ccrs.PlateCarree(), add_colorbar=False)

    # Add a title and annotation for the current time step
    ax.set_title('Variable Data at Time: {}'.format(var_data.time.values[i]), fontsize=14)
    ax.annotate('Time: {}'.format(var_data.time.values[i]), xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12)

    # Set the axis extent and gridlines
    ax.set_extent([-180, 180, -90, 90], crs=ccrs.PlateCarree())
    ax.gridlines(linestyle='--', draw_labels=True)

# Create an animation object
ani = animation.FuncAnimation(fig, update_plot, frames=len(var_data.time), repeat=True)

# Save the animation as an mp4 file
ani.save('your_animation.mp4', writer='ffmpeg')
