In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gtc_functions
# having issues with circular dependencies here

import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
from geopy.distance import geodesic


# remove these once scripts have been transferred
from tqdm import tqdm
from typing import Tuple,List,Union
import os
import numpy as np
import urllib
import requests

## Loading in weather station data

Load in list of weather stations. This file can be found at the bottom of this [webpage](https://www.ncei.noaa.gov/pub/data/noaa/).

In [3]:
# TODO: better common path solution. Lisanne's use of Owen's functions? Cambridge-hosted data server?
google_drive_personal_key = '/Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/'

In [6]:
weather_stations_csv_path = google_drive_personal_key + 'datasets/EFs/weather_data/isd-history.csv'

# date formats are specified
df_stations_all = pd.read_csv(weather_stations_csv_path, parse_dates=['BEGIN','END'])
df_stations_all = gtc_functions.standardise_dfs(df_stations_all)
df_stations_all.head(10)

Unnamed: 0,usaf,wban,station_name,ctry,state,icao,lat,lon,elev(m),begin,end,geometry
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,2011-03-09,2013-07-30,POINT (0 0)
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,2012-07-13,2017-08-22,POINT (0 0)
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,2014-09-23,2015-09-26,POINT (0 0)
3,8260,99999,WXPOD8270,,,,0.0,0.0,0.0,2005-01-01,2012-07-31,POINT (0 0)
4,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,2010-05-19,2012-03-23,POINT (65.567 32.95)
5,8307,99999,WXPOD 8318,AF,,,0.0,0.0,8318.0,2010-04-21,2010-04-21,POINT (0 0)
6,8411,99999,XM20,,,,,,,2016-02-17,2016-02-17,POINT EMPTY
7,8414,99999,XM18,,,,,,,2016-02-16,2016-02-17,POINT EMPTY
8,8415,99999,XM21,,,,,,,2016-02-17,2020-03-14,POINT EMPTY
9,8418,99999,XM24,,,,,,,2016-02-17,2016-02-17,POINT EMPTY


In [7]:
# remove stations with key information missing
df_stations = df_stations_all.dropna(subset=['lat', 'lon', 'usaf', 'wban'])
print(f'{len(df_stations_all)-len(df_stations)} weather stations had key information missing so were removed.')

1187 weather stations had key information missing so were removed


In [10]:
# generating filename of hourly weather data 
df_stations = gtc_functions.concat_df_cols(df_stations,'csv_filenames',['usaf','wban'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[concatted_col_name] = df[cols_to_concat].astype(str).apply(


In [None]:
### FOR FETCHING CLOSEST WEATHER STATIONS TO XBD POINTS

def find_fetch_closest_weather_station_data(
	df_xbd_points: pd.DataFrame,
	df_noaa: pd.DataFrame,
	df_stations: pd.DataFrame,
	time_buffer: Tuple[float,str],
	download_dest_dir: str,
	min_number: int = 1,
	distance_buffer: float = None,
) -> pd.DataFrame:
    """TODO: docstring
	TODO: make this less horrifically janky
    N.B. standardise download_dest_dir with other peoples' filepaths
    """

    # pre-assign column of values for assignment
    df_xbd_points[['event_start', 'event_end']] = np.nan
    df_xbd_points[['closest_stations', 'stations_lat_lons']] = np.nan

    # group by event in df_xbd_points
    df_xbd_points_grouped = df_xbd_points.groupby('disaster_name')
    # for each group in df_xbd_points:
    for name,group in df_xbd_points_grouped:

        # calculate start and end of event
        df_event_weather = df_noaa[df_noaa['name'] == name]
        start,end = gtc_functions.calculate_first_last_dates_from_df(group, time_buffer)
        # limit stations df to those operational within +/- 1 time_buffer either side of event
        df_station_time_lim = df_stations[(df_stations['begin'] <= start) & (df_stations['end'] >= end)]

        ignore_csvs = []
        # for each xbd observation in group
        for index,obs in tqdm(group.iterrows(), total=len(group)):
            # limit stations spatially
            obs_lat_lons = [obs['lat'], obs['lon']]
            df_station_spatial_time_lim = gtc_functions.limit_df_spatial_range(
				df_station_time_lim, obs_lat_lons, min_number, distance_buffer)

            stations_list = []
            station_no = 0
            while len(stations_list) < min_number:
            
                # find closest weather station(s) to current weather station (allow closest N, or within limit)
                # could make this return ranked, then just iterate through
                try:
                    station_index = gtc_functions.find_index_closest_point_in_col(
						group['geometry'].loc[index], df_station_spatial_time_lim, 'geometry', which_closest=station_no)
                except:
                    df_station_spatial_time_lim = gtc_functions.limit_df_spatial_range(
						df_station_time_lim, obs_lat_lons, len(df_station_spatial_time_lim)+1)
                    station_index = gtc_functions.find_index_closest_point_in_col(
						group['geometry'].loc[index], df_station_spatial_time_lim, 'geometry', which_closest=station_no)

                
                # TODO: could potentially fail when crossing years, but not realistically with hurricanes
                event_year = start.year
                # get weather station csv filename
                csv_filename = df_station_spatial_time_lim['csv_filenames'].loc[station_index]
                url = generate_weather_station_url(event_year, csv_filename)

                # executes if weather station not already downloaded
                # if file in ignore, reloop to next-closest station
                if not '/'.join((str(event_year), csv_filename)) in ignore_csvs:
                    # if file doesn't exist, append to ignore and reloop
                    # if file not downloaded
                    if not check_is_file_downloaded(csv_filename, download_dest_dir):
                        try:
                            download_dest = download_dest_dir + '.'.join((csv_filename, 'csv'))
                            urllib.request.urlretrieve(url, download_dest)
                            stations_list.append(csv_filename)
                        except:
                            ignore_csvs.append('/'.join((str(event_year), csv_filename)))
                    else:
                        stations_list.append(csv_filename)
                station_no += 1

            # append list of stations
            df_xbd_points['closest_stations'].iloc[index] = stations_list
            # append start and end dates
            df_xbd_points['event_start'].iloc[index] = start
            df_xbd_points['event_end'].iloc[index] = end

        # remove station rows which don't exist
        df_stations = df_stations.loc[~df_stations['csv_filenames'].isin(ignore_csvs)]

    return df_xbd_points


def generate_weather_station_url(
	event_year: str,
	csv_filename: str
) -> str:
    url_start = 'https://www.ncei.noaa.gov/data/global-hourly/access/'
    return url_start + '/'.join((str(event_year),csv_filename)) + '.csv'


def check_does_file_exist(
	url: str
) -> bool:
    """
    Returns
    -------
    False if non-existent, true if exists
    
    TODO: docstring"""
    # try:
    #     urllib.request.urlretrieve(url,download_dest)
    #     return True
    # except:
    #     print(f'{url} does not exist')
    #     return False
    response = requests.get(url)
    if response.status_code == 200:
        return True
    else:
        return False


def check_is_file_downloaded(
	csv_filename: str,
	download_dest_dir: str
) -> bool:
    """True if already downloaded, False if not"""
    potential_file_path = '/'.join((download_dest_dir,csv_filename)) + '.csv'
    if os.path.exists(potential_file_path):
        # downloaded
        return True
    else:
		print(f'{csv_filename} already downloaded.')
        return False


def download_from_url(
	url: str,
	download_dest_dir
):
    filename = url.split('/')[-1]
    destination = '/'.join((download_dest_dir,filename))
    # download file
    try:
        urllib.request.urlretrieve(url,destination)
    except:
        # for some a few weather station identifiers don't seem to exist
		print(f'{url} could not be found.')
        return False

In [None]:
### Get closest NOAA data for each xbd datapoint

def find_NOAA_points(
	df_noaa_xbd_hurricanes: pd.DataFrame, 
	df_xbd_points: pd.DataFrame
) -> pd.DataFrame:
    """
    Append the closest weather data from NOAA 6-hourly data to xbd points

	TODO: DOCSTRING
    """

    noaa_indices = []
    xbd_indices = []
    distances = []
    # group by event in df_xbd_points
    df_xbd_points_grouped = df_xbd_points.groupby('disaster_name')
    # for each group in df_xbd_points:
    for name,group in df_xbd_points_grouped:
        df_event_weather = df_noaa_xbd_hurricanes[df_noaa_xbd_hurricanes['name']==name]
        
        for index,obs in tqdm(group.iterrows(), total=len(group)):
			# find index of noaa observation datapoint closest to xbd point
            noaa_index = gtc_functions.find_index_closest_point_in_col(
                group['geometry'].loc[index], df_event_weather, 'geometry')
            noaa_row = df_noaa_xbd_hurricanes.loc[noaa_index]
			# calculate distance between xbd point and noaa observation
            distance = geodesic((obs['lat'], obs['lon']), 
                                (noaa_row['lat'], noaa_row['lon'])).km

			# append to list as tuple (faster than appending as value)
            noaa_indices += noaa_index,
            xbd_indices += index,
            distances += distance,

    # reindex dataframes to prepare for merge
    reindexed_noaa_xbd_hurricanes = df_noaa_xbd_hurricanes.reindex(noaa_indices)
    reindexed_noaa_xbd_hurricanes = reindexed_noaa_xbd_hurricanes.reset_index().rename(columns={'index': 'noaa_index'})

    reindexed_xbd_points = df_xbd_points.reindex(xbd_indices)
    reindexed_xbd_points = reindexed_xbd_points.reset_index().rename(columns={'index': 'xbd_index'})

    # rename columns before merge to avoid duplicate column names
    reindexed_noaa_xbd_hurricanes.rename(
		columns={'geometry': 'noaa_obs_geometry', 'lon': 'noaa_obs_lon', 'lat': 'noaa_obs_lat','date': 'noaa_obs_date'},
		inplace=True)
    reindexed_xbd_points.rename(
		columns={'geometry': 'xbd_obs_geometry', 'lon': 'xbd_obs_lon', 'lat': 'xbd_obs_lat'},
		inplace=True)

    joined_df = reindexed_xbd_points.join(reindexed_noaa_xbd_hurricanes, how='inner').set_index('xbd_index')
    joined_df.sort_values(by='xbd_index', inplace=True)
    df = gtc_functions.calc_distance_between_df_cols(
		joined_df, [['noaa_obs_lat', 'noaa_obs_lon'], ['xbd_obs_lat', 'xbd_obs_lon']], 'shortest_distance_to_track')

    return df