# Threshold computation

In [1]:
# load required libraries
import pandas as pd
import xarray as xr



# Download 1970-1978

In [2]:
# source of the definition applied
# https://link.springer.com/content/pdf/10.1007%2Fs10113-013-0499-2.pdf

import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels-preliminary-back-extension',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': [
            '2m_dewpoint_temperature', '2m_temperature',
        ],
        'year': [
            '1970', '1971', '1972', '1973',
            '1974', '1975', '1976',
            '1977', '1978',
        ],
        'month': [
            '05', '06', '07',
            '08', '09',
        ],
        'day': [
            '01', '02', '03',
            '04', '05', '06',
            '07', '08', '09',
            '10', '11', '12',
            '13', '14', '15',
            '16', '17', '18',
            '19', '20', '21',
            '22', '23', '24',
            '25', '26', '27',
            '28', '29', '30',
            '31',
        ],
        'time': '16:00',
        'area': [
            71.2, -10, 37,
            30,
        ],
    },
    'download_threshold_1970_1978.nc')

In [3]:
# convert raw data to pandas df
ds = xr.open_dataset('download_threshold_1970_1978.nc')
df = ds.to_dataframe()

ecCodes library not found using ['eccodes', 'libeccodes.so', 'libeccodes']


In [4]:
# show instances
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
longitude,latitude,time,Unnamed: 3_level_1,Unnamed: 4_level_1
-10.0,71.0,1970-05-01 16:00:00,274.148407,274.242004
-10.0,71.0,1970-05-02 16:00:00,273.001862,273.792358
-10.0,71.0,1970-05-03 16:00:00,274.192993,274.629547
-10.0,71.0,1970-05-04 16:00:00,273.576294,274.3573
-10.0,71.0,1970-05-05 16:00:00,274.887207,275.039948


In [5]:
# get coordinates and time as variables
df = df.reset_index(level=['longitude', 'latitude', 'time'])

In [6]:
# show first instances
df.head()

Unnamed: 0,longitude,latitude,time,d2m,t2m
0,-10.0,71.0,1970-05-01 16:00:00,274.148407,274.242004
1,-10.0,71.0,1970-05-02 16:00:00,273.001862,273.792358
2,-10.0,71.0,1970-05-03 16:00:00,274.192993,274.629547
3,-10.0,71.0,1970-05-04 16:00:00,273.576294,274.3573
4,-10.0,71.0,1970-05-05 16:00:00,274.887207,275.039948


In [7]:
# inspect shape
df.shape

(30372489, 5)

In [8]:
# loading locations dataframe in order to filter the observations in the reference period data to those that are present in the
# more modern data
locations = pd.read_csv('locations1.csv', index_col=0)
locations.head()

Unnamed: 0,lat,lon,country,NUTS1,NUTS2,NUTS3
22,54.25,-10.0,IE,IE0,IE04,IE042
23,53.5,-10.0,IE,IE0,IE04,IE042
25,52.0,-10.0,IE,IE0,IE05,IE053
68,54.25,-9.25,IE,IE0,IE04,IE042
69,53.5,-9.25,IE,IE0,IE04,IE042


In [9]:
# get column with coordinates for location data
locations['lat_long'] = locations[['lat', 'lon']].apply(tuple, axis=1)

Iterating over latitude and longitude to delete observations that do not appear in either the lat or lon column of the location dataset. The result is that there might still be combinations of latitude and longitude that do not appear in the locations data. However, those will be dealt with later because the main intention of this intermediate step is to considerably decrease the data file size.

In [10]:
# get instances whose latitude is in the locations dataframe
df1 = df[df['latitude'].isin(locations['lat'])]

In [11]:
df1.shape

(10198062, 5)

In [12]:
# filter the obtained instances to those whose longitude is also in the locations data
df1 = df1[df1['longitude'].isin(locations['lon'])]

In [13]:
# inspect shape
df1.shape

(3420468, 5)

In [14]:
# drop time
df = df1.drop('time', 1)

In [15]:
# Refining the filtered rows by creating a new column that combines latitude and longitude.
# The step that was considered to be too demanding in terms of data storage before.
df['lat_long'] = df[['latitude', 'longitude']].apply(tuple, axis=1)

In [16]:
# only keep instances whose coordinates are in the locations data
df = df[df['lat_long'].isin(locations['lat_long'])]

In [17]:
# the number of instances has decreased strongly after filtering
df.shape

(1591812, 5)

In [18]:
# get clesius
df['t2m'] = df['t2m']-273.15
df['d2m'] = df['d2m']-273.15

In [19]:
# get apparent temperature
df['apparent_temperature'] = -2.653+(0.994*df['t2m'])+(0.0153*df['d2m']**2)

In [20]:
# store data for 1970 to 1978 in dataframe object
df_78 = df.copy()

# Download 1979-1999

import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': [
            '2m_dewpoint_temperature', '2m_temperature',
        ],
        'year': [
            '1979', '1980', '1981',
            '1982', '1983', '1984',
            '1985', '1986', '1987',
            '1988', '1989', '1990',
            '1991', '1992', '1993',
            '1994', '1995', '1996',
            '1997', '1998', '1999',
        ],
        'month': [
            '05', '06', '07',
            '08', '09',
        ],
        'day': [
            '01', '02', '03',
            '04', '05', '06',
            '07', '08', '09',
            '10', '11', '12',
            '13', '14', '15',
            '16', '17', '18',
            '19', '20', '21',
            '22', '23', '24',
            '25', '26', '27',
            '28', '29', '30',
            '31',
        ],
        'time': '16:00',
        'area': [
            71.2, -10, 37,
            30,
        ],
    },
    'download_threshold_1979_1999.nc')

In [21]:
# load and transform data to pandas df
ds = xr.open_dataset('download_threshold_1979_1999.nc')
df_1999 = ds.to_dataframe()

In [23]:
# show first instances
df_1999.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
longitude,latitude,time,Unnamed: 3_level_1,Unnamed: 4_level_1
-10.0,71.0,1979-05-01 16:00:00,266.899475,271.257629
-10.0,71.0,1979-05-02 16:00:00,265.873871,270.178864
-10.0,71.0,1979-05-03 16:00:00,269.075348,272.265076
-10.0,71.0,1979-05-04 16:00:00,269.593445,272.149017
-10.0,71.0,1979-05-05 16:00:00,267.282318,271.494873


In [24]:
# get coordinates and time as columns
df_1999 = df_1999.reset_index(level=['longitude', 'latitude', 'time'])

In [25]:
# show first instances
df_1999.head()

Unnamed: 0,longitude,latitude,time,d2m,t2m
0,-10.0,71.0,1979-05-01 16:00:00,266.899475,271.257629
1,-10.0,71.0,1979-05-02 16:00:00,265.873871,270.178864
2,-10.0,71.0,1979-05-03 16:00:00,269.075348,272.265076
3,-10.0,71.0,1979-05-04 16:00:00,269.593445,272.149017
4,-10.0,71.0,1979-05-05 16:00:00,267.282318,271.494873


In [26]:
# inspect shape
df_1999.shape

(74243862, 5)

Iterating over latitude and longitude to delete observations that do not appear in either the lat or lon column of the location dataset. The result is that there might still be combinations of latitude and longitude that do not appear in the locations data. However, those will be dealt with later because the main intention of this intermediate step is to considerably decrease the data file size.

In [27]:
# first filtering
df_1999_1 = df_1999[df_1999['latitude'].isin(locations['lat'])]

In [28]:
# inspect shape
df_1999_1.shape

(24928596, 5)

In [29]:
# inspect instances
df_1999_1.head()

Unnamed: 0,longitude,latitude,time,d2m,t2m
3366,-10.0,70.75,1979-05-01 16:00:00,267.097534,271.335846
3367,-10.0,70.75,1979-05-02 16:00:00,266.065735,270.380005
3368,-10.0,70.75,1979-05-03 16:00:00,269.296387,272.400024
3369,-10.0,70.75,1979-05-04 16:00:00,269.535095,272.287415
3370,-10.0,70.75,1979-05-05 16:00:00,267.510406,271.577393


In [30]:
# second filtering
df_1999_1 = df_1999_1[df_1999_1['longitude'].isin(locations['lon'])]

In [31]:
# inspect shape
df_1999_1.shape

(8361144, 5)

In [32]:
# drop time column
df_1999 = df_1999_1.drop('time', 1)

In [33]:
# inspect data
df_1999

Unnamed: 0,longitude,latitude,d2m,t2m
3366,-10.00,70.75,267.097534,271.335846
3367,-10.00,70.75,266.065735,270.380005
3368,-10.00,70.75,269.296387,272.400024
3369,-10.00,70.75,269.535095,272.287415
3370,-10.00,70.75,267.510406,271.577393
...,...,...,...,...
73782715,29.75,37.00,278.057312,290.700531
73782716,29.75,37.00,280.260559,288.038422
73782717,29.75,37.00,276.140472,288.576508
73782718,29.75,37.00,277.618774,288.017792


Refining the filtered rows by creating a new column that combines latitude and longitude. A step that was considered to be too demanding in terms of data storage before.

In [34]:
# get coordinates
df_1999['lat_long'] = df_1999[['latitude', 'longitude']].apply(tuple, axis=1)

In [35]:
# keep only instances whose coordinates are in the locations df
df_1999 = df_1999[df_1999['lat_long'].isin(locations['lat_long'])]

In [36]:
# inspect shape
df_1999.shape

(3891096, 5)

In [37]:
# get clesius
df_1999['t2m'] = df_1999['t2m']-273.15
df_1999['d2m'] = df_1999['d2m']-273.15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1999['t2m'] = df_1999['t2m']-273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1999['d2m'] = df_1999['d2m']-273.15


In [38]:
# get apparent temperature
df_1999['apparent_temperature'] = -2.653+(0.994*df_1999['t2m'])+(0.0153*df_1999['d2m']**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1999['apparent_temperature'] = -2.653+(0.994*df_1999['t2m'])+(0.0153*df_1999['d2m']**2)


In [40]:
df_1999.shape

(3891096, 6)

In [41]:
df_78.shape

(1591812, 6)

In [42]:
# Concatenating the data from 1970-1978 with the data from 1979-1999
df_70_99 = pd.concat([df_78, df_1999])
df_70_99.shape

(5482908, 6)

In [45]:
# get the 99th percentile per location
quantile_70_99 = df_70_99.groupby(['latitude', 'longitude']).quantile(.99)

In [46]:
# inspect the percentiles for some locations
quantile_70_99

Unnamed: 0_level_0,Unnamed: 1_level_0,d2m,t2m,apparent_temperature
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
37.00,-6.25,20.096707,38.971600,39.232187
37.00,-5.50,19.063138,38.051736,38.677543
37.00,-4.75,19.049752,34.922610,35.727877
37.00,-4.00,18.862826,33.302654,33.167269
37.00,-3.25,16.045638,31.926669,30.210756
...,...,...,...,...
70.00,29.00,14.545596,24.339234,23.777039
70.75,24.50,13.737318,21.566346,20.787273
70.75,25.25,13.840844,21.866741,21.309028
70.75,27.50,13.529815,21.956956,21.426597


In [47]:
# get coordinates as column
quantile_70_99 = quantile_70_99.reset_index(level=['longitude', 'latitude'])

In [48]:
# sort by lowest percentile in apparent temperature
quantile_70_99.sort_values(by=['apparent_temperature'])

Unnamed: 0,latitude,longitude,d2m,t2m,apparent_temperature
1139,70.00,18.50,11.979330,15.275835,14.248363
1140,70.00,19.25,12.269667,17.629803,16.702019
903,61.75,8.00,9.678231,18.684970,16.776928
352,46.75,10.25,9.812175,19.244250,16.979467
873,61.00,5.00,14.568381,17.430023,17.383223
...,...,...,...,...,...
30,38.50,-7.00,17.649606,38.660677,38.612960
1,37.00,-5.50,19.063138,38.051736,38.677543
0,37.00,-6.25,20.096707,38.971600,39.232187
12,37.75,-5.50,18.241062,39.995458,39.892451


In [61]:
# write result to csv
quantile_70_99.to_csv('quantile_thresholds.csv')