# Reverse Geocoding to obtain country and NUTS from coordinates

In [1]:
# load relevant libraries
import xarray as xr
import pandas as pd
import numpy as np



In [2]:
# open raw data and transform to pandas df
ds = xr.open_dataset('C:/Users/benhu/MasterThesisRawData/download18_21.nc')
df = ds.to_dataframe()

ecCodes library not found using ['eccodes', 'libeccodes.so', 'libeccodes']


In [3]:
# get coordinates and time as variables
df = df.reset_index(level=['longitude', 'latitude', 'time'])
df

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb
0,-10.0,71.0,2018-05-01 02:00:00,-6.397677,274.760529,276.599854,0.000000
1,-10.0,71.0,2018-05-01 11:00:00,-10.199122,273.712616,276.260498,118736.773438
2,-10.0,71.0,2018-05-01 16:00:00,-11.301038,273.334991,276.105469,110801.500000
3,-10.0,71.0,2018-05-01 23:00:00,-12.028891,274.409027,276.233093,55.687500
4,-10.0,71.0,2018-05-02 02:00:00,-10.598689,274.831635,276.350311,0.000000
...,...,...,...,...,...,...,...
53995531,30.0,37.0,2021-09-29 23:00:00,0.064060,276.603729,287.014404,0.000000
53995532,30.0,37.0,2021-09-30 02:00:00,0.652620,278.227905,285.402771,0.000000
53995533,30.0,37.0,2021-09-30 11:00:00,-0.249185,278.330078,293.606445,211349.796875
53995534,30.0,37.0,2021-09-30 16:00:00,-1.631648,278.479675,291.566620,7524.593750


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53995536 entries, 0 to 53995535
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   longitude  float64       
 1   latitude   float64       
 2   time       datetime64[ns]
 3   u10        float32       
 4   d2m        float32       
 5   t2m        float32       
 6   uvb        float32       
dtypes: datetime64[ns](1), float32(4), float64(2)
memory usage: 2.0 GB


In [5]:
# specify coordinates range for Europe to filter 
lon = list(np.arange(-10, 31, 0.75))
lat = list(np.arange(37, 72, 0.75))

In [7]:
# subset data to coordinates of Europe
df = df[df['longitude'].isin(lon) & df['latitude'].isin(lat)]

In [8]:
# inspect shape --> 1/9th of the original data
df.shape

(6080832, 7)

In [9]:
# transform coordinates to string
df['lat'] = df['latitude'].astype(str)
df['lon'] = df['longitude'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lat'] = df['latitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lon'] = df['longitude'].astype(str)


In [10]:
# get column that adds coordinates together
df['lat_long'] = df[['lat', 'lon']].apply(tuple, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lat_long'] = df[['lat', 'lon']].apply(tuple, axis=1)


In [11]:
# drop duplicates to have one instance representing one unique location
df = df.drop_duplicates(subset=['lat_long'], keep='first')

In [12]:
df

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb,lat,lon,lat_long
0,-10.00,71.00,2018-05-01 02:00:00,-6.397677,274.760529,276.599854,0.0,71.0,-10.0,"(71.0, -10.0)"
7344,-10.00,70.25,2018-05-01 02:00:00,-6.526506,275.174927,277.215179,0.0,70.25,-10.0,"(70.25, -10.0)"
14688,-10.00,69.50,2018-05-01 02:00:00,-6.965965,275.057220,277.189667,0.0,69.5,-10.0,"(69.5, -10.0)"
22032,-10.00,68.75,2018-05-01 02:00:00,-7.714090,274.774414,276.953369,0.0,68.75,-10.0,"(68.75, -10.0)"
29376,-10.00,68.00,2018-05-01 02:00:00,-8.437366,274.559448,276.742554,0.0,68.0,-10.0,"(68.0, -10.0)"
...,...,...,...,...,...,...,...,...,...,...
53625888,29.75,40.25,2018-05-01 02:00:00,-0.704992,282.959808,284.051086,0.0,40.25,29.75,"(40.25, 29.75)"
53633232,29.75,39.50,2018-05-01 02:00:00,-0.580741,277.455475,284.869659,0.0,39.5,29.75,"(39.5, 29.75)"
53640576,29.75,38.75,2018-05-01 02:00:00,-1.340637,273.789459,281.447937,0.0,38.75,29.75,"(38.75, 29.75)"
53647920,29.75,38.00,2018-05-01 02:00:00,-0.574201,278.682373,288.373627,0.0,38.0,29.75,"(38.0, 29.75)"


In [13]:
lati = df['latitude']
long = df['longitude']

In [14]:
# import nuts_finder as a reverse geocoding library
import nuts_finder

In [15]:
# loop over coordinates to identify locations
nf = nuts_finder.NutsFinder()
nuts = []
for lat, lon in zip(df['latitude'],df['longitude']):
    a = nf.find(lat, lon)
    nuts.append(a)

In [16]:
# inspect results
nuts

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'NUTS_ID': 'IE',
   'LEVL_CODE': 0,
   'CNTR_CODE': 'IE',
   'NAME_LATN': 'Éire/Ireland',
   'NUTS_NAME': 'Éire/Ireland',
   'MOUNT_TYPE': 0,
   'URBN_TYPE': None,
   'COAST_TYPE': 0,
   'FID': 'IE'},
  {'NUTS_ID': 'IE0',
   'LEVL_CODE': 1,
   'CNTR_CODE': 'IE',
   'NAME_LATN': 'Ireland',
   'NUTS_NAME': 'Ireland',
   'MOUNT_TYPE': 0,
   'URBN_TYPE': None,
   'COAST_TYPE': 0,
   'FID': 'IE0'},
  {'NUTS_ID': 'IE05',
   'LEVL_CODE': 2,
   'CNTR_CODE': 'IE',
   'NAME_LATN': 'Southern',
   'NUTS_NAME': 'Southern',
   'MOUNT_TYPE': 0,
   'URBN_TYPE': None,
   'COAST_TYPE': 0,
   'FID': 'IE05'},
  {'NUTS_ID': 'IE053',
   'LEVL_CODE': 3,
   'CNTR_CODE': 'IE',
   'NAME_LATN': 'South-West',
   'NUTS_NAME': 'South-West',
   'MOUNT_TYPE': 4,
   'URBN_TYPE': 3,
   'COAST_TYPE': 1,
   'FID': 'IE053'}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 

In [17]:
# store results in pandas df
df1 = pd.DataFrame(nuts, columns = ['country', 'NUTS1', 'NUTS2', 'NUTS3'])

In [18]:
df1

Unnamed: 0,country,NUTS1,NUTS2,NUTS3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
2479,"{'NUTS_ID': 'TR', 'LEVL_CODE': 0, 'CNTR_CODE':...","{'NUTS_ID': 'TR4', 'LEVL_CODE': 1, 'CNTR_CODE'...","{'NUTS_ID': 'TR41', 'LEVL_CODE': 2, 'CNTR_CODE...","{'NUTS_ID': 'TR411', 'LEVL_CODE': 3, 'CNTR_COD..."
2480,"{'NUTS_ID': 'TR', 'LEVL_CODE': 0, 'CNTR_CODE':...","{'NUTS_ID': 'TR3', 'LEVL_CODE': 1, 'CNTR_CODE'...","{'NUTS_ID': 'TR33', 'LEVL_CODE': 2, 'CNTR_CODE...","{'NUTS_ID': 'TR333', 'LEVL_CODE': 3, 'CNTR_COD..."
2481,"{'NUTS_ID': 'TR', 'LEVL_CODE': 0, 'CNTR_CODE':...","{'NUTS_ID': 'TR3', 'LEVL_CODE': 1, 'CNTR_CODE'...","{'NUTS_ID': 'TR33', 'LEVL_CODE': 2, 'CNTR_CODE...","{'NUTS_ID': 'TR334', 'LEVL_CODE': 3, 'CNTR_COD..."
2482,"{'NUTS_ID': 'TR', 'LEVL_CODE': 0, 'CNTR_CODE':...","{'NUTS_ID': 'TR3', 'LEVL_CODE': 1, 'CNTR_CODE'...","{'NUTS_ID': 'TR33', 'LEVL_CODE': 2, 'CNTR_CODE...","{'NUTS_ID': 'TR332', 'LEVL_CODE': 3, 'CNTR_COD..."


In [20]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  1168 non-null   object
 1   NUTS1    1168 non-null   object
 2   NUTS2    1168 non-null   object
 3   NUTS3    1168 non-null   object
dtypes: object(4)
memory usage: 77.8+ KB


In [21]:
# extract substrings that include the relevant information on country and region from columns
df1['country'] = df1['country'].astype(str).str[13:15]
df1['NUTS1'] = df1['NUTS1'].astype(str).str[13:16]
df1['NUTS2'] = df1['NUTS2'].astype(str).str[13:17]
df1['NUTS3'] = df1['NUTS3'].astype(str).str[13:18]
df1
#13,14,15,16

Unnamed: 0,country,NUTS1,NUTS2,NUTS3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
2479,TR,TR4,TR41,TR411
2480,TR,TR3,TR33,TR333
2481,TR,TR3,TR33,TR334
2482,TR,TR3,TR33,TR332


In [22]:
# reset indices of reverse geocoding df and previous weather df
df = df.reset_index()
df1 = df1.reset_index()
# concatenate both dfs
locations = pd.concat([df, df1], axis=1)
locations

Unnamed: 0,index,longitude,latitude,time,u10,d2m,t2m,uvb,lat,lon,lat_long,index.1,country,NUTS1,NUTS2,NUTS3
0,0,-10.00,71.00,2018-05-01 02:00:00,-6.397677,274.760529,276.599854,0.0,71.0,-10.0,"(71.0, -10.0)",0,,,,
1,7344,-10.00,70.25,2018-05-01 02:00:00,-6.526506,275.174927,277.215179,0.0,70.25,-10.0,"(70.25, -10.0)",1,,,,
2,14688,-10.00,69.50,2018-05-01 02:00:00,-6.965965,275.057220,277.189667,0.0,69.5,-10.0,"(69.5, -10.0)",2,,,,
3,22032,-10.00,68.75,2018-05-01 02:00:00,-7.714090,274.774414,276.953369,0.0,68.75,-10.0,"(68.75, -10.0)",3,,,,
4,29376,-10.00,68.00,2018-05-01 02:00:00,-8.437366,274.559448,276.742554,0.0,68.0,-10.0,"(68.0, -10.0)",4,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,53625888,29.75,40.25,2018-05-01 02:00:00,-0.704992,282.959808,284.051086,0.0,40.25,29.75,"(40.25, 29.75)",2479,TR,TR4,TR41,TR411
2480,53633232,29.75,39.50,2018-05-01 02:00:00,-0.580741,277.455475,284.869659,0.0,39.5,29.75,"(39.5, 29.75)",2480,TR,TR3,TR33,TR333
2481,53640576,29.75,38.75,2018-05-01 02:00:00,-1.340637,273.789459,281.447937,0.0,38.75,29.75,"(38.75, 29.75)",2481,TR,TR3,TR33,TR334
2482,53647920,29.75,38.00,2018-05-01 02:00:00,-0.574201,278.682373,288.373627,0.0,38.0,29.75,"(38.0, 29.75)",2482,TR,TR3,TR33,TR332


The unique locations now include the corresponding country and NUTS regions

In [24]:
# replace empty cells with NaN
locations = locations.replace(r'', np.nan, regex=True)
locations.head()

Unnamed: 0,index,longitude,latitude,time,u10,d2m,t2m,uvb,lat,lon,lat_long,index.1,country,NUTS1,NUTS2,NUTS3
0,0,-10.0,71.0,2018-05-01 02:00:00,-6.397677,274.760529,276.599854,0.0,71.0,-10.0,"(71.0, -10.0)",0,,,,
1,7344,-10.0,70.25,2018-05-01 02:00:00,-6.526506,275.174927,277.215179,0.0,70.25,-10.0,"(70.25, -10.0)",1,,,,
2,14688,-10.0,69.5,2018-05-01 02:00:00,-6.965965,275.05722,277.189667,0.0,69.5,-10.0,"(69.5, -10.0)",2,,,,
3,22032,-10.0,68.75,2018-05-01 02:00:00,-7.71409,274.774414,276.953369,0.0,68.75,-10.0,"(68.75, -10.0)",3,,,,
4,29376,-10.0,68.0,2018-05-01 02:00:00,-8.437366,274.559448,276.742554,0.0,68.0,-10.0,"(68.0, -10.0)",4,,,,


In [25]:
# drop observations that do not have information on country (likely locations on the sea)
locations = locations[locations['country'].notna()]
locations.head()

Unnamed: 0,index,longitude,latitude,time,u10,d2m,t2m,uvb,lat,lon,lat_long,index.1,country,NUTS1,NUTS2,NUTS3
25,183600,-10.0,52.25,2018-05-01 02:00:00,1.41971,276.346252,280.795715,0.0,52.25,-10.0,"(52.25, -10.0)",25,IE,IE0,IE05,IE053
69,1175040,-9.25,53.75,2018-05-01 02:00:00,0.718015,276.432098,279.697357,0.0,53.75,-9.25,"(53.75, -9.25)",69,IE,IE0,IE04,IE042
70,1182384,-9.25,53.0,2018-05-01 02:00:00,0.432891,276.576782,280.215363,0.0,53.0,-9.25,"(53.0, -9.25)",70,IE,IE0,IE05,IE051
71,1189728,-9.25,52.25,2018-05-01 02:00:00,0.773602,275.237061,278.616028,0.0,52.25,-9.25,"(52.25, -9.25)",71,IE,IE0,IE05,IE053
89,1321920,-9.25,38.75,2018-05-01 02:00:00,1.162051,281.014404,284.2099,0.0,38.75,-9.25,"(38.75, -9.25)",89,PT,PT1,PT17,PT170


In [26]:
locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 25 to 2483
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   index      1168 non-null   int64         
 1   longitude  1168 non-null   float64       
 2   latitude   1168 non-null   float64       
 3   time       1168 non-null   datetime64[ns]
 4   u10        1168 non-null   float32       
 5   d2m        1168 non-null   float32       
 6   t2m        1168 non-null   float32       
 7   uvb        1168 non-null   float32       
 8   lat        1168 non-null   object        
 9   lon        1168 non-null   object        
 10  lat_long   1168 non-null   object        
 11  index      1168 non-null   int64         
 12  country    1168 non-null   object        
 13  NUTS1      1168 non-null   object        
 14  NUTS2      1168 non-null   object        
 15  NUTS3      1168 non-null   object        
dtypes: datetime64[ns](1), float32(4), float64

In [27]:
# show unique countries
locations['country'].unique()

array(['IE', 'PT', 'ES', 'UK', 'FR', 'NL', 'BE', 'NO', 'DE', 'CH', 'IT',
       'DK', 'AT', 'SE', 'CZ', 'SI', 'HR', 'PL', 'SK', 'HU', 'ME', 'RS',
       'AL', 'MK', 'EL', 'FI', 'LV', 'LT', 'RO', 'EE', 'BG', 'TR'],
      dtype=object)

In [28]:
# keep only relevant variables that are needed to be merged
locations = locations[['lat','lon', 'country', 'NUTS1', 'NUTS2', 'NUTS3']]
locations.head()

Unnamed: 0,lat,lon,country,NUTS1,NUTS2,NUTS3
25,52.25,-10.0,IE,IE0,IE05,IE053
69,53.75,-9.25,IE,IE0,IE04,IE042
70,53.0,-9.25,IE,IE0,IE05,IE051
71,52.25,-9.25,IE,IE0,IE05,IE053
89,38.75,-9.25,PT,PT1,PT17,PT170


In [30]:
# drop locations from countries that do not provide mortality data or are located outside of Europe
non_european = ['SJ', 'GB', 'IM', 'JE', 'DZ', 'TN', 'BA', 'RU', 'XK', 'MK', 'UA', 'BY', 'TR', 'MD']
locations = locations[~locations['country'].isin(non_european)]
locations.shape

(1156, 6)

In [31]:
# write data to csv
locations.to_csv('locations1.csv')