# General Operations on weather data and merging with MRT

import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'variable': [
            '10m_u_component_of_wind', '2m_dewpoint_temperature', '2m_temperature',
            'downward_uv_radiation_at_the_surface',
        ],
        'year': [
            '2018', '2019', '2020', '2021',
        ],
        'month': [
            '05', '06', '07',
            '08', '09',
        ],
        'day': [
            '01', '02', '03',
            '04', '05', '06',
            '07', '08', '09',
            '10', '11', '12',
            '13', '14', '15',
            '16', '17', '18',
            '19', '20', '21',
            '22', '23', '24',
            '25', '26', '27',
            '28', '29', '30',
            '31',
        ],
        'time': [
            '02:00', '11:00', '16:00',
            '23:00',
        ],
        'area': [
            71.2, -10, 37,
            30,
        ],
        'format': 'netcdf',
    },
    'download18_21.nc')


In [1]:
import xarray as xr
import numpy as np
import pandas as pd



In [2]:
ds = xr.open_dataset('C:/Users/benhu/MasterThesisRawData/download18_21.nc')
df_21 = ds.to_dataframe()

ecCodes library not found using ['eccodes', 'libeccodes.so', 'libeccodes']


In [3]:
# have latitude and longitude as columns
df_21 = df_21.reset_index(level=['longitude', 'latitude', 'time'])

In [4]:
df_21

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb
0,-10.0,71.0,2018-05-01 02:00:00,-6.397677,274.760529,276.599854,0.000000
1,-10.0,71.0,2018-05-01 11:00:00,-10.199122,273.712616,276.260498,118736.773438
2,-10.0,71.0,2018-05-01 16:00:00,-11.301038,273.334991,276.105469,110801.500000
3,-10.0,71.0,2018-05-01 23:00:00,-12.028891,274.409027,276.233093,55.687500
4,-10.0,71.0,2018-05-02 02:00:00,-10.598689,274.831635,276.350311,0.000000
...,...,...,...,...,...,...,...
53995531,30.0,37.0,2021-09-29 23:00:00,0.064060,276.603729,287.014404,0.000000
53995532,30.0,37.0,2021-09-30 02:00:00,0.652620,278.227905,285.402771,0.000000
53995533,30.0,37.0,2021-09-30 11:00:00,-0.249185,278.330078,293.606445,211349.796875
53995534,30.0,37.0,2021-09-30 16:00:00,-1.631648,278.479675,291.566620,7524.593750


In [5]:
df_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53995536 entries, 0 to 53995535
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   longitude  float64       
 1   latitude   float64       
 2   time       datetime64[ns]
 3   u10        float32       
 4   d2m        float32       
 5   t2m        float32       
 6   uvb        float32       
dtypes: datetime64[ns](1), float32(4), float64(2)
memory usage: 2.0 GB


In [6]:
#df['appTemp'] = df.apply(lambda row: -2.653+(0.994*df['t2m'])+(0.368*df['d2m']^2), axis=1)
#df.apply(lambda row: row.a + row.b, axis=1)

In [7]:
# specify the coordinates to keep according to European geography
lon = list(np.arange(-10, 31, 0.75))
lat = list(np.arange(37, 72, 0.75))

In [8]:
#lon = list(range(-10, 31))
#lat = list(range(37, 72))

In [9]:
# subset data to coordinates of Europe
df_21 = df_21[df_21['longitude'].isin(lon) & df_21['latitude'].isin(lat)]

In [10]:
df_21.shape

(6080832, 7)

In [11]:
# conver coordinates to string in order to from them to a tuple in a later step
df_21['latitude'] = df_21['latitude'].astype(str)
df_21['longitude'] = df_21['longitude'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['latitude'] = df_21['latitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['longitude'] = df_21['longitude'].astype(str)


In [12]:
df_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6080832 entries, 2448 to 53660159
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   longitude  object        
 1   latitude   object        
 2   time       datetime64[ns]
 3   u10        float32       
 4   d2m        float32       
 5   t2m        float32       
 6   uvb        float32       
dtypes: datetime64[ns](1), float32(4), object(2)
memory usage: 278.4+ MB


In [13]:
# create unique column for each location by combining latitude and longitude
df_21['lat_long'] = df_21[['latitude', 'longitude']].apply(tuple, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['lat_long'] = df_21[['latitude', 'longitude']].apply(tuple, axis=1)


In [14]:
df_21.head()

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb,lat_long
2448,-10.0,70.75,2018-05-01 02:00:00,-6.413372,274.883942,276.799286,0.0,"(70.75, -10.0)"
2449,-10.0,70.75,2018-05-01 11:00:00,-10.501904,273.822144,276.428741,120490.890625,"(70.75, -10.0)"
2450,-10.0,70.75,2018-05-01 16:00:00,-11.561312,273.520538,276.270905,121430.59375,"(70.75, -10.0)"
2451,-10.0,70.75,2018-05-01 23:00:00,-11.928835,274.700836,276.502472,6.953125,"(70.75, -10.0)"
2452,-10.0,70.75,2018-05-02 02:00:00,-10.05983,275.070312,276.542175,0.0,"(70.75, -10.0)"


In [15]:
# extract date in separate column
df_21['date'] = df_21['time'].dt.strftime('%Y-%m-%d')
# extract time in separate column
df_21['clock'] = df_21['time'].dt.strftime('%H:%M:%S')
#subdf = subdf.drop(['time'], axis=1)
#subdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['date'] = df_21['time'].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['clock'] = df_21['time'].dt.strftime('%H:%M:%S')


In [16]:
# get temperature in celsius
df_21['t2m'] = df_21['t2m']-273.15
df_21['d2m'] = df_21['d2m']-273.15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['t2m'] = df_21['t2m']-273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['d2m'] = df_21['d2m']-273.15


In [17]:
# get apparent temperature
df_21['apparent_temperature'] = -2.653+(0.994*df_21['t2m'])+(0.0153*df_21['d2m']**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_21['apparent_temperature'] = -2.653+(0.994*df_21['t2m'])+(0.0153*df_21['d2m']**2)


In [18]:
df_21

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb,lat_long,date,clock,apparent_temperature
2448,-10.0,70.75,2018-05-01 02:00:00,-6.413372,1.733942,3.649286,0.000000,"(70.75, -10.0)",2018-05-01,02:00:00,1.020390
2449,-10.0,70.75,2018-05-01 11:00:00,-10.501904,0.672144,3.278741,120490.890625,"(70.75, -10.0)",2018-05-01,11:00:00,0.612981
2450,-10.0,70.75,2018-05-01 16:00:00,-11.561312,0.370538,3.120905,121430.593750,"(70.75, -10.0)",2018-05-01,16:00:00,0.451280
2451,-10.0,70.75,2018-05-01 23:00:00,-11.928835,1.550836,3.352472,6.953125,"(70.75, -10.0)",2018-05-01,23:00:00,0.716155
2452,-10.0,70.75,2018-05-02 02:00:00,-10.059830,1.920313,3.392175,0.000000,"(70.75, -10.0)",2018-05-02,02:00:00,0.775243
...,...,...,...,...,...,...,...,...,...,...,...
53660155,29.75,37.0,2021-09-29 23:00:00,-0.664447,4.869470,13.192346,0.000000,"(37.0, 29.75)",2021-09-29,23:00:00,10.822982
53660156,29.75,37.0,2021-09-30 02:00:00,-0.774312,5.863428,11.729120,0.000000,"(37.0, 29.75)",2021-09-30,02:00:00,9.531756
53660157,29.75,37.0,2021-09-30 11:00:00,1.183632,6.160944,20.881799,215533.218750,"(37.0, 29.75)",2021-09-30,11:00:00,18.684254
53660158,29.75,37.0,2021-09-30 16:00:00,0.276595,5.427759,19.203058,7907.437500,"(37.0, 29.75)",2021-09-30,16:00:00,16.885586


In [19]:
# subset data per hour
sub2am = df_21[df_21['clock'] == '02:00:00']
sub11am = df_21[df_21['clock'] == '11:00:00']
sub16am = df_21[df_21['clock'] == '16:00:00']
sub23am = df_21[df_21['clock'] == '23:00:00']

In [20]:
# rename colums to indicate hours
sub2am = sub2am.rename(columns={"t2m": "temperature_2AM", 'd2m': 'dew_point_2AM', 'uvb': 'uvb_2AM', 'u10': 'wind_2AM', "apparent_temperature": 'apparent_temperature_2AM'})
sub11am = sub11am.rename(columns={"t2m": "temperature_11AM", 'd2m': 'dew_point_11AM', 'uvb': 'uvb_11AM', 'u10': 'wind_11AM', "apparent_temperature": 'apparent_temperature_11AM'})
sub16am = sub16am.rename(columns={"t2m": "temperature_4PM", 'd2m': 'dew_point_4PM', 'uvb': 'uvb_4PM', 'u10': 'wind_4PM', "apparent_temperature": 'apparent_temperature_4PM'})
sub23am = sub23am.rename(columns={"t2m": "temperature_11PM", 'd2m': 'dew_point_11PM', 'uvb': 'uvb_11PM', 'u10': 'wind_11PM', "apparent_temperature": 'apparent_temperature_11PM'})

In [21]:
# drop irrelevant variables from subset
sub2am = sub2am.drop(['clock','time'],1)
sub11am = sub11am.drop(['clock','time','lat_long'],1)
sub16am = sub16am.drop(['clock','time','lat_long'],1)
sub23am = sub23am.drop(['clock','time','lat_long'],1)

In [22]:
# merging subsets of data per hour
df1 = sub2am.merge(sub11am, how='inner', on=['date','latitude', 'longitude'])
df2 = df1.merge(sub16am, how='inner', on=['date','latitude', 'longitude'])
final00_21 = df2.merge(sub23am, how='inner', on=['date','latitude', 'longitude'])
final00_21.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,wind_4PM,dew_point_4PM,temperature_4PM,uvb_4PM,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM
0,-10.0,70.75,-6.413372,1.733942,3.649286,0.0,"(70.75, -10.0)",2018-05-01,1.02039,-10.501904,...,-11.561312,0.370538,3.120905,121430.59375,0.45128,-11.928835,1.550836,3.352472,6.953125,0.716155
1,-10.0,70.75,-10.05983,1.920313,3.392175,0.0,"(70.75, -10.0)",2018-05-02,0.775243,0.00455,...,-0.417252,-1.143286,0.224695,88666.265625,-2.409655,-5.783612,0.994196,1.892084,41.765625,-0.757146
2,-10.0,70.75,-6.911687,1.908044,2.454523,0.0,"(70.75, -10.0)",2018-05-03,-0.157503,2.519663,...,-2.903591,0.163721,2.320276,124486.367188,-0.346236,-7.260899,0.319849,1.514001,250.59375,-1.146517
3,-10.0,70.75,2.228653,0.51684,1.94787,0.0,"(70.75, -10.0)",2018-05-04,-0.71273,-4.100985,...,2.30582,1.633417,1.819299,57830.0625,-0.803795,-2.328765,1.053064,1.977167,375.875,-0.670729
4,-10.0,70.75,2.793671,0.701563,0.776727,0.0,"(70.75, -10.0)",2018-05-05,-1.873403,4.490686,...,2.892418,-0.826941,2.352411,176580.75,-0.304241,3.54572,-0.795081,1.858057,897.9375,-0.79642


In [23]:
#final0002.to_csv('data_00_02.csv')
#final0002 = pd.read_csv('data_00_02.csv', index_col=0)

# Merge Locations with weather data

In [24]:
# read locations data
locations = pd.read_csv('locations1.csv', index_col=0)
locations.head()

Unnamed: 0,lat,lon,country,NUTS1,NUTS2,NUTS3
22,54.25,-10.0,IE,IE0,IE04,IE042
23,53.5,-10.0,IE,IE0,IE04,IE042
25,52.0,-10.0,IE,IE0,IE05,IE053
68,54.25,-9.25,IE,IE0,IE04,IE042
69,53.5,-9.25,IE,IE0,IE04,IE042


In [25]:
# change coordinates to string
locations['lat'] = locations['lat'].astype(str)
locations['lon'] = locations['lon'].astype(str)

In [26]:
# be sure to also have coordinates on weather data as string
final00_21['latitude'] = final00_21['latitude'].astype(str)
final00_21['longitude'] = final00_21['longitude'].astype(str)
# merging locations with weather data
df_weather_21 = pd.merge(final00_21, locations,  how='left', left_on=['latitude','longitude'], right_on = ['lat','lon']).drop(['lat', 'lon'], axis = 1)
df_weather_21.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3
0,-10.0,70.75,-6.413372,1.733942,3.649286,0.0,"(70.75, -10.0)",2018-05-01,1.02039,-10.501904,...,0.45128,-11.928835,1.550836,3.352472,6.953125,0.716155,,,,
1,-10.0,70.75,-10.05983,1.920313,3.392175,0.0,"(70.75, -10.0)",2018-05-02,0.775243,0.00455,...,-2.409655,-5.783612,0.994196,1.892084,41.765625,-0.757146,,,,
2,-10.0,70.75,-6.911687,1.908044,2.454523,0.0,"(70.75, -10.0)",2018-05-03,-0.157503,2.519663,...,-0.346236,-7.260899,0.319849,1.514001,250.59375,-1.146517,,,,
3,-10.0,70.75,2.228653,0.51684,1.94787,0.0,"(70.75, -10.0)",2018-05-04,-0.71273,-4.100985,...,-0.803795,-2.328765,1.053064,1.977167,375.875,-0.670729,,,,
4,-10.0,70.75,2.793671,0.701563,0.776727,0.0,"(70.75, -10.0)",2018-05-05,-1.873403,4.490686,...,-0.304241,3.54572,-0.795081,1.858057,897.9375,-0.79642,,,,


If country or any NUTS NaN it is likely that the coordinates indicate locations on water.

In [27]:
df_weather_21.shape

(1520208, 28)

In [28]:
# instances with country NaN are dropped as they do not indicate land area
df_weather_21 = df_weather_21.dropna(subset=['country'])

In [29]:
# the data is reduced to less than half
df_weather_21.shape

(707472, 28)

In [30]:
# show first instances
df_weather_21.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3
13464,-10.0,54.25,1.456331,5.095056,8.450098,0.0,"(54.25, -10.0)",2018-05-01,6.143579,2.84141,...,8.931084,4.27488,4.798364,7.701501,0.0,5.354564,IE,IE0,IE04,IE042
13465,-10.0,54.25,5.869879,2.406671,7.270471,0.0,"(54.25, -10.0)",2018-05-02,4.662467,9.012136,...,6.83229,6.630429,5.507867,9.245996,0.0,7.00167,IE,IE0,IE04,IE042
13466,-10.0,54.25,7.052231,6.607263,9.581567,0.0,"(54.25, -10.0)",2018-05-03,7.539014,6.411354,...,10.051018,4.267687,8.497888,9.950189,0.0,8.342364,IE,IE0,IE04,IE042
13467,-10.0,54.25,4.552812,9.397852,10.369867,0.0,"(54.25, -10.0)",2018-05-04,9.005938,4.440985,...,9.648761,2.116827,9.052881,10.386896,0.0,8.925481,IE,IE0,IE04,IE042
13468,-10.0,54.25,4.188559,9.720697,10.56933,0.0,"(54.25, -10.0)",2018-05-05,9.298641,4.770579,...,11.637569,2.479772,9.515527,10.736627,0.0,9.40455,IE,IE0,IE04,IE042


# Loading of MRT Dataset

In [31]:
# loading MRT data for the given years
rad_21 = pd.read_csv('rad_with_MRT_21.csv', index_col=0)

In [32]:
# changing coordinates to string
rad_21['latitude'] = rad_21['latitude'].astype(str)
rad_21['longitude'] = rad_21['longitude'].astype(str)

In [33]:
# merge overall weather set with MRT set
weather_final_21 = df_weather_21.merge(rad_21, how='inner', on=['date','latitude', 'longitude'])

In [34]:
# show first instances
weather_final_21.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM
0,-10.0,54.25,1.456331,5.095056,8.450098,0.0,"(54.25, -10.0)",2018-05-01,6.143579,2.84141,...,0.0,5.354564,IE,IE0,IE04,IE042,4.87,14.75,14.02,1.85
1,-10.0,54.25,5.869879,2.406671,7.270471,0.0,"(54.25, -10.0)",2018-05-02,4.662467,9.012136,...,0.0,7.00167,IE,IE0,IE04,IE042,-0.82,14.76,13.94,4.52
2,-10.0,54.25,7.052231,6.607263,9.581567,0.0,"(54.25, -10.0)",2018-05-03,7.539014,6.411354,...,0.0,8.342364,IE,IE0,IE04,IE042,5.59,16.29,25.54,4.61
3,-10.0,54.25,4.552812,9.397852,10.369867,0.0,"(54.25, -10.0)",2018-05-04,9.005938,4.440985,...,0.0,8.925481,IE,IE0,IE04,IE042,9.37,29.1,27.72,9.34
4,-10.0,54.25,4.188559,9.720697,10.56933,0.0,"(54.25, -10.0)",2018-05-05,9.298641,4.770579,...,0.0,9.40455,IE,IE0,IE04,IE042,9.28,24.73,22.79,10.05


In [35]:
# change date to datetime and get week number
weather_final_21['date'] =  pd.to_datetime(weather_final_21['date'])
weather_final_21['Week_Number'] = weather_final_21['date'].dt.week

  weather_final_21['Week_Number'] = weather_final_21['date'].dt.week


In [36]:
weather_final_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 707472 entries, 0 to 707471
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   longitude                  707472 non-null  object        
 1   latitude                   707472 non-null  object        
 2   wind_2AM                   707472 non-null  float32       
 3   dew_point_2AM              707472 non-null  float64       
 4   temperature_2AM            707472 non-null  float64       
 5   uvb_2AM                    707472 non-null  float32       
 6   lat_long                   707472 non-null  object        
 7   date                       707472 non-null  datetime64[ns]
 8   apparent_temperature_2AM   707472 non-null  float64       
 9   wind_11AM                  707472 non-null  float32       
 10  dew_point_11AM             707472 non-null  float64       
 11  temperature_11AM           707472 non-null  float64 

In [37]:
weather_final_21.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number
0,-10.0,54.25,1.456331,5.095056,8.450098,0.0,"(54.25, -10.0)",2018-05-01,6.143579,2.84141,...,5.354564,IE,IE0,IE04,IE042,4.87,14.75,14.02,1.85,18
1,-10.0,54.25,5.869879,2.406671,7.270471,0.0,"(54.25, -10.0)",2018-05-02,4.662467,9.012136,...,7.00167,IE,IE0,IE04,IE042,-0.82,14.76,13.94,4.52,18
2,-10.0,54.25,7.052231,6.607263,9.581567,0.0,"(54.25, -10.0)",2018-05-03,7.539014,6.411354,...,8.342364,IE,IE0,IE04,IE042,5.59,16.29,25.54,4.61,18
3,-10.0,54.25,4.552812,9.397852,10.369867,0.0,"(54.25, -10.0)",2018-05-04,9.005938,4.440985,...,8.925481,IE,IE0,IE04,IE042,9.37,29.1,27.72,9.34,18
4,-10.0,54.25,4.188559,9.720697,10.56933,0.0,"(54.25, -10.0)",2018-05-05,9.298641,4.770579,...,9.40455,IE,IE0,IE04,IE042,9.28,24.73,22.79,10.05,18


In [38]:
# round data to 2 decimals
weather_final_21 = weather_final_21.round(decimals=2)
weather_final_21

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number
0,-10.0,54.25,1.46,5.10,8.45,0.0,"(54.25, -10.0)",2018-05-01,6.14,2.84,...,5.35,IE,IE0,IE04,IE042,4.87,14.75,14.02,1.85,18
1,-10.0,54.25,5.87,2.41,7.27,0.0,"(54.25, -10.0)",2018-05-02,4.66,9.01,...,7.00,IE,IE0,IE04,IE042,-0.82,14.76,13.94,4.52,18
2,-10.0,54.25,7.05,6.61,9.58,0.0,"(54.25, -10.0)",2018-05-03,7.54,6.41,...,8.34,IE,IE0,IE04,IE042,5.59,16.29,25.54,4.61,18
3,-10.0,54.25,4.55,9.40,10.37,0.0,"(54.25, -10.0)",2018-05-04,9.01,4.44,...,8.93,IE,IE0,IE04,IE042,9.37,29.10,27.72,9.34,18
4,-10.0,54.25,4.19,9.72,10.57,0.0,"(54.25, -10.0)",2018-05-05,9.30,4.77,...,9.40,IE,IE0,IE04,IE042,9.28,24.73,22.79,10.05,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707467,29.75,61.75,2.28,5.96,6.96,0.0,"(61.75, 29.75)",2021-09-26,4.81,1.17,...,1.22,FI,FI1,FI1C,FI1C5,4.13,18.18,6.60,0.10,38
707468,29.75,61.75,-0.01,2.83,3.14,0.0,"(61.75, 29.75)",2021-09-27,0.60,0.18,...,1.45,FI,FI1,FI1C,FI1C5,0.34,20.55,3.04,0.14,39
707469,29.75,61.75,-1.48,3.60,3.85,0.0,"(61.75, 29.75)",2021-09-28,1.37,-1.15,...,4.43,FI,FI1,FI1C,FI1C5,3.95,17.04,6.69,5.08,39
707470,29.75,61.75,-1.20,5.68,6.56,0.0,"(61.75, 29.75)",2021-09-29,4.36,-0.11,...,4.77,FI,FI1,FI1C,FI1C5,4.56,16.11,6.52,5.48,39


In [39]:
# drop possible duplicates
weather_final_21 = weather_final_21.drop_duplicates(keep='first')
weather_final_21.shape

(707472, 33)

In [42]:
# write data to csv
weather_final_21.to_csv('FINAL_WEATHER_MERGED_21')