# General Operations on weather data and merging with MRT

import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'variable': [
            '10m_u_component_of_wind', '2m_dewpoint_temperature', '2m_temperature',
            'downward_uv_radiation_at_the_surface',
        ],
        'year': [
            '2006', '2007', '2008', '2009',
        ],
        'month': [
            '05', '06', '07',
            '08', '09',
        ],
        'day': [
            '01', '02', '03',
            '04', '05', '06',
            '07', '08', '09',
            '10', '11', '12',
            '13', '14', '15',
            '16', '17', '18',
            '19', '20', '21',
            '22', '23', '24',
            '25', '26', '27',
            '28', '29', '30',
            '31',
        ],
        'time': [
            '02:00', '11:00', '16:00',
            '23:00',
        ],
        'area': [
            71.2, -10, 37,
            30,
        ],
        'format': 'netcdf',
    },
    'download06_09.nc')

In [1]:
# load required libraries
import xarray as xr
import numpy as np
import pandas as pd



In [2]:
# load raw dataset
ds = xr.open_dataset('C:/Users/benhu/MasterThesisRawData/download06_09.nc')
df_09 = ds.to_dataframe()

ecCodes library not found using ['eccodes', 'libeccodes.so', 'libeccodes']


In [3]:
# have latitude and longitude as columns
df_09 = df_09.reset_index(level=['longitude', 'latitude', 'time'])

In [4]:
df_09

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb
0,-10.0,71.0,2006-05-01 02:00:00,-3.929414,274.499176,275.617523,0.000000
1,-10.0,71.0,2006-05-01 11:00:00,-4.232500,274.354675,275.889038,163730.859375
2,-10.0,71.0,2006-05-01 16:00:00,-4.457170,274.600342,275.759521,166199.687500
3,-10.0,71.0,2006-05-01 23:00:00,-6.334185,274.738892,275.845276,55.328125
4,-10.0,71.0,2006-05-02 02:00:00,-7.695270,274.682770,275.848846,0.000000
...,...,...,...,...,...,...,...
53995531,30.0,37.0,2009-09-29 23:00:00,0.309430,278.330292,279.535950,0.000000
53995532,30.0,37.0,2009-09-30 02:00:00,0.195539,278.467987,278.469482,0.000000
53995533,30.0,37.0,2009-09-30 11:00:00,-1.024894,280.481720,292.095093,270450.500000
53995534,30.0,37.0,2009-09-30 16:00:00,-0.594849,279.884155,292.226379,10719.000000


In [5]:
df_09.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53995536 entries, 0 to 53995535
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   longitude  float64       
 1   latitude   float64       
 2   time       datetime64[ns]
 3   u10        float32       
 4   d2m        float32       
 5   t2m        float32       
 6   uvb        float32       
dtypes: datetime64[ns](1), float32(4), float64(2)
memory usage: 2.0 GB


In [6]:
#df['appTemp'] = df.apply(lambda row: -2.653+(0.994*df['t2m'])+(0.368*df['d2m']^2), axis=1)
#df.apply(lambda row: row.a + row.b, axis=1)

In [7]:
# specify the coordinates to keep according to European geography
lon = list(np.arange(-10, 31, 0.75))
lat = list(np.arange(37, 72, 0.75))

In [8]:
#lon = list(range(-10, 31))
#lat = list(range(37, 72))

In [9]:
# subset data to coordinates of Europe
df_09 = df_09[df_09['longitude'].isin(lon) & df_09['latitude'].isin(lat)]

In [10]:
df_09.shape

(6080832, 7)

In [11]:
# conver coordinates to string in order to from them to a tuple in a later step
df_09['latitude'] = df_09['latitude'].astype(str)
df_09['longitude'] = df_09['longitude'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['latitude'] = df_09['latitude'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['longitude'] = df_09['longitude'].astype(str)


In [12]:
df_09.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6080832 entries, 2448 to 53660159
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   longitude  object        
 1   latitude   object        
 2   time       datetime64[ns]
 3   u10        float32       
 4   d2m        float32       
 5   t2m        float32       
 6   uvb        float32       
dtypes: datetime64[ns](1), float32(4), object(2)
memory usage: 278.4+ MB


In [13]:
# create unique column for each location by combining latitude and longitude
df_09['lat_long'] = df_09[['latitude', 'longitude']].apply(tuple, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['lat_long'] = df_09[['latitude', 'longitude']].apply(tuple, axis=1)


In [14]:
df_09.head()

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb,lat_long
2448,-10.0,70.75,2006-05-01 02:00:00,-4.030857,274.55188,275.71756,0.0,"(70.75, -10.0)"
2449,-10.0,70.75,2006-05-01 11:00:00,-4.442855,274.483887,275.961395,163239.859375,"(70.75, -10.0)"
2450,-10.0,70.75,2006-05-01 16:00:00,-4.717313,274.639435,275.835449,164533.046875,"(70.75, -10.0)"
2451,-10.0,70.75,2006-05-01 23:00:00,-6.527737,274.889343,275.964966,6.921875,"(70.75, -10.0)"
2452,-10.0,70.75,2006-05-02 02:00:00,-7.387205,274.823029,275.893524,0.0,"(70.75, -10.0)"


In [15]:
# extract date in separate column
df_09['date'] = df_09['time'].dt.strftime('%Y-%m-%d')
# extract time in separate column
df_09['clock'] = df_09['time'].dt.strftime('%H:%M:%S')
#subdf = subdf.drop(['time'], axis=1)
#subdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['date'] = df_09['time'].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['clock'] = df_09['time'].dt.strftime('%H:%M:%S')


In [16]:
# get temperature in celsius
df_09['t2m'] = df_09['t2m']-273.15
df_09['d2m'] = df_09['d2m']-273.15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['t2m'] = df_09['t2m']-273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['d2m'] = df_09['d2m']-273.15


In [17]:
# get apparent temperature
df_09['apparent_temperature'] = -2.653+(0.994*df_09['t2m'])+(0.0153*df_09['d2m']**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_09['apparent_temperature'] = -2.653+(0.994*df_09['t2m'])+(0.0153*df_09['d2m']**2)


In [18]:
df_09

Unnamed: 0,longitude,latitude,time,u10,d2m,t2m,uvb,lat_long,date,clock,apparent_temperature
2448,-10.0,70.75,2006-05-01 02:00:00,-4.030857,1.401880,2.567560,0.000000,"(70.75, -10.0)",2006-05-01,02:00:00,-0.070777
2449,-10.0,70.75,2006-05-01 11:00:00,-4.442855,1.333887,2.811395,163239.859375,"(70.75, -10.0)",2006-05-01,11:00:00,0.168749
2450,-10.0,70.75,2006-05-01 16:00:00,-4.717313,1.489435,2.685449,164533.046875,"(70.75, -10.0)",2006-05-01,16:00:00,0.050278
2451,-10.0,70.75,2006-05-01 23:00:00,-6.527737,1.739343,2.814966,6.921875,"(70.75, -10.0)",2006-05-01,23:00:00,0.191363
2452,-10.0,70.75,2006-05-02 02:00:00,-7.387205,1.673029,2.743524,0.000000,"(70.75, -10.0)",2006-05-02,02:00:00,0.116888
...,...,...,...,...,...,...,...,...,...,...,...
53660155,29.75,37.0,2009-09-29 23:00:00,-0.287406,7.842584,8.403650,0.000000,"(37.0, 29.75)",2009-09-29,23:00:00,6.641272
53660156,29.75,37.0,2009-09-30 02:00:00,-0.233884,6.368646,6.369867,0.000000,"(37.0, 29.75)",2009-09-30,02:00:00,4.299210
53660157,29.75,37.0,2009-09-30 11:00:00,-0.038465,7.333429,19.568536,249468.953125,"(37.0, 29.75)",2009-09-30,11:00:00,17.620947
53660158,29.75,37.0,2009-09-30 16:00:00,1.012066,10.972437,17.234644,10705.156250,"(37.0, 29.75)",2009-09-30,16:00:00,16.320269


In [19]:
# subset data per hour
sub2am = df_09[df_09['clock'] == '02:00:00']
sub11am = df_09[df_09['clock'] == '11:00:00']
sub16am = df_09[df_09['clock'] == '16:00:00']
sub23am = df_09[df_09['clock'] == '23:00:00']

In [20]:
# rename colums to indicate hours
sub2am = sub2am.rename(columns={"t2m": "temperature_2AM", 'd2m': 'dew_point_2AM', 'uvb': 'uvb_2AM', 'u10': 'wind_2AM', "apparent_temperature": 'apparent_temperature_2AM'})
sub11am = sub11am.rename(columns={"t2m": "temperature_11AM", 'd2m': 'dew_point_11AM', 'uvb': 'uvb_11AM', 'u10': 'wind_11AM', "apparent_temperature": 'apparent_temperature_11AM'})
sub16am = sub16am.rename(columns={"t2m": "temperature_4PM", 'd2m': 'dew_point_4PM', 'uvb': 'uvb_4PM', 'u10': 'wind_4PM', "apparent_temperature": 'apparent_temperature_4PM'})
sub23am = sub23am.rename(columns={"t2m": "temperature_11PM", 'd2m': 'dew_point_11PM', 'uvb': 'uvb_11PM', 'u10': 'wind_11PM', "apparent_temperature": 'apparent_temperature_11PM'})

In [21]:
# drop irrelevant variables from subset
sub2am = sub2am.drop(['clock','time'],1)
sub11am = sub11am.drop(['clock','time','lat_long'],1)
sub16am = sub16am.drop(['clock','time','lat_long'],1)
sub23am = sub23am.drop(['clock','time','lat_long'],1)

In [22]:
# merging subsets of data per hour
df1 = sub2am.merge(sub11am, how='inner', on=['date','latitude', 'longitude'])
df2 = df1.merge(sub16am, how='inner', on=['date','latitude', 'longitude'])
final00_09 = df2.merge(sub23am, how='inner', on=['date','latitude', 'longitude'])
final00_09.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,wind_4PM,dew_point_4PM,temperature_4PM,uvb_4PM,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM
0,-10.0,70.75,-4.030857,1.40188,2.56756,0.0,"(70.75, -10.0)",2006-05-01,-0.070777,-4.442855,...,-4.717313,1.489435,2.685449,164533.046875,0.050278,-6.527737,1.739343,2.814966,6.921875,0.191363
1,-10.0,70.75,-7.387205,1.673029,2.743524,0.0,"(70.75, -10.0)",2006-05-02,0.116888,-9.940718,...,-9.162777,1.72915,3.403589,112736.125,0.775914,-7.103413,1.865991,2.605066,89.90625,-0.010291
2,-10.0,70.75,-8.092953,1.439264,2.877496,0.0,"(70.75, -10.0)",2006-05-03,0.238925,-10.627795,...,-12.80354,1.8388,4.10473,41333.8125,1.478834,-7.008192,2.102319,3.019495,193.640625,0.416
3,-10.0,70.75,-7.121461,1.857507,2.974847,0.0,"(70.75, -10.0)",2006-05-04,0.356788,0.454438,...,2.577283,1.21911,2.096826,133392.65625,-0.546015,0.834695,1.379785,2.701532,387.265625,0.061451
4,-10.0,70.75,-5.404389,1.48092,2.556848,0.0,"(70.75, -10.0)",2006-05-05,-0.077938,-6.072797,...,-5.654575,2.381952,4.322656,102044.789062,1.730528,-2.337436,2.648035,3.77514,491.0,1.206774


In [23]:
#final0002.to_csv('data_00_02.csv')
#final0002 = pd.read_csv('data_00_02.csv', index_col=0)

# Merge Locations with weather data

In [24]:
# read locations data
locations = pd.read_csv('locations1.csv', index_col=0)
locations.head()

Unnamed: 0,lat,lon,country,NUTS1,NUTS2,NUTS3
22,54.25,-10.0,IE,IE0,IE04,IE042
23,53.5,-10.0,IE,IE0,IE04,IE042
25,52.0,-10.0,IE,IE0,IE05,IE053
68,54.25,-9.25,IE,IE0,IE04,IE042
69,53.5,-9.25,IE,IE0,IE04,IE042


In [25]:
# change coordinates to string
locations['lat'] = locations['lat'].astype(str)
locations['lon'] = locations['lon'].astype(str)

In [26]:
# be sure to also have coordinates on weather data as string
final00_09['latitude'] = final00_09['latitude'].astype(str)
final00_09['longitude'] = final00_09['longitude'].astype(str)
# merging locations with weather data
df_weather_09 = pd.merge(final00_09, locations,  how='left', left_on=['latitude','longitude'], right_on = ['lat','lon']).drop(['lat', 'lon'], axis = 1)
df_weather_09.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3
0,-10.0,70.75,-4.030857,1.40188,2.56756,0.0,"(70.75, -10.0)",2006-05-01,-0.070777,-4.442855,...,0.050278,-6.527737,1.739343,2.814966,6.921875,0.191363,,,,
1,-10.0,70.75,-7.387205,1.673029,2.743524,0.0,"(70.75, -10.0)",2006-05-02,0.116888,-9.940718,...,0.775914,-7.103413,1.865991,2.605066,89.90625,-0.010291,,,,
2,-10.0,70.75,-8.092953,1.439264,2.877496,0.0,"(70.75, -10.0)",2006-05-03,0.238925,-10.627795,...,1.478834,-7.008192,2.102319,3.019495,193.640625,0.416,,,,
3,-10.0,70.75,-7.121461,1.857507,2.974847,0.0,"(70.75, -10.0)",2006-05-04,0.356788,0.454438,...,-0.546015,0.834695,1.379785,2.701532,387.265625,0.061451,,,,
4,-10.0,70.75,-5.404389,1.48092,2.556848,0.0,"(70.75, -10.0)",2006-05-05,-0.077938,-6.072797,...,1.730528,-2.337436,2.648035,3.77514,491.0,1.206774,,,,


If country or any NUTS NaN it is likely that the coordinates indicate locations on water.

In [27]:
df_weather_09.shape

(1520208, 28)

In [28]:
# instances with country NaN are dropped as they do not indicate land area
df_weather_09 = df_weather_09.dropna(subset=['country'])

In [29]:
# the data is reduced to less than half
df_weather_09.shape

(707472, 28)

In [30]:
# show first instances
df_weather_09.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3
13464,-10.0,54.25,5.965993,3.760736,7.873987,0.0,"(54.25, -10.0)",2006-05-01,5.390133,6.444582,...,6.691702,-0.575556,5.295892,7.824884,0.0,5.554046,IE,IE0,IE04,IE042
13465,-10.0,54.25,-4.352613,5.374078,7.936517,0.0,"(54.25, -10.0)",2006-05-02,5.677773,3.271828,...,7.7417,4.524624,4.400812,8.538538,0.0,6.130624,IE,IE0,IE04,IE042
13466,-10.0,54.25,0.006344,4.615015,9.056573,0.0,"(54.25, -10.0)",2006-05-03,6.675099,6.287749,...,9.189383,-4.62956,7.561243,10.99035,0.0,9.146146,IE,IE0,IE04,IE042
13467,-10.0,54.25,-3.506214,9.009698,11.226129,0.0,"(54.25, -10.0)",2006-05-04,9.747745,-2.427677,...,11.425242,5.299453,8.116296,9.717554,0.0,8.014125,IE,IE0,IE04,IE042
13468,-10.0,54.25,4.527114,3.234552,8.261652,0.0,"(54.25, -10.0)",2006-05-05,5.719155,0.371665,...,10.364541,-3.946218,7.647943,10.87243,0.0,9.049109,IE,IE0,IE04,IE042


# Loading of MRT Dataset

In [31]:
# loading MRT data for the given years
rad_09 = pd.read_csv('rad_with_MRT_09.csv', index_col=0)

In [32]:
# changing coordinates to string
rad_09['latitude'] = rad_09['latitude'].astype(str)
rad_09['longitude'] = rad_09['longitude'].astype(str)

In [33]:
# merge overall weather set with MRT set
weather_final_09 = df_weather_09.merge(rad_09, how='inner', on=['date','latitude', 'longitude'])

In [34]:
# show first instances
weather_final_09.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM
0,-10.0,54.25,5.965993,3.760736,7.873987,0.0,"(54.25, -10.0)",2006-05-01,5.390133,6.444582,...,0.0,5.554046,IE,IE0,IE04,IE042,0.0,15.24,15.55,4.0
1,-10.0,54.25,-4.352613,5.374078,7.936517,0.0,"(54.25, -10.0)",2006-05-02,5.677773,3.271828,...,0.0,6.130624,IE,IE0,IE04,IE042,5.15,17.99,16.98,0.67
2,-10.0,54.25,0.006344,4.615015,9.056573,0.0,"(54.25, -10.0)",2006-05-03,6.675099,6.287749,...,0.0,9.146146,IE,IE0,IE04,IE042,0.95,16.28,12.53,3.48
3,-10.0,54.25,-3.506214,9.009698,11.226129,0.0,"(54.25, -10.0)",2006-05-04,9.747745,-2.427677,...,0.0,8.014125,IE,IE0,IE04,IE042,8.54,21.95,25.56,8.77
4,-10.0,54.25,4.527114,3.234552,8.261652,0.0,"(54.25, -10.0)",2006-05-05,5.719155,0.371665,...,0.0,9.049109,IE,IE0,IE04,IE042,5.23,16.24,16.41,3.49


In [35]:
# change date to datetime and get week number
weather_final_09['date'] =  pd.to_datetime(weather_final_09['date'])
weather_final_09['Week_Number'] = weather_final_09['date'].dt.week

  weather_final_09['Week_Number'] = weather_final_09['date'].dt.week


In [36]:
weather_final_09.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 707472 entries, 0 to 707471
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   longitude                  707472 non-null  object        
 1   latitude                   707472 non-null  object        
 2   wind_2AM                   707472 non-null  float32       
 3   dew_point_2AM              707472 non-null  float64       
 4   temperature_2AM            707472 non-null  float64       
 5   uvb_2AM                    707472 non-null  float32       
 6   lat_long                   707472 non-null  object        
 7   date                       707472 non-null  datetime64[ns]
 8   apparent_temperature_2AM   707472 non-null  float64       
 9   wind_11AM                  707472 non-null  float32       
 10  dew_point_11AM             707472 non-null  float64       
 11  temperature_11AM           707472 non-null  float64 

In [37]:
weather_final_09.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number
0,-10.0,54.25,5.965993,3.760736,7.873987,0.0,"(54.25, -10.0)",2006-05-01,5.390133,6.444582,...,5.554046,IE,IE0,IE04,IE042,0.0,15.24,15.55,4.0,18
1,-10.0,54.25,-4.352613,5.374078,7.936517,0.0,"(54.25, -10.0)",2006-05-02,5.677773,3.271828,...,6.130624,IE,IE0,IE04,IE042,5.15,17.99,16.98,0.67,18
2,-10.0,54.25,0.006344,4.615015,9.056573,0.0,"(54.25, -10.0)",2006-05-03,6.675099,6.287749,...,9.146146,IE,IE0,IE04,IE042,0.95,16.28,12.53,3.48,18
3,-10.0,54.25,-3.506214,9.009698,11.226129,0.0,"(54.25, -10.0)",2006-05-04,9.747745,-2.427677,...,8.014125,IE,IE0,IE04,IE042,8.54,21.95,25.56,8.77,18
4,-10.0,54.25,4.527114,3.234552,8.261652,0.0,"(54.25, -10.0)",2006-05-05,5.719155,0.371665,...,9.049109,IE,IE0,IE04,IE042,5.23,16.24,16.41,3.49,18


In [38]:
# round data to 2 decimals
weather_final_09 = weather_final_09.round(decimals=2)
weather_final_09

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number
0,-10.0,54.25,5.97,3.76,7.87,0.0,"(54.25, -10.0)",2006-05-01,5.39,6.44,...,5.55,IE,IE0,IE04,IE042,0.00,15.24,15.55,4.00,18
1,-10.0,54.25,-4.35,5.37,7.94,0.0,"(54.25, -10.0)",2006-05-02,5.68,3.27,...,6.13,IE,IE0,IE04,IE042,5.15,17.99,16.98,0.67,18
2,-10.0,54.25,0.01,4.62,9.06,0.0,"(54.25, -10.0)",2006-05-03,6.68,6.29,...,9.15,IE,IE0,IE04,IE042,0.95,16.28,12.53,3.48,18
3,-10.0,54.25,-3.51,9.01,11.23,0.0,"(54.25, -10.0)",2006-05-04,9.75,-2.43,...,8.01,IE,IE0,IE04,IE042,8.54,21.95,25.56,8.77,18
4,-10.0,54.25,4.53,3.23,8.26,0.0,"(54.25, -10.0)",2006-05-05,5.72,0.37,...,9.05,IE,IE0,IE04,IE042,5.23,16.24,16.41,3.49,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707467,29.75,61.75,1.59,4.03,5.50,0.0,"(61.75, 29.75)",2009-09-26,3.07,4.24,...,10.78,FI,FI1,FI1C,FI1C5,1.66,18.26,7.17,4.22,39
707468,29.75,61.75,4.01,10.30,11.33,0.0,"(61.75, 29.75)",2009-09-27,10.23,4.90,...,10.60,FI,FI1,FI1C,FI1C5,2.60,18.15,7.43,8.47,39
707469,29.75,61.75,2.45,9.79,10.40,0.0,"(61.75, 29.75)",2009-09-28,9.15,3.13,...,3.89,FI,FI1,FI1C,FI1C5,8.58,12.30,-0.09,4.07,40
707470,29.75,61.75,3.77,1.64,4.40,0.0,"(61.75, 29.75)",2009-09-29,1.76,4.87,...,-1.05,FI,FI1,FI1C,FI1C5,-0.36,15.73,-0.77,-5.92,40


In [39]:
# drop possible duplicates
weather_final_09 = weather_final_09.drop_duplicates(keep='first')
weather_final_09.shape

(707472, 33)

In [42]:
# write data to csv
weather_final_09.to_csv('FINAL_WEATHER_MERGED_09')