# Imports

In [1]:
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import time
import os
import base64
import hashlib
import geopandas as gpd
import requests
import re
from dotenv import load_dotenv
from shapely.geometry import Polygon, MultiPolygon, GeometryCollection, Point


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
import numpy as np                           

# Data Collection

In [4]:
# Read the Open-Meteo historical weather CSV into a DataFrame.
weather_csv_path = 'Data/Weather/open_meteo_historical_weather_data.csv'
df_weather = pd.read_csv(weather_csv_path)

In [5]:
# Inspect the column names of the freshly loaded weather frame.
df_weather.columns

Index(['time', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)',
       'temperature_2m_mean (°C)', 'apparent_temperature_max (°C)',
       'apparent_temperature_min (°C)', 'apparent_temperature_mean (°C)',
       'shortwave_radiation_sum (MJ/m²)', 'precipitation_sum (mm)',
       'rain_sum (mm)', 'snowfall_sum (cm)', 'precipitation_hours (h)',
       'windspeed_10m_max (km/h)', 'windgusts_10m_max (km/h)',
       'winddirection_10m_dominant (°)', 'et0_fao_evapotranspiration (mm)'],
      dtype='object')

In [6]:
# Preview the first five rows of the raw data.
df_weather.head()

Unnamed: 0,time,temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),apparent_temperature_mean (°C),shortwave_radiation_sum (MJ/m²),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h),windspeed_10m_max (km/h),windgusts_10m_max (km/h),winddirection_10m_dominant (°),et0_fao_evapotranspiration (mm)
0,1420088400,-2.7,-8.2,-5.6,-14.4,-10.2,-12.4,3.82,1.1,0.0,0.91,6,33.2,67.7,228,0.63
1,1420174800,-4.6,-13.7,-9.0,-22.2,-10.6,-16.4,7.12,0.0,0.0,0.0,0,22.6,51.1,276,0.73
2,1420261200,-8.5,-16.5,-12.8,-24.0,-16.5,-20.9,4.32,8.7,0.0,6.09,10,17.4,33.5,55,0.28
3,1420347600,3.8,-7.6,-1.7,-15.9,-3.5,-9.9,0.64,19.7,9.0,7.49,19,25.9,56.9,12,0.14
4,1420434000,-5.5,-22.2,-15.6,-27.6,-13.5,-22.3,6.91,0.0,0.0,0.0,0,27.1,58.0,269,0.39


In [7]:
# Check the dtype pandas inferred for each column.
df_weather.dtypes

time                                 int64
temperature_2m_max (°C)            float64
temperature_2m_min (°C)            float64
temperature_2m_mean (°C)           float64
apparent_temperature_max (°C)      float64
apparent_temperature_min (°C)      float64
apparent_temperature_mean (°C)     float64
shortwave_radiation_sum (MJ/m²)    float64
precipitation_sum (mm)             float64
rain_sum (mm)                      float64
snowfall_sum (cm)                  float64
precipitation_hours (h)              int64
windspeed_10m_max (km/h)           float64
windgusts_10m_max (km/h)           float64
winddirection_10m_dominant (°)       int64
et0_fao_evapotranspiration (mm)    float64
dtype: object

# Data Cleaning

### Manipulate time data type

In [8]:
# `time` is loaded as int64 Unix epoch seconds; convert it to pandas datetimes.
# No timezone is attached, so the resulting values are timezone-naive.
epoch_seconds = df_weather['time']
df_weather['time'] = pd.to_datetime(epoch_seconds, unit='s')

In [9]:
# Rename the timestamp column from `time` to `DATE`.
df_weather = df_weather.rename({'time': 'DATE'}, axis='columns')

In [10]:
# Derive string keys at day, month and year granularity from DATE.
date_accessor = df_weather['DATE'].dt
df_weather = df_weather.assign(
    DAY=date_accessor.strftime('%Y-%m-%d'),
    MONTH=date_accessor.strftime('%Y-%m'),
    YEAR=date_accessor.strftime('%Y'),
)

In [11]:
# Preview the frame after the datetime conversion and the added DAY/MONTH/YEAR columns.
df_weather.head()

Unnamed: 0,DATE,temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),apparent_temperature_mean (°C),shortwave_radiation_sum (MJ/m²),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h),windspeed_10m_max (km/h),windgusts_10m_max (km/h),winddirection_10m_dominant (°),et0_fao_evapotranspiration (mm),DAY,MONTH,YEAR
0,2015-01-01 05:00:00,-2.7,-8.2,-5.6,-14.4,-10.2,-12.4,3.82,1.1,0.0,0.91,6,33.2,67.7,228,0.63,2015-01-01,2015-01,2015
1,2015-01-02 05:00:00,-4.6,-13.7,-9.0,-22.2,-10.6,-16.4,7.12,0.0,0.0,0.0,0,22.6,51.1,276,0.73,2015-01-02,2015-01,2015
2,2015-01-03 05:00:00,-8.5,-16.5,-12.8,-24.0,-16.5,-20.9,4.32,8.7,0.0,6.09,10,17.4,33.5,55,0.28,2015-01-03,2015-01,2015
3,2015-01-04 05:00:00,3.8,-7.6,-1.7,-15.9,-3.5,-9.9,0.64,19.7,9.0,7.49,19,25.9,56.9,12,0.14,2015-01-04,2015-01,2015
4,2015-01-05 05:00:00,-5.5,-22.2,-15.6,-27.6,-13.5,-22.3,6.91,0.0,0.0,0.0,0,27.1,58.0,269,0.39,2015-01-05,2015-01,2015


In [12]:
# Row and column counts before any columns or rows are removed.
df_weather.shape

(2954, 19)

### Handling missing values

In [13]:
# Count missing values per column.
df_weather.isna().sum()

DATE                               0
temperature_2m_max (°C)            0
temperature_2m_min (°C)            0
temperature_2m_mean (°C)           0
apparent_temperature_max (°C)      0
apparent_temperature_min (°C)      0
apparent_temperature_mean (°C)     0
shortwave_radiation_sum (MJ/m²)    0
precipitation_sum (mm)             0
rain_sum (mm)                      0
snowfall_sum (cm)                  0
precipitation_hours (h)            0
windspeed_10m_max (km/h)           0
windgusts_10m_max (km/h)           0
winddirection_10m_dominant (°)     0
et0_fao_evapotranspiration (mm)    0
DAY                                0
MONTH                              0
YEAR                               0
dtype: int64

### Remove unnecessary columns

In [14]:
# Drop the precipitation-hours count and the three apparent-temperature
# columns; the remaining columns make up the cleaned dataset.
columns_to_drop = [
    'precipitation_hours (h)',
    'apparent_temperature_max (°C)',
    'apparent_temperature_min (°C)',
    'apparent_temperature_mean (°C)',
]
df_weather = df_weather.drop(columns=columns_to_drop)

In [15]:
# Exclude every row whose MONTH key ('YYYY-MM' string) is February 2023.
# In the recorded run this removes a single row (2954 -> 2953 rows).
outside_feb_2023 = df_weather['MONTH'].ne('2023-02')
df_weather = df_weather.loc[outside_feb_2023]

In [16]:
# Summarise the cleaned frame: index range, column dtypes and non-null counts.
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2953 entries, 0 to 2952
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   DATE                             2953 non-null   datetime64[ns]
 1   temperature_2m_max (°C)          2953 non-null   float64       
 2   temperature_2m_min (°C)          2953 non-null   float64       
 3   temperature_2m_mean (°C)         2953 non-null   float64       
 4   shortwave_radiation_sum (MJ/m²)  2953 non-null   float64       
 5   precipitation_sum (mm)           2953 non-null   float64       
 6   rain_sum (mm)                    2953 non-null   float64       
 7   snowfall_sum (cm)                2953 non-null   float64       
 8   windspeed_10m_max (km/h)         2953 non-null   float64       
 9   windgusts_10m_max (km/h)         2953 non-null   float64       
 10  winddirection_10m_dominant (°)   2953 non-null   int64      

# Output file

In [17]:
# Write the cleaned weather frame to CSV.
# NOTE(review): `index=False` is not passed, so the DataFrame index is written
# as an unnamed first column — confirm downstream readers expect that.
cleaned_weather_path = 'Data/Processed_Datasets/Cleaned_datasets/df_weather.csv'
df_weather.to_csv(cleaned_weather_path)