In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [47]:
# Read the hourly weather export, then discard every row that has a missing
# value in any column (axis=0 / how='any' are the pandas defaults, spelled out).
ithaca_data = pd.read_csv(filepath_or_buffer='datasets/hourly_ithaca_weather.csv')
ithaca_data = ithaca_data.dropna(axis=0, how='any')

In [48]:
# Preview the first five hourly rows to check the raw column names and units.
ithaca_data.head(n=5)

Unnamed: 0,datetime (UTC),"coordinates (lat,lon)",model (name),model elevation (surface),utc_offset (hrs),temperature (degC),dewpoint_temperature (degC),relative_humidity (0-1),wind_speed (m/s),total_cloud_cover (0-1),total_precipitation (mm of water equivalent)
0,2005-01-01 00:00:00,"(42.439604, -76.496802)",era5,368.22,-5.0,9.63,6.49,0.81,5.0,0.83,0.01
1,2005-01-01 01:00:00,"(42.439604, -76.496802)",era5,368.22,-5.0,9.47,6.63,0.82,4.98,0.93,0.01
2,2005-01-01 02:00:00,"(42.439604, -76.496802)",era5,368.22,-5.0,9.59,6.76,0.83,5.28,0.9,0.0
3,2005-01-01 03:00:00,"(42.439604, -76.496802)",era5,368.22,-5.0,9.57,6.17,0.79,5.53,0.76,0.0
4,2005-01-01 04:00:00,"(42.439604, -76.496802)",era5,368.22,-5.0,9.52,6.47,0.81,5.44,0.57,0.0


In [49]:
# Strip the unit suffixes from the columns used later in the notebook.
# Columns that are not listed here keep their original names.
# Note that hourly total precipitation is called 'rainfall' from here on.
ithaca_data = ithaca_data.rename(
    {
        'datetime (UTC)': 'datetime',
        'temperature (degC)': 'temperature',
        'dewpoint_temperature (degC)': 'dewpoint_temperature',
        'relative_humidity (0-1)': 'relative_humidity',
        'total_cloud_cover (0-1)': 'cloud_cover',
        'total_precipitation (mm of water equivalent)': 'rainfall',
    },
    axis='columns',
)

In [50]:
# Parse the timestamp strings into pandas datetimes (safe to re-run: values
# that are already datetimes are returned unchanged).
ithaca_data['datetime'] = pd.to_datetime(ithaca_data['datetime'])

# Index the hourly frame by calendar date so the rows can be grouped per day.
# NOTE(review): the source column is 'datetime (UTC)', so these are UTC days,
# not local days -- confirm that this is the intended day boundary.
ithaca_data['date'] = ithaca_data['datetime'].dt.date
ithaca_data = ithaca_data.set_index('date')

# Per-column daily aggregation: instantaneous quantities are averaged, the
# hourly precipitation amounts are summed. The hourly precipitation column is
# named 'rainfall' by the rename step, so that is the key that must be used
# here; asking for a non-existent 'precipitation' column raises a KeyError.
agg_functions = {'temperature': 'mean',
                 'dewpoint_temperature': 'mean',
                 'relative_humidity': 'mean',
                 'cloud_cover': 'mean',
                 'rainfall': 'sum'}

# Group by date, aggregate, expose the daily total under the name
# 'precipitation' (the name the rest of the notebook uses), and turn the
# 'date' index back into an ordinary column.
daily_ithaca_data = (
    ithaca_data
    .groupby('date')
    .agg(agg_functions)
    .rename(columns={'rainfall': 'precipitation'})
    .reset_index()
)


daily_ithaca_data


Unnamed: 0,date,temperature,dewpoint_temperature,relative_humidity,cloud_cover,precipitation
0,2005-01-01,5.793333,1.602917,0.747083,0.633333,0.69
1,2005-01-02,0.222500,-2.469583,0.821667,0.902500,1.98
2,2005-01-03,4.257500,3.196667,0.927500,0.997500,5.18
3,2005-01-04,2.041250,1.375833,0.953333,0.997917,10.04
4,2005-01-05,-0.984583,-3.369167,0.839583,1.000000,3.77
...,...,...,...,...,...,...
6569,2022-12-27,-6.364167,-9.941250,0.758750,0.853750,0.36
6570,2022-12-28,-1.111250,-5.182917,0.752083,0.885417,0.00
6571,2022-12-29,3.182917,-2.744167,0.659583,0.574167,0.00
6572,2022-12-30,8.702917,3.053333,0.682500,0.806250,0.10


In [51]:
# Summary statistics of the daily precipitation totals.
daily_ithaca_data['precipitation'].describe()

count    6574.000000
mean        3.192536
std         6.213442
min         0.000000
25%         0.020000
50%         0.710000
75%         3.390000
max        96.420000
Name: precipitation, dtype: float64

In [52]:
# Treat days with less than 1 mm of total precipitation as dry: keep values
# that are >= 1 and replace everything else with 0. Missing values fail the
# comparison and therefore also become 0, which matches the previous
# element-wise lambda. The vectorised form avoids a Python call per row and
# always keeps the column's float dtype.
daily_ithaca_data['precipitation'] = daily_ithaca_data['precipitation'].where(
    daily_ithaca_data['precipitation'] >= 1, 0
)


In [53]:
# Persist the daily table. index=True is the pandas default: the integer row
# index is written as an unnamed first column of the CSV.
# NOTE(review): when the notebook runs top to bottom this file is written
# before the 'rain_tomorrow' column is created -- confirm that is intended.
daily_ithaca_data.to_csv(path_or_buf='datasets/daily_ithaca_data.csv', index=True)

In [58]:
# Binary label: 1 if the following row (the next day) has non-zero
# precipitation, else 0. shift(-1) leaves the last row without a successor
# (NaN); a plain `> 0` comparison would silently label that row 0, so the
# label is stored as nullable Int64 and left as <NA> where the next day is
# unknown. Drop the <NA> row(s) before fitting a model on this column.
# NOTE(review): the shift assumes one row per consecutive calendar day.
daily_ithaca_data['rain_tomorrow'] = (
    daily_ithaca_data['precipitation']
    .shift(-1)
    .pipe(lambda next_day: next_day.gt(0).astype('Int64').mask(next_day.isna()))
)

In [59]:
# Inspect the first 50 days to sanity-check the new label against the
# following day's precipitation.
daily_ithaca_data.head(n=50)

Unnamed: 0,date,temperature,dewpoint_temperature,relative_humidity,cloud_cover,precipitation,rain_tomorrow
0,2005-01-01,5.793333,1.602917,0.747083,0.633333,0.0,1
1,2005-01-02,0.2225,-2.469583,0.821667,0.9025,1.98,1
2,2005-01-03,4.2575,3.196667,0.9275,0.9975,5.18,1
3,2005-01-04,2.04125,1.375833,0.953333,0.997917,10.04,1
4,2005-01-05,-0.984583,-3.369167,0.839583,1.0,3.77,1
5,2005-01-06,-2.617083,-4.1,0.897917,1.0,22.3,0
6,2005-01-07,-1.189583,-5.477083,0.730833,0.92,0.0,1
7,2005-01-08,-1.250833,-3.6775,0.842083,0.997083,15.28,0
8,2005-01-09,-2.109583,-3.741667,0.887083,0.921667,0.0,1
9,2005-01-10,1.045,-1.7625,0.817917,0.773333,1.3,1
