In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Display all columns and rows (optional, you can limit this if needed)
pd.set_option('display.max_columns', None)  # Show all columns

# Load the weather data CSV
df = pd.read_csv("beirut_weather_dataset_6years.csv")

df

Unnamed: 0,dt,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season
0,2020-03-01T00:00,0.7,1012.5,97,100,7.0,258,Spring
1,2020-03-01T01:00,0.3,1012.3,97,100,5.5,247,Spring
2,2020-03-01T02:00,0.8,1011.4,93,100,10.0,240,Spring
3,2020-03-01T03:00,0.3,1011.2,97,100,8.4,250,Spring
4,2020-03-01T04:00,0.4,1011.1,96,87,6.1,270,Spring
...,...,...,...,...,...,...,...,...
52747,2025-03-01T19:00,1.1,1022.9,91,2,3.3,347,Spring
52748,2025-03-01T20:00,-0.1,1023.6,90,22,2.7,4,Spring
52749,2025-03-01T21:00,-0.7,1024.0,85,40,2.0,5,Spring
52750,2025-03-01T22:00,-1.0,1024.2,82,25,2.2,9,Spring


In [2]:
# Convert time to datetime object for easier handling
df['dt'] = pd.to_datetime(df['dt'])

# Set the index to the datetime column
df.set_index('dt', inplace=True) # setting the index column as date for easier queries

# Sort the data by the datetime index (in case it's not already sorted)
df.sort_index(inplace=True)

In [3]:
# Check the first few rows of the data to ensure it's correct
print(df.head())

                     temp  pressure  humidity  clouds  wind_speed  wind_deg  \
dt                                                                            
2019-03-01 00:00:00  -1.8    1011.6        89      78         6.9       261   
2019-03-01 01:00:00  -1.5    1012.1        89      57         6.9       261   
2019-03-01 02:00:00  -0.7    1012.1        95      73         9.7       270   
2019-03-01 03:00:00  -0.8    1012.1        93      90        10.5       276   
2019-03-01 04:00:00  -0.5    1012.1        96      91         9.7       274   

                     Season  
dt                           
2019-03-01 00:00:00  Spring  
2019-03-01 01:00:00  Spring  
2019-03-01 02:00:00  Spring  
2019-03-01 03:00:00  Spring  
2019-03-01 04:00:00  Spring  


In [4]:
# Check the first and last date in the dataset
print("First date:", df.index.min())
print("Last date:", df.index.max())

First date: 2019-03-01 00:00:00
Last date: 2025-03-01 23:00:00


In [6]:
# Display rows with any null values
null_rows = df[df.isnull().any(axis=1)]
print(null_rows)

Empty DataFrame
Columns: [temp, pressure, humidity, clouds, wind_speed, wind_deg, Season]
Index: []


In [7]:
df['hour_of_day'] = df.index.hour
df['day_of_week'] = df.index.dayofweek
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
df['is_morning'] = (df['hour_of_day'] >= 6) & (df['hour_of_day'] < 12)
df['is_afternoon'] = (df['hour_of_day'] >= 12) & (df['hour_of_day'] < 18)
df['is_evening'] = (df['hour_of_day'] >= 18) & (df['hour_of_day'] < 22)
df['is_night'] = (df['hour_of_day'] >= 22) | (df['hour_of_day'] < 6)

df

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-01 19:00:00,1.1,1022.9,91,2,3.3,347,Spring,19,5,1,False,False,True,False
2025-03-01 20:00:00,-0.1,1023.6,90,22,2.7,4,Spring,20,5,1,False,False,True,False
2025-03-01 21:00:00,-0.7,1024.0,85,40,2.0,5,Spring,21,5,1,False,False,True,False
2025-03-01 22:00:00,-1.0,1024.2,82,25,2.2,9,Spring,22,5,1,False,False,False,True


In [8]:
# Get the number of rows and columns in the DataFrame
print("Number of rows and columns:", df.shape)

Number of rows and columns: (52752, 14)


In [9]:
df['elapsed_time_of_day'] = df['hour_of_day'] + (df.index.minute / 60)
df['month_of_year'] = df.index.month
df['day_of_year'] = df.index.dayofyear

df

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True,0.0,3,60
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True,1.0,3,60
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True,2.0,3,60
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True,3.0,3,60
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True,4.0,3,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-01 19:00:00,1.1,1022.9,91,2,3.3,347,Spring,19,5,1,False,False,True,False,19.0,3,60
2025-03-01 20:00:00,-0.1,1023.6,90,22,2.7,4,Spring,20,5,1,False,False,True,False,20.0,3,60
2025-03-01 21:00:00,-0.7,1024.0,85,40,2.0,5,Spring,21,5,1,False,False,True,False,21.0,3,60
2025-03-01 22:00:00,-1.0,1024.2,82,25,2.2,9,Spring,22,5,1,False,False,False,True,22.0,3,60


In [24]:
# Calculate the expanding average for hourly temperature
df['temp'].groupby(df.index.date).apply(lambda x: x.expanding(1).mean())

            dt                 
2019-03-01  2019-03-01 00:00:00   -1.800000
            2019-03-01 01:00:00   -1.650000
            2019-03-01 02:00:00   -1.333333
            2019-03-01 03:00:00   -1.200000
            2019-03-01 04:00:00   -1.060000
                                     ...   
2025-03-01  2025-03-01 19:00:00    3.545000
            2025-03-01 20:00:00    3.371429
            2025-03-01 21:00:00    3.186364
            2025-03-01 22:00:00    3.004348
            2025-03-01 23:00:00    2.825000
Name: temp, Length: 52752, dtype: float64

In [33]:
# Calculate the expanding average for hourly temperature
expanded_avg = df['temp'].groupby(df.index.date).apply(lambda x: x.expanding(1).mean())

# Reset index to flatten the MultiIndex and align with df
expanded_avg = expanded_avg.reset_index(drop=True)

expanded_avg

0       -1.800000
1       -1.650000
2       -1.333333
3       -1.200000
4       -1.060000
           ...   
52747    3.545000
52748    3.371429
52749    3.186364
52750    3.004348
52751    2.825000
Name: temp, Length: 52752, dtype: float64

In [37]:
df.drop('hourly_avg', axis=1, inplace=True)

# Reset index temporarily to group by date
df_reset = df.reset_index()

# Assign the result back to the original dataframe
df_reset['hourly_avg_temp'] = expanded_avg

# Set the index back to the 'dt' column (which is the original datetime column)
df = df_reset.set_index('dt')

df.head()

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True,0.0,3,60,-1.8
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True,1.0,3,60,-1.65
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True,2.0,3,60,-1.333333
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True,3.0,3,60,-1.2
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True,4.0,3,60,-1.06


In [45]:
# Calculate the expanding maximum temperature for each day
expanded_max = df['temp'].groupby(df.index.date).apply(lambda x: x.expanding(1).max())

# Reset index to flatten the MultiIndex and align with df
expanded_max = expanded_max.reset_index(drop=True)

# Reset index temporarily to group by date
df_reset = df.reset_index()

# Assign the result back to the original dataframe
df_reset['hourly_temp_max'] = expanded_max

# Set the index back to the 'dt' column (which is the original datetime column)
df = df_reset.set_index('dt')

df.head()

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True,0.0,3,60,-1.8,-1.8
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True,1.0,3,60,-1.65,-1.5
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True,2.0,3,60,-1.333333,-0.7
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True,3.0,3,60,-1.2,-0.7
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True,4.0,3,60,-1.06,-0.5


In [46]:
df['hourly_temp_max'].head(50)

dt
2019-03-01 00:00:00   -1.8
2019-03-01 01:00:00   -1.5
2019-03-01 02:00:00   -0.7
2019-03-01 03:00:00   -0.7
2019-03-01 04:00:00   -0.5
2019-03-01 05:00:00   -0.3
2019-03-01 06:00:00   -0.1
2019-03-01 07:00:00   -0.1
2019-03-01 08:00:00   -0.1
2019-03-01 09:00:00    0.7
2019-03-01 10:00:00    1.7
2019-03-01 11:00:00    2.7
2019-03-01 12:00:00    3.4
2019-03-01 13:00:00    3.5
2019-03-01 14:00:00    3.6
2019-03-01 15:00:00    3.6
2019-03-01 16:00:00    3.6
2019-03-01 17:00:00    3.6
2019-03-01 18:00:00    3.6
2019-03-01 19:00:00    3.6
2019-03-01 20:00:00    3.6
2019-03-01 21:00:00    3.6
2019-03-01 22:00:00    3.6
2019-03-01 23:00:00    3.6
2019-03-02 00:00:00    1.0
2019-03-02 01:00:00    1.8
2019-03-02 02:00:00    2.2
2019-03-02 03:00:00    2.2
2019-03-02 04:00:00    2.2
2019-03-02 05:00:00    2.2
2019-03-02 06:00:00    2.2
2019-03-02 07:00:00    2.2
2019-03-02 08:00:00    2.4
2019-03-02 09:00:00    3.3
2019-03-02 10:00:00    3.9
2019-03-02 11:00:00    4.8
2019-03-02 12:00:00    5.

In [47]:
# Calculate the expanding minimum temperature for each day
expanded_min = df['temp'].groupby(df.index.date).apply(lambda x: x.expanding(1).min())

# Reset index to flatten the MultiIndex and align with df
expanded_min = expanded_min.reset_index(drop=True)

# Reset index temporarily to group by date
df_reset = df.reset_index()

# Assign the result back to the original dataframe
df_reset['hourly_temp_min'] = expanded_min

# Set the index back to the 'dt' column (which is the original datetime column)
df = df_reset.set_index('dt')

df.head()

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max,hourly_temp_min
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True,0.0,3,60,-1.8,-1.8,-1.8
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True,1.0,3,60,-1.65,-1.5,-1.8
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True,2.0,3,60,-1.333333,-0.7,-1.8
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True,3.0,3,60,-1.2,-0.7,-1.8
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True,4.0,3,60,-1.06,-0.5,-1.8


In [50]:
# Rolling averages for temperature (3-hour window)
# core_weather['rolling_avg_3hr'] = core_weather['temp'].rolling(window=3).mean()
df['rolling_avg_3hr'] = df['temp'].rolling(window=3).mean()

In [51]:
df

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max,hourly_temp_min,rolling_avg_3hr
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True,0.0,3,60,-1.800000,-1.8,-1.8,
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True,1.0,3,60,-1.650000,-1.5,-1.8,
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True,2.0,3,60,-1.333333,-0.7,-1.8,-1.333333
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True,3.0,3,60,-1.200000,-0.7,-1.8,-1.000000
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True,4.0,3,60,-1.060000,-0.5,-1.8,-0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-01 19:00:00,1.1,1022.9,91,2,3.3,347,Spring,19,5,1,False,False,True,False,19.0,3,60,3.545000,7.2,-0.4,3.000000
2025-03-01 20:00:00,-0.1,1023.6,90,22,2.7,4,Spring,20,5,1,False,False,True,False,20.0,3,60,3.371429,7.2,-0.4,1.100000
2025-03-01 21:00:00,-0.7,1024.0,85,40,2.0,5,Spring,21,5,1,False,False,True,False,21.0,3,60,3.186364,7.2,-0.7,0.100000
2025-03-01 22:00:00,-1.0,1024.2,82,25,2.2,9,Spring,22,5,1,False,False,False,True,22.0,3,60,3.004348,7.2,-1.0,-0.600000


In [52]:
# Add lag features (previous hour's temperature, humidity, wind speed, etc.)
df['temp_lag_1'] = df['temp'].shift(1)  # Previous hour's temperature
df['pressure_lag_1'] = df['pressure'].shift(1)  # Previous hour's pressure
df['humidity_lag_1'] = df['humidity'].shift(1)  # Previous hour's humidity
df['clouds_lag_1'] = df['clouds'].shift(1)  # Previous hour's cloud cover
df['wind_speed_lag_1'] = df['wind_speed'].shift(1)  # Previous hour's wind speed
df['wind_deg_lag_1'] = df['wind_deg'].shift(1)  # Previous hour's wind direction


# Rolling statistics (e.g., 24-hour rolling mean for temperature)
df['temp_rolling_avg'] = df['temp'].rolling(window=24).mean()  # 24-hour rolling mean for temperature
df['humidity_rolling_avg'] = df['humidity'].rolling(window=24).mean()  # 24-hour rolling mean for humidity
df['wind_speed_rolling_avg'] = df['wind_speed'].rolling(window=24).mean()  # 24-hour rolling mean for wind speed

df

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max,hourly_temp_min,rolling_avg_3hr,temp_lag_1,pressure_lag_1,humidity_lag_1,clouds_lag_1,wind_speed_lag_1,wind_deg_lag_1,temp_rolling_avg,humidity_rolling_avg,wind_speed_rolling_avg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
2019-03-01 00:00:00,-1.8,1011.6,89,78,6.9,261,Spring,0,4,0,False,False,False,True,0.0,3,60,-1.800000,-1.8,-1.8,,,,,,,,,,
2019-03-01 01:00:00,-1.5,1012.1,89,57,6.9,261,Spring,1,4,0,False,False,False,True,1.0,3,60,-1.650000,-1.5,-1.8,,-1.8,1011.6,89.0,78.0,6.9,261.0,,,
2019-03-01 02:00:00,-0.7,1012.1,95,73,9.7,270,Spring,2,4,0,False,False,False,True,2.0,3,60,-1.333333,-0.7,-1.8,-1.333333,-1.5,1012.1,89.0,57.0,6.9,261.0,,,
2019-03-01 03:00:00,-0.8,1012.1,93,90,10.5,276,Spring,3,4,0,False,False,False,True,3.0,3,60,-1.200000,-0.7,-1.8,-1.000000,-0.7,1012.1,95.0,73.0,9.7,270.0,,,
2019-03-01 04:00:00,-0.5,1012.1,96,91,9.7,274,Spring,4,4,0,False,False,False,True,4.0,3,60,-1.060000,-0.5,-1.8,-0.666667,-0.8,1012.1,93.0,90.0,10.5,276.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-01 19:00:00,1.1,1022.9,91,2,3.3,347,Spring,19,5,1,False,False,True,False,19.0,3,60,3.545000,7.2,-0.4,3.000000,2.3,1022.1,90.0,7.0,3.7,317.0,2.945833,78.333333,4.254167
2025-03-01 20:00:00,-0.1,1023.6,90,22,2.7,4,Spring,20,5,1,False,False,True,False,20.0,3,60,3.371429,7.2,-0.4,1.100000,1.1,1022.9,91.0,2.0,3.3,347.0,2.912500,78.333333,4.237500
2025-03-01 21:00:00,-0.7,1024.0,85,40,2.0,5,Spring,21,5,1,False,False,True,False,21.0,3,60,3.186364,7.2,-0.7,0.100000,-0.1,1023.6,90.0,22.0,2.7,4.0,2.887500,78.291667,4.216667
2025-03-01 22:00:00,-1.0,1024.2,82,25,2.2,9,Spring,22,5,1,False,False,False,True,22.0,3,60,3.004348,7.2,-1.0,-0.600000,-0.7,1024.0,85.0,40.0,2.0,5.0,2.866667,78.208333,4.216667


In [53]:
# Create target variables for the next day (shift by 24 hours)
df['target_temp'] = df['temp'].shift(-24)  # Shift by 24 hours to predict next day's temperature
df['target_pressure'] = df['pressure'].shift(-24)  # Same for pressure
df['target_humidity'] = df['humidity'].shift(-24)  # Same for humidity
df['target_clouds'] = df['clouds'].shift(-24)  # Same for clouds
df['target_wind_speed'] = df['wind_speed'].shift(-24)  # Same for wind speed
df['target_wind_deg'] = df['wind_deg'].shift(-24)  # Same for wind direction

In [54]:
df.tail()

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max,hourly_temp_min,rolling_avg_3hr,temp_lag_1,pressure_lag_1,humidity_lag_1,clouds_lag_1,wind_speed_lag_1,wind_deg_lag_1,temp_rolling_avg,humidity_rolling_avg,wind_speed_rolling_avg,target_temp,target_pressure,target_humidity,target_clouds,target_wind_speed,target_wind_deg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2025-03-01 19:00:00,1.1,1022.9,91,2,3.3,347,Spring,19,5,1,False,False,True,False,19.0,3,60,3.545,7.2,-0.4,3.0,2.3,1022.1,90.0,7.0,3.7,317.0,2.945833,78.333333,4.254167,,,,,,
2025-03-01 20:00:00,-0.1,1023.6,90,22,2.7,4,Spring,20,5,1,False,False,True,False,20.0,3,60,3.371429,7.2,-0.4,1.1,1.1,1022.9,91.0,2.0,3.3,347.0,2.9125,78.333333,4.2375,,,,,,
2025-03-01 21:00:00,-0.7,1024.0,85,40,2.0,5,Spring,21,5,1,False,False,True,False,21.0,3,60,3.186364,7.2,-0.7,0.1,-0.1,1023.6,90.0,22.0,2.7,4.0,2.8875,78.291667,4.216667,,,,,,
2025-03-01 22:00:00,-1.0,1024.2,82,25,2.2,9,Spring,22,5,1,False,False,False,True,22.0,3,60,3.004348,7.2,-1.0,-0.6,-0.7,1024.0,85.0,40.0,2.0,5.0,2.866667,78.208333,4.216667,,,,,,
2025-03-01 23:00:00,-1.3,1024.3,79,4,2.9,7,Spring,23,5,1,False,False,False,True,23.0,3,60,2.825,7.2,-1.3,-1.0,-1.0,1024.2,82.0,25.0,2.2,9.0,2.825,78.083333,4.216667,,,,,,


In [55]:
# Drop rows with NaN values caused by lag/rolling features
df.dropna(inplace=True)

df.head()

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max,hourly_temp_min,rolling_avg_3hr,temp_lag_1,pressure_lag_1,humidity_lag_1,clouds_lag_1,wind_speed_lag_1,wind_deg_lag_1,temp_rolling_avg,humidity_rolling_avg,wind_speed_rolling_avg,target_temp,target_pressure,target_humidity,target_clouds,target_wind_speed,target_wind_deg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2019-03-01 23:00:00,1.3,1017.0,96,100,5.8,292,Spring,23,4,0,False,False,False,True,23.0,3,60,0.966667,3.6,-1.8,0.966667,0.9,1017.4,94.0,100.0,6.6,299.0,0.966667,85.125,9.175,2.7,1016.2,99.0,100.0,6.9,231.0
2019-03-02 00:00:00,1.0,1016.8,98,100,7.0,282,Spring,0,5,1,False,False,False,True,0.0,3,61,1.0,1.0,1.0,1.066667,1.3,1017.0,96.0,100.0,5.8,292.0,1.083333,85.5,9.179167,3.1,1015.9,98.0,54.0,6.0,245.0
2019-03-02 01:00:00,1.8,1016.2,100,100,5.2,282,Spring,1,5,1,False,False,False,True,1.0,3,61,1.4,1.8,1.0,1.366667,1.0,1016.8,98.0,100.0,7.0,282.0,1.220833,85.958333,9.108333,2.9,1016.0,99.0,59.0,5.8,240.0
2019-03-02 02:00:00,2.2,1016.1,100,100,7.3,279,Spring,2,5,1,False,False,False,True,2.0,3,61,1.666667,2.2,1.0,1.666667,1.8,1016.2,100.0,100.0,5.2,282.0,1.341667,86.166667,9.008333,2.9,1015.7,98.0,100.0,7.6,225.0
2019-03-02 03:00:00,1.1,1015.7,100,100,7.6,273,Spring,3,5,1,False,False,False,True,3.0,3,61,1.525,2.2,1.0,1.7,2.2,1016.1,100.0,100.0,7.3,279.0,1.420833,86.458333,8.8875,3.9,1015.2,98.0,100.0,3.8,221.0


In [56]:
df.tail()

Unnamed: 0_level_0,temp,pressure,humidity,clouds,wind_speed,wind_deg,Season,hour_of_day,day_of_week,is_weekend,is_morning,is_afternoon,is_evening,is_night,elapsed_time_of_day,month_of_year,day_of_year,hourly_avg_temp,hourly_temp_max,hourly_temp_min,rolling_avg_3hr,temp_lag_1,pressure_lag_1,humidity_lag_1,clouds_lag_1,wind_speed_lag_1,wind_deg_lag_1,temp_rolling_avg,humidity_rolling_avg,wind_speed_rolling_avg,target_temp,target_pressure,target_humidity,target_clouds,target_wind_speed,target_wind_deg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2025-02-28 19:00:00,2.0,1019.7,91,97,3.1,336,Winter,19,4,0,False,False,True,False,19.0,2,59,3.575,8.4,-0.1,2.733333,1.6,1019.5,89.0,99.0,3.7,309.0,2.966667,57.333333,3.75,1.1,1022.9,91.0,2.0,3.3,347.0
2025-02-28 20:00:00,0.7,1020.0,90,31,3.1,350,Winter,20,4,0,False,False,True,False,20.0,2,59,3.438095,8.4,-0.1,1.433333,2.0,1019.7,91.0,97.0,3.1,336.0,3.0125,58.0,3.8125,-0.1,1023.6,90.0,22.0,2.7,4.0
2025-02-28 21:00:00,-0.1,1020.0,86,44,2.5,356,Winter,21,4,0,False,False,True,False,21.0,2,59,3.277273,8.4,-0.1,0.866667,0.7,1020.0,90.0,31.0,3.1,350.0,3.0125,58.666667,3.854167,-0.7,1024.0,85.0,40.0,2.0,5.0
2025-02-28 22:00:00,-0.5,1020.1,84,87,2.2,360,Winter,22,4,0,False,False,False,True,22.0,2,59,3.113043,8.4,-0.5,0.033333,-0.1,1020.0,86.0,44.0,2.5,356.0,2.9875,59.333333,3.891667,-1.0,1024.2,82.0,25.0,2.2,9.0
2025-02-28 23:00:00,-0.3,1019.9,82,47,2.9,11,Winter,23,4,0,False,False,False,True,23.0,2,59,2.970833,8.4,-0.5,-0.3,-0.5,1020.1,84.0,87.0,2.2,360.0,2.970833,59.916667,3.941667,-1.3,1024.3,79.0,4.0,2.9,7.0


In [57]:
# Save the DataFrame with all features to a CSV file
df.to_csv("processed_weather_data_6years.csv", index=False)