# Feature Engineering

In [41]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [42]:
df = pd.read_csv("../data/cleaned_weather.csv")
df.columns

Index(['country', 'location_name', 'latitude', 'longitude', 'timezone',
       'last_updated', 'temperature_celsius', 'condition_text', 'wind_kph',
       'wind_degree', 'wind_direction', 'pressure_mb', 'precip_mm', 'humidity',
       'cloud', 'feels_like_celsius', 'visibility_km', 'uv_index', 'gust_kph',
       'air_quality_Carbon_Monoxide', 'air_quality_Ozone',
       'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide',
       'air_quality_PM2.5', 'air_quality_PM10'],
      dtype='object')

## Create Lag Features (Past Time Steps as Input)

- LSTM and Transformers need past data to learn temporal dependencies.
- Adding lag features allows the model to learn from past weather conditions.

In [43]:
lag_days = 7
for i in range(1, lag_days + 1):
    df[f'temperature_lag_{i}'] = df['temperature_celsius'].shift(i)
    df[f'humidity_lag_{i}'] = df['humidity'].shift(i)

In [44]:
df.head(10)

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated,temperature_celsius,condition_text,wind_kph,wind_degree,...,temperature_lag_3,humidity_lag_3,temperature_lag_4,humidity_lag_4,temperature_lag_5,humidity_lag_5,temperature_lag_6,humidity_lag_6,temperature_lag_7,humidity_lag_7
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,2024-05-16 13:15,26.6,Partly Cloudy,13.3,338,...,,,,,,,,,,
1,Albania,Tirana,41.33,19.82,Europe/Tirane,2024-05-16 10:45,19.0,Partly cloudy,11.2,320,...,,,,,,,,,,
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,2024-05-16 09:45,23.0,Sunny,15.1,280,...,,,,,,,,,,
3,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,2024-05-16 10:45,6.3,Light drizzle,11.9,215,...,26.6,24.0,,,,,,,,
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,2024-05-16 09:45,26.0,Partly cloudy,13.0,150,...,19.0,94.0,26.6,24.0,,,,,,
5,Antigua and Barbuda,Saint John's,17.12,-61.85,America/Antigua,2024-05-16 04:45,26.0,Partly cloudy,9.0,90,...,23.0,29.0,19.0,94.0,26.6,24.0,,,,
6,Argentina,Buenos Aires,-34.59,-58.67,America/Argentina/Buenos_Aires,2024-05-16 05:45,8.0,Clear,3.6,10,...,6.3,61.0,23.0,29.0,19.0,94.0,26.6,24.0,,
7,Armenia,Yerevan,40.18,44.51,Asia/Yerevan,2024-05-16 12:45,19.0,Partly cloudy,6.8,140,...,26.0,89.0,6.3,61.0,23.0,29.0,19.0,94.0,26.6,24.0
8,Australia,Canberra,-35.28,149.22,Australia/Sydney,2024-05-16 18:45,9.0,Clear,4.0,100,...,26.0,84.0,26.0,89.0,6.3,61.0,23.0,29.0,19.0,94.0
9,Austria,Vienna,48.2,16.37,Europe/Vienna,2024-05-16 10:45,16.0,Partly cloudy,20.2,110,...,8.0,93.0,26.0,84.0,26.0,89.0,6.3,61.0,23.0,29.0


##  Rolling Window Statistics (Smoothing & Trend Capturing)
- Rolling mean captures trends, while rolling standard deviation detects fluctuations and anomalies over a time window.

In [45]:
df['temperature_rolling_mean_3d'] = df['temperature_celsius'].rolling(window=3).mean()
df['temperature_rolling_std_7d'] = df['temperature_celsius'].rolling(window=7).std()
df['humidity_rolling_mean_3d'] = df['humidity'].rolling(window=3).mean()
df['humidity_rolling_std_7d'] = df['humidity'].rolling(window=7).std()

In [46]:
df

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated,temperature_celsius,condition_text,wind_kph,wind_degree,...,temperature_lag_5,humidity_lag_5,temperature_lag_6,humidity_lag_6,temperature_lag_7,humidity_lag_7,temperature_rolling_mean_3d,temperature_rolling_std_7d,humidity_rolling_mean_3d,humidity_rolling_std_7d
0,Afghanistan,Kabul,34.5200,69.1800,Asia/Kabul,2024-05-16 13:15,26.6,Partly Cloudy,13.3,338,...,,,,,,,,,,
1,Albania,Tirana,41.3300,19.8200,Europe/Tirane,2024-05-16 10:45,19.0,Partly cloudy,11.2,320,...,,,,,,,,,,
2,Algeria,Algiers,36.7600,3.0500,Africa/Algiers,2024-05-16 09:45,23.0,Sunny,15.1,280,...,,,,,,,22.866667,,49.000000,
3,Andorra,Andorra La Vella,42.5000,1.5200,Europe/Andorra,2024-05-16 10:45,6.3,Light drizzle,11.9,215,...,,,,,,,16.100000,,61.333333,
4,Angola,Luanda,-8.8400,13.2300,Africa/Luanda,2024-05-16 09:45,26.0,Partly cloudy,13.0,150,...,,,,,,,18.433333,,59.666667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56901,Venezuela,Caracas,10.5000,-66.9167,America/Caracas,2025-03-05 06:00,24.4,Clear,3.6,55,...,7.1,81.0,30.4,24.0,7.3,64.0,20.900000,10.186592,62.666667,30.885349
56902,Vietnam,Hanoi,21.0333,105.8500,Asia/Bangkok,2025-03-05 17:00,26.4,Partly cloudy,8.6,70,...,4.6,95.0,7.1,81.0,30.4,24.0,25.666667,9.482565,79.000000,23.408383
56903,Yemen,Sanaa,15.3547,44.2067,Asia/Aden,2025-03-05 13:00,24.6,Patchy rain nearby,18.4,243,...,23.3,100.0,4.6,95.0,7.1,81.0,25.133333,8.465757,57.333333,30.772591
56904,Zambia,Lusaka,-15.4167,28.2833,Africa/Lusaka,2025-03-05 12:00,23.4,Patchy rain nearby,14.8,92,...,12.1,30.0,23.3,100.0,4.6,95.0,24.800000,4.921527,58.000000,28.717010


## Seasonal Features (Month, Day, Hour Cyclic Encoding)
- Weather follows seasonal trends (monthly, weekly, daily). 
- Using plain integers makes the model think that January (1) and December (12) are far apart, when in reality, they are close in seasonal cycles.

For example:

1. If we train a model with month = 12 (December) and month = 1 (January), it will assume these values are very different.
2. But in reality, December and January are neighbors in the annual weather cycle.

In [47]:
df['last_updated'] = pd.to_datetime(df['last_updated'])
df.set_index('last_updated', inplace=True)

df['month'] = df.index.month
df['day_of_week'] = df.index.dayofweek
df['hour_of_day'] = df.index.hour

# sine & cosine formulas for cyclic encoding
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24)


In [48]:
df.head()

Unnamed: 0_level_0,country,location_name,latitude,longitude,timezone,temperature_celsius,condition_text,wind_kph,wind_degree,wind_direction,...,temperature_rolling_std_7d,humidity_rolling_mean_3d,humidity_rolling_std_7d,month,day_of_week,hour_of_day,month_sin,month_cos,hour_sin,hour_cos
last_updated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-05-16 13:15:00,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,26.6,Partly Cloudy,13.3,338,NNW,...,,,,5,3,13,0.5,-0.866025,-0.258819,-0.965926
2024-05-16 10:45:00,Albania,Tirana,41.33,19.82,Europe/Tirane,19.0,Partly cloudy,11.2,320,NW,...,,,,5,3,10,0.5,-0.866025,0.5,-0.866025
2024-05-16 09:45:00,Algeria,Algiers,36.76,3.05,Africa/Algiers,23.0,Sunny,15.1,280,W,...,,49.0,,5,3,9,0.5,-0.866025,0.707107,-0.707107
2024-05-16 10:45:00,Andorra,Andorra La Vella,42.5,1.52,Europe/Andorra,6.3,Light drizzle,11.9,215,SW,...,,61.333333,,5,3,10,0.5,-0.866025,0.5,-0.866025
2024-05-16 09:45:00,Angola,Luanda,-8.84,13.23,Africa/Luanda,26.0,Partly cloudy,13.0,150,SSE,...,,59.666667,,5,3,9,0.5,-0.866025,0.707107,-0.707107


In [49]:
df.isna().sum().sum()

np.int64(72)

In [50]:
df.dropna(inplace=True)

In [51]:
df.isna().sum().sum()

np.int64(0)

## Weather-Specific Features (Custom Domain Features)
- Humidity × Temperature captures heat 
- Wind Speed categorizes wind intensity
- Dew Point measures moisture in the air


In [52]:
df['humidity_temperature_interaction'] = df['humidity'] * df['temperature_celsius']
df['wind_category'] = pd.cut(df['wind_kph'], bins=[0, 10, 20, 30], labels=['low', 'moderate', 'high'])
df['dew_point'] = df['temperature_celsius'] - ((100 - df['humidity']) / 5)


##  Outlier Handling using IQR (Interquartile Range)
Extreme outliers distort forecasting models, so we remove them using IQR.

In [53]:
Q1 = df['temperature_celsius'].quantile(0.25)
Q3 = df['temperature_celsius'].quantile(0.75)
IQR = Q3 - Q1
outlier_condition = (df['temperature_celsius'] < (Q1 - 1.5 * IQR)) | (df['temperature_celsius'] > (Q3 + 1.5 * IQR))
df = df[~outlier_condition]


## Smoothing using Exponential Moving Average (EMA)

EMA assigns more weight to recent values, making it useful for capturing short-term trends.

In [54]:
df['temperature_ema'] = df['temperature_celsius'].ewm(span=5, adjust=False).mean()
df['humidity_ema'] = df['humidity'].ewm(span=5, adjust=False).mean()


In [55]:
df.head()

Unnamed: 0_level_0,country,location_name,latitude,longitude,timezone,temperature_celsius,condition_text,wind_kph,wind_degree,wind_direction,...,hour_of_day,month_sin,month_cos,hour_sin,hour_cos,humidity_temperature_interaction,wind_category,dew_point,temperature_ema,humidity_ema
last_updated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-05-16 12:45:00,Armenia,Yerevan,40.18,44.51,Asia/Yerevan,19.0,Partly cloudy,6.8,140,SE,...,12,0.5,-0.866025,1.224647e-16,-1.0,760.0,low,7.0,19.0,40.0
2024-05-16 18:45:00,Australia,Canberra,-35.28,149.22,Australia/Sydney,9.0,Clear,4.0,100,E,...,18,0.5,-0.866025,-1.0,-1.83697e-16,783.0,low,6.4,15.666667,55.666667
2024-05-16 10:45:00,Austria,Vienna,48.2,16.37,Europe/Vienna,16.0,Partly cloudy,20.2,110,ESE,...,10,0.5,-0.866025,0.5,-0.8660254,1008.0,high,8.6,15.777778,58.111111
2024-05-16 12:45:00,Azerbaijan,Baku,40.4,49.88,Asia/Baku,17.0,Partly cloudy,6.8,20,NNE,...,12,0.5,-0.866025,1.224647e-16,-1.0,1156.0,low,10.6,16.185185,61.407407
2024-05-16 04:45:00,Bahamas,Nassau,25.08,-77.35,America/Nassau,27.0,Partly cloudy,25.9,180,S,...,4,0.5,-0.866025,0.8660254,0.5,2403.0,high,24.8,19.790123,70.604938


In [56]:
df.isna().sum().sum()

np.int64(2560)

In [57]:
df.fillna(method='bfill', inplace=True)

In [58]:
df.isna().sum().sum()

np.int64(0)

In [None]:
df.to_csv("../data/feature_engineered_cleaned.csv")