# Feature engineering

In [1]:
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt

## Load raw train data

In [2]:
# Read the v1 weather features + target table from the repo-relative data/ folder.
data = pd.read_csv(filepath_or_buffer='data/weather_features_target_v1.csv')

In [3]:
# Class balance of the label column (count of rows per target value).
data['target'].value_counts()

target
0    34096
1      992
Name: count, dtype: int64

In [4]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0,date,time,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,et0_fao_evapotranspiration,wind_speed_100m,wind_direction_100m,date_time,is_possible_ff,target
0,2020-01-01,00:00:00,58.7003,62.26887,45.8303,55.678288,0.0,0.004173,6.215479,239.74365,2020-01-01 00:00:00,False,0
1,2020-01-01,01:00:00,54.380302,61.066536,41.2403,50.51594,0.0,0.001316,4.52958,212.90524,2020-01-01 01:00:00,False,0
2,2020-01-01,02:00:00,57.0803,50.207703,38.7203,53.103203,0.0,0.000875,0.31636,224.9999,2020-01-01 02:00:00,False,0
3,2020-01-01,03:00:00,50.4203,65.64563,39.3503,46.027035,0.0,0.000634,4.343468,168.11136,2020-01-01 03:00:00,False,0
4,2020-01-01,04:00:00,53.1203,54.41129,37.1003,49.10236,0.0,0.000436,3.472758,194.93147,2020-01-01 04:00:00,False,0


In [5]:
# Drop redundant columns: `date` and `time` repeat what `date_time` already
# holds (see the preview above); `is_possible_ff` is dropped alongside them.
#
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell on an already-trimmed frame is a no-op rather than a KeyError.
duplicate_col = ['date', 'time', 'is_possible_ff']
data = data.drop(columns=duplicate_col, errors='ignore')

## Time based features

In [9]:
# Parse `date_time` into a proper datetime dtype (it must not stay as object),
# which the `.dt` accessor used for the time-based features requires.
data['date_time'] = data['date_time'].pipe(pd.to_datetime)

In [10]:
# All calendar features below are read from the parsed `date_time` column
# through one shared `.dt` accessor.
timestamp_parts = data['date_time'].dt

# Seasonal patterns
data['year'] = timestamp_parts.year
data['month'] = timestamp_parts.month
data['day_of_year'] = timestamp_parts.dayofyear

# Intra-day patterns
data['hour'] = timestamp_parts.hour

# Weekly patterns
data['day_of_week'] = timestamp_parts.dayofweek  # Monday=0, Sunday=6
# NOTE(review): isocalendar() presumably yields a nullable unsigned-int column,
# unlike the plain ints above - confirm downstream models accept that dtype.
data['week_of_year'] = timestamp_parts.isocalendar()['week']

# Month-boundary flags
data['is_month_start'] = timestamp_parts.is_month_start
data['is_month_end'] = timestamp_parts.is_month_end

In [11]:
# Preview the first five rows again to confirm the calendar columns were added.
data.head(n=5)

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,et0_fao_evapotranspiration,wind_speed_100m,wind_direction_100m,date_time,target,year,month,day_of_year,hour,day_of_week,week_of_year,is_month_start,is_month_end
0,58.7003,62.26887,45.8303,55.678288,0.0,0.004173,6.215479,239.74365,2020-01-01 00:00:00,0,2020,1,1,0,2,1,True,False
1,54.380302,61.066536,41.2403,50.51594,0.0,0.001316,4.52958,212.90524,2020-01-01 01:00:00,0,2020,1,1,1,2,1,True,False
2,57.0803,50.207703,38.7203,53.103203,0.0,0.000875,0.31636,224.9999,2020-01-01 02:00:00,0,2020,1,1,2,2,1,True,False
3,50.4203,65.64563,39.3503,46.027035,0.0,0.000634,4.343468,168.11136,2020-01-01 03:00:00,0,2020,1,1,3,2,1,True,False
4,53.1203,54.41129,37.1003,49.10236,0.0,0.000436,3.472758,194.93147,2020-01-01 04:00:00,0,2020,1,1,4,2,1,True,False


Keep adding more feature engineering code here.

## Finding feature importance

LOFO (Leave One Feature Out) Importance calculates the importance of each feature in a set, for a model and a metric of choice. It does so by iteratively removing each feature from the set and evaluating the performance of the model, under a validation scheme of choice, on the chosen metric.

LOFO first evaluates the performance of the model with all the input features included, then iteratively removes one feature at a time, retrains the model, and evaluates its performance on a validation set. The mean and standard deviation (across the folds) of the importance of each feature are then reported.

If a model is not passed as an argument to LOFO Importance, it will run LightGBM as a default model.

In [12]:
%pip install lofo-importance

Collecting lofo-importance
  Downloading lofo_importance-0.3.4-py3-none-any.whl.metadata (6.6 kB)
Collecting scipy (from lofo-importance)
  Using cached scipy-1.12.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting scikit-learn>=0.20.3 (from lofo-importance)
  Using cached scikit_learn-1.4.1.post1-cp39-cp39-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting lightgbm (from lofo-importance)
  Downloading lightgbm-4.3.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting networkx (from lofo-importance)
  Using cached networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting joblib>=1.2.0 (from scikit-learn>=0.20.3->lofo-importance)
  Usin