In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from feature_engine.datetime import DatetimeFeatures
from feature_engine.timeseries.forecasting import LagFeatures

import os,glob

In [2]:
df = pd.read_csv('../Data/results/Train[updated].csv',parse_dates=['date'])
df.head(3)

Unnamed: 0,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,...,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation,date
0,-15.858835,29.237029,0.0,1195,0,263,1195,206,10,1692,...,0.0,0.018654,0.0,0.714446,0.012174,0.24489,0.009836,0.0,0.256932,2001-01-01
1,-15.858835,29.487029,0.0,1196,0,232,1196,201,10,1859,...,0.0,0.0,0.0,0.654783,9.5e-05,0.345121,0.0,0.0,0.273093,2001-01-01
2,-15.858835,29.737029,0.0,1190,0,314,1190,192,10,1677,...,0.0,0.0,0.0,0.516421,0.0,0.483579,0.0,0.0,0.285109,2001-01-01


# Drop the columns

EDA showed that we should drop to columns as they do not have any correlation with outcome

In [3]:
df.drop(['climate_swe', 'landcover_3'], axis=1, inplace=True)

# Temporal Features

There are monthly, yearly, and seasonal trends in burnt areas; therefore, the Month of Year (MOY), and seasonal indicators can be generated.

In [4]:
FTs_To_Extract = ["month", "quarter"]

pipe = Pipeline([
    
    # create datetime features.
    ('date', DatetimeFeatures(
        variables="date",
        features_to_extract=FTs_To_Extract,
        drop_original=False
    )),
])

# Extract features.
dtfs = pipe.fit_transform(df)

dtfs.iloc[:,-2:].describe().loc[['min','max'],:]

Unnamed: 0,date_month,date_quarter
min,1.0,1.0
max,12.0,4.0


# Lag Features

In [16]:
# dtfs.index.name = 'index'
climate_columns = dtfs[[col for col in dtfs.columns if col.startswith(("climate_", "lat","lo", "p", "date_m"))]]

In [17]:
month_avg = climate_columns.groupby(['lat', 'lon','date_month']).mean().reset_index()
month_avg.index.name = 'index'
month_avg.head(14)

Unnamed: 0_level_0,lat,lon,date_month,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,precipitation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,-22.358835,31.237029,1,662.307692,1042.923077,-56.076923,1705.076923,69.615385,3.307692,19.923077,2515.461538,221.384615,359.0,2453.769231,186.307692,178.769231,0.161206
1,-22.358835,31.237029,2,669.538462,846.615385,-123.769231,1516.076923,71.538462,3.538462,30.769231,2642.846154,224.230769,352.846154,2540.923077,169.769231,158.076923,0.072256
2,-22.358835,31.237029,3,328.461538,1114.692308,-128.461538,1443.153846,34.230769,1.692308,26.692308,2230.0,205.384615,343.384615,2337.923077,160.692308,174.076923,0.060118
3,-22.358835,31.237029,4,204.0,902.923077,-142.923077,1106.692308,21.230769,1.0,24.230769,1895.076923,182.923077,330.769231,2093.769231,150.615385,138.846154,0.030479
4,-22.358835,31.237029,5,66.538462,834.0,-127.923077,900.538462,6.846154,0.230769,22.384615,1803.538462,141.307692,305.923077,1629.307692,138.923077,104.615385,0.010238
5,-22.358835,31.237029,6,55.923077,690.0,-115.076923,746.076923,5.769231,0.230769,20.846154,1584.153846,114.307692,287.307692,1407.230769,125.846154,116.153846,0.006218
6,-22.358835,31.237029,7,17.923077,835.307692,-104.307692,853.384615,1.769231,0.0,19.846154,1652.307692,105.461538,288.769231,1334.692308,130.769231,137.615385,0.00504
7,-22.358835,31.237029,8,15.923077,1147.153846,-112.692308,1163.153846,1.461538,0.076923,18.769231,1977.0,127.846154,313.230769,1447.615385,159.846154,174.230769,0.003255
8,-22.358835,31.237029,9,96.461538,1346.230769,-130.923077,1442.692308,10.076923,0.384615,18.076923,2371.923077,157.769231,329.384615,1632.076923,179.384615,198.846154,0.013364
9,-22.358835,31.237029,10,192.615385,1498.0,-137.307692,1690.615385,20.384615,0.923077,17.153846,2450.230769,193.615385,355.384615,1978.538462,207.076923,201.692308,0.028644


In [18]:
lags = [1,2]
lag_cols = month_avg.drop('date_month',axis=1).columns
for l in lags:
    # Shift the timeseries index to get the lagged versions
    df_shift = month_avg[lag_cols].shift(periods=l)
    # Join back to the original dataframe
    month_avg = month_avg.merge(df_shift, on=['lat','lon','index'], how="left", suffixes=("", f"_lag_{l}"))

In [19]:
month_avg[['lat','lon','date_month','climate_aet','climate_aet_lag_1','climate_aet_lag_2']].head(14)

Unnamed: 0_level_0,lat,lon,date_month,climate_aet,climate_aet_lag_1,climate_aet_lag_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-22.358835,31.237029,1,662.307692,,
1,-22.358835,31.237029,2,669.538462,662.307692,
2,-22.358835,31.237029,3,328.461538,669.538462,662.307692
3,-22.358835,31.237029,4,204.0,328.461538,669.538462
4,-22.358835,31.237029,5,66.538462,204.0,328.461538
5,-22.358835,31.237029,6,55.923077,66.538462,204.0
6,-22.358835,31.237029,7,17.923077,55.923077,66.538462
7,-22.358835,31.237029,8,15.923077,17.923077,55.923077
8,-22.358835,31.237029,9,96.461538,15.923077,17.923077
9,-22.358835,31.237029,10,192.615385,96.461538,15.923077


In [20]:
landcover_df = dtfs.drop(climate_columns.columns[2:-1],axis=1)

In [21]:
data = month_avg.merge(landcover_df, on=['lat','lon','date_month'], how="left")

In [22]:
data.head()

Unnamed: 0,lat,lon,date_month,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,...,landcover_0,landcover_1,landcover_2,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,date,date_quarter
0,-22.358835,31.237029,1,662.307692,1042.923077,-56.076923,1705.076923,69.615385,3.307692,19.923077,...,0.0,0.0,0.003074,0.184807,0.004725,0.807394,0.0,0.0,2001-01-01,1
1,-22.358835,31.237029,1,662.307692,1042.923077,-56.076923,1705.076923,69.615385,3.307692,19.923077,...,0.0,0.0,0.003381,0.157698,0.004725,0.834195,0.0,0.0,2002-01-01,1
2,-22.358835,31.237029,1,662.307692,1042.923077,-56.076923,1705.076923,69.615385,3.307692,19.923077,...,0.0,0.0,0.003996,0.158682,0.00411,0.833212,0.0,0.0,2003-01-01,1
3,-22.358835,31.237029,1,662.307692,1042.923077,-56.076923,1705.076923,69.615385,3.307692,19.923077,...,0.0,0.0,0.003381,0.162053,0.004215,0.83035,0.0,0.0,2004-01-01,1
4,-22.358835,31.237029,1,662.307692,1042.923077,-56.076923,1705.076923,69.615385,3.307692,19.923077,...,0.0,0.0,0.003381,0.137911,0.004215,0.854493,0.0,0.0,2005-01-01,1


In [84]:
data.isna().sum().sum()

291018

In [85]:
data.dropna(inplace=True)

In [86]:
data.isna().sum().sum()

0

In [87]:
data.to_csv('../Data/results/Train[FeatEng].csv',index=False)