<a href="https://colab.research.google.com/github/benman1/python-time-series/blob/master/notebooks/Feature_Engineering.ipynb" target="_parent\"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is mainly about derived date features.

## Holidays

In [1]:
from workalendar.europe.united_kingdom import UnitedKingdom
UnitedKingdom().holidays()

[(datetime.date(2023, 1, 1), 'New year'),
 (datetime.date(2023, 1, 2), 'New Year shift'),
 (datetime.date(2023, 4, 7), 'Good Friday'),
 (datetime.date(2023, 4, 9), 'Easter Sunday'),
 (datetime.date(2023, 4, 10), 'Easter Monday'),
 (datetime.date(2023, 5, 1), 'Early May Bank Holiday'),
 (datetime.date(2023, 5, 29), 'Spring Bank Holiday'),
 (datetime.date(2023, 8, 28), 'Late Summer Bank Holiday'),
 (datetime.date(2023, 12, 25), 'Christmas Day'),
 (datetime.date(2023, 12, 26), 'Boxing Day')]

In [2]:
from typing import List
from dateutil.relativedelta import relativedelta, TH
import datetime
from workalendar.usa import California

def create_custom_holidays(year) -> List:
      custom_holidays = California().holidays()
      custom_holidays.append((
        (datetime.datetime(year, 11, 1) + relativedelta(weekday=TH(+4)) + datetime.timedelta(days=1)).date(),
        "Black Friday"
      ))
      return {k: v for (k, v) in custom_holidays}

custom_holidays = create_custom_holidays(2021)

In [3]:
custom_holidays

{datetime.date(2023, 1, 1): 'New year',
 datetime.date(2023, 1, 2): 'New year (Observed)',
 datetime.date(2023, 1, 16): 'Birthday of Martin Luther King, Jr.',
 datetime.date(2023, 2, 20): "Washington's Birthday",
 datetime.date(2023, 3, 31): 'Cesar Chavez Day',
 datetime.date(2023, 5, 29): 'Memorial Day',
 datetime.date(2023, 7, 4): 'Independence Day',
 datetime.date(2023, 9, 4): 'Labor Day',
 datetime.date(2023, 11, 10): 'Veterans Day (Observed)',
 datetime.date(2023, 11, 11): 'Veterans Day',
 datetime.date(2023, 11, 23): 'Thanksgiving Day',
 datetime.date(2023, 11, 24): 'Thanksgiving Friday',
 datetime.date(2023, 12, 25): 'Christmas Day',
 datetime.date(2021, 11, 26): 'Black Friday'}

In [4]:
def is_holiday(current_date: datetime.date):
    """Determine if we have a holiday."""
    return custom_holidays.get(current_date, False)

today = datetime.date(2021, 4, 11)
is_holiday(today)

False

## Date Annotations

In [5]:
import calendar

calendar.monthrange(2021, 1)

(4, 31)

In [6]:
from datetime import date
def year_anchor(current_date: datetime.date):
      return (
        (current_date - date(current_date.year, 1, 1)).days,
        (date(current_date.year, 12, 31) - current_date).days,
      )

year_anchor(today)


(100, 264)

In [7]:
def month_anchor(current_date: datetime.date):
      last_day = calendar.monthrange(current_date.year, current_date.month)[0]
    
      return (
        (current_date - datetime.date(current_date.year, current_date.month, 1)).days,
        (current_date - datetime.date(current_date.year, current_date.month, last_day)).days,
      )

month_anchor(today)


(10, 8)

## Paydays

In [8]:
def get_last_friday(current_date: datetime.date, weekday=calendar.FRIDAY):
      return max(week[weekday]
        for week in calendar.monthcalendar(
            current_date.year, current_date.month
        ))

get_last_friday(today)


30

## Seasons

In [9]:
YEAR = 2021
seasons = [
    ('winter', (date(YEAR,  1,  1),  date(YEAR,  3, 20))),
    ('spring', (date(YEAR,  3, 21),  date(YEAR,  6, 20))),
    ('summer', (date(YEAR,  6, 21),  date(YEAR,  9, 22))),
    ('autumn', (date(YEAR,  9, 23),  date(YEAR, 12, 20))),
    ('winter', (date(YEAR, 12, 21),  date(YEAR, 12, 31)))
]

def is_in_interval(current_date: datetime.date, seasons):
      return next(season for season, (start, end) in seasons
                if start <= current_date.replace(year=YEAR) <= end)
    
is_in_interval(today, seasons)


'spring'

## Sun and Moon

In [11]:
from astral.sun import sun
from astral import LocationInfo
CITY = LocationInfo("London", "England", "Europe/London", 51.5, -0.116)
def get_sunrise_dusk(current_date: datetime.date, city_name='London'):
      s = sun(CITY.observer, date=current_date)
      sunrise = s['sunrise']
      dusk = s['dusk']
      return (sunrise - dusk).seconds / 3600

get_sunrise_dusk(today)


9.788055555555555

## Business Days

In [12]:
import pandas as pd

def get_business_days(current_date: datetime.date):
    last_day = calendar.monthrange(current_date.year, current_date.month)[1]
    rng = pd.date_range(current_date.replace(day=1), periods=last_day, freq='D')
    business_days = pd.bdate_range(rng[0], rng[-1])
    return len(business_days), last_day - len(business_days)

get_business_days(date.today())


(20, 8)

# Automated Feature Extraction

In [13]:
import featuretools as ft
from featuretools.primitives import Minute, Hour, Day, Month, Year, Weekday

data = pd.DataFrame(
    {'Time': ['2014-01-01 01:41:50',
              '2014-01-01 02:06:50',
              '2014-01-01 02:31:50',
              '2014-01-01 02:56:50',
              '2014-01-01 03:21:50'],
     'Target': [0, 0, 0, 0, 1]}
)        
data['index'] = data.index
es = ft.EntitySet('My EntitySet')
es.add_dataframe(
    dataframe_name='main_data_table',
    index='index',
    dataframe=data,
    time_index='Time'
)
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name="main_data_table")




In [14]:
feature_matrix

Unnamed: 0_level_0,Target,DAY(Time),MONTH(Time),WEEKDAY(Time),YEAR(Time)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,1,1,2,2014
1,0,1,1,2,2014
2,0,1,1,2,2014
3,0,1,1,2,2014
4,1,1,1,2,2014


In [15]:
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

settings = ComprehensiveFCParameters()
extract_features(data, column_id='Time', default_fc_parameters=settings)


Feature Extraction: 100%|████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.79it/s]


Unnamed: 0,Target__variance_larger_than_standard_deviation,Target__has_duplicate_max,Target__has_duplicate_min,Target__has_duplicate,Target__sum_values,Target__abs_energy,Target__mean_abs_change,Target__mean_change,Target__mean_second_derivative_central,Target__median,...,index__fourier_entropy__bins_5,index__fourier_entropy__bins_10,index__fourier_entropy__bins_100,index__permutation_entropy__dimension_3__tau_1,index__permutation_entropy__dimension_4__tau_1,index__permutation_entropy__dimension_5__tau_1,index__permutation_entropy__dimension_6__tau_1,index__permutation_entropy__dimension_7__tau_1,index__query_similarity_count__query_None__threshold_0.0,index__mean_n_absolute_max__number_of_maxima_7
2014-01-01 01:41:50,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,...,,,,,,,,,,
2014-01-01 02:06:50,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,...,,,,,,,,,,
2014-01-01 02:31:50,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,...,,,,,,,,,,
2014-01-01 02:56:50,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,...,,,,,,,,,,
2014-01-01 03:21:50,0.0,0.0,0.0,0.0,1.0,1.0,,,,1.0,...,,,,,,,,,,
