# Feature Engineering

In [1]:
%matplotlib inline
import matplotlib as mpl
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
import os

## Loading Data

In [2]:
# The merged dataset lives at <current working directory>/data/merged.csv.
input_file_name = "merged.csv"
input_file_path = os.path.join(os.getcwd(), "data", input_file_name)

# The first CSV column becomes the index; parse_dates=True asks pandas to parse it as dates.
data = pd.read_csv(input_file_path, parse_dates=True, index_col=0)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,id14061671,id14061684,id14061709,id14061716,id14061685,id14061674,id14061721,id14061680,id14061714,id11010024,...,rh_termin,rh,rhmin,rhmax,padavine,veter_hitrost,veter_vek_smer,veter_max_hitrost,energija_gl,energija_di
2011-01-01 00:30:00,0,0,75,15,37,18,13,4,0,1,...,95,95,95,95,0,1.3,100.0,1.9,0,0
2011-01-01 00:45:00,0,0,79,15,33,17,10,4,0,0,...,95,95,95,95,0,1.0,102.0,1.65,0,0
2011-01-01 01:00:00,0,0,77,15,32,21,12,4,0,0,...,95,95,95,95,0,0.7,104.0,1.4,0,0
2011-01-01 01:15:00,0,0,81,16,36,19,13,4,0,1,...,95,95,95,95,0,0.85,108.5,1.5,0,0
2011-01-01 01:30:00,0,0,82,14,34,18,10,4,0,0,...,95,95,95,95,0,1.0,113.0,1.6,0,0


## Adding DateTime Features

In [3]:
# Calendar features read directly off the DatetimeIndex.
data["HourOfDay"] = data.index.hour  # Could try cyclic hours
data["DayOfWeek"] = data.index.dayofweek  # pandas convention: Monday=0 ... Sunday=6
data["Month"] = data.index.month  # month of year [1-12]

# Weekday/Weekend flags computed with a vectorised comparison instead of a
# per-row Python lambda through .apply: same 0/1 int64 values, no Python-level loop.
data["Weekday"] = (data["DayOfWeek"] < 5).astype("int64")  # 1 for Monday-Friday
data["Weekend"] = 1 - data["Weekday"]  # exact complement of Weekday


Check that the flags switch correctly across a weekday-to-weekend boundary (Friday night into Saturday).

In [4]:
# Rows around midnight between 2014-05-02 and 2014-05-03, restricted to the
# calendar feature columns, selected with a single label-based lookup.
data.loc[
    '2014-05-02 23:30:00':'2014-05-03 00:30:00',
    ['HourOfDay', 'DayOfWeek', 'Month', 'Weekday', 'Weekend'],
]

Unnamed: 0,HourOfDay,DayOfWeek,Month,Weekday,Weekend
2014-05-02 23:30:00,23,4,5,1,0
2014-05-02 23:45:00,23,4,5,1,0
2014-05-03 00:00:00,0,5,5,0,1
2014-05-03 00:15:00,0,5,5,0,1
2014-05-03 00:30:00,0,5,5,0,1


## Adding Holidays

In [5]:
import json
from pprint import pprint

# The holiday calendar lives at <current working directory>/data/holidays.txt
# and is parsed as JSON.
input_file_name = "holidays.txt"
input_file_path = os.path.join(os.getcwd(), "data", input_file_name)

# Decode explicitly as UTF-8 (the encoding JSON mandates) instead of relying on
# the platform's locale default, which differs between operating systems.
with open(input_file_path, encoding="utf-8") as data_file:
    holiday_data = json.load(data_file)

# Uncomment to inspect the parsed records:
#pprint(holiday_data)

# Each record contributes its "DateString" field.
# NOTE(review): presumably ISO-formatted dates that numpy can parse - confirm.
holidays_list = [rec["DateString"] for rec in holiday_data]

# Truncate every timestamp to its calendar day, then flag days that fall on
# numpy's default Monday-Friday week mask and are not in the holiday list
# (1 = business day, 0 = weekend or holiday).
data["BusinessDay"] = np.is_busday(
    data.index.values.astype('datetime64[D]'),
    holidays=holidays_list,
).astype(int)

Check whether the first of May, and the day before it, are flagged correctly in `BusinessDay`.

In [6]:
# Rows around midnight between 2014-04-30 and 2014-05-01, restricted to the
# calendar columns plus the BusinessDay flag, selected with a single label-based lookup.
data.loc[
    '2014-04-30 23:30:00':'2014-05-01 00:30:00',
    ['HourOfDay', 'DayOfWeek', 'Month', 'Weekday', 'BusinessDay'],
]

Unnamed: 0,HourOfDay,DayOfWeek,Month,Weekday,BusinessDay
2014-04-30 23:30:00,23,2,4,1,1
2014-04-30 23:45:00,23,2,4,1,1
2014-05-01 00:00:00,0,3,5,1,0
2014-05-01 00:15:00,0,3,5,1,0
2014-05-01 00:30:00,0,3,5,1,0


## Exporting dataset

In [7]:
# The enriched dataset is written to <current working directory>/data/enriched.csv.
# output_file_name starts as the bare file name and is then rebound to the full path.
output_file_name = "enriched.csv"
output_file_name = os.path.join(os.getcwd(), "data", output_file_name)

# Write the frame, including its index, as CSV.
data.to_csv(output_file_name)