# Data Feature Extraction

This notebook helps extract features such as is_holiday, wday (weekday), etc. from "periods_train.csv" or "periods_test.csv" for the final training of the model, given an item_id.

## Note: 
This is a practice notebook. For the result

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
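# Full training set -- load disabled; this practice notebook works on the 10-row sample read below.
# train = pd.read_csv('data/periods_train.csv')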

In [3]:
# Full test set -- load disabled in this practice notebook.
# test = pd.read_csv('data/periods_test.csv')

In [4]:
# Read the small sample of the periods data that the rest of the notebook
# operates on; the date columns are still plain strings at this point.
df = pd.read_csv(filepath_or_buffer='data/periods_train_10_only.csv')

In [5]:
# Show the freshly loaded frame (one row per item_id with its three date columns).
df

Unnamed: 0,item_id,activation_date,date_from,date_to
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28
5,47081d25b7cb,2017-02-19,2017-03-15,2017-03-20
6,5a328b4a8be1,2017-02-20,2017-03-15,2017-03-21
7,87292b76e4e1,2017-03-17,2017-03-17,2017-03-28
8,2ccefc8a940b,2017-02-19,2017-03-15,2017-03-21
9,ded64e61cd85,2017-03-13,2017-03-15,2017-03-28


## Initial preparation and sanity checks

In [6]:
# Convert the three date columns from strings to datetime values.
# pd.to_datetime is called once per column (vectorized) instead of once per
# row via Series.apply, which gives the same values but scales to the full
# periods files. Re-running this cell on already-parsed columns is harmless.
for col in ['activation_date', 'date_from', 'date_to']:
    df[col] = pd.to_datetime(df[col], yearfirst=True)

In [7]:
# Sanity check: no item_id may appear in more than one row.
assert df['item_id'].is_unique

In [8]:
# Sanity check: every period must end strictly after it starts.
assert df['date_to'].gt(df['date_from']).all()

In [9]:
# Sanity check: a period never starts before the item's activation date.
assert df['date_from'].ge(df['activation_date']).all()

## Feature generation

In [11]:
# Whole days elapsed between activation_date and date_from.
df['days_to_publish'] = df['date_from'].sub(df['activation_date']).dt.days

In [12]:
# Whole days elapsed between date_from and date_to.
df['days_online'] = df['date_to'].sub(df['date_from']).dt.days

In [43]:
# `is_russian_2017_holiday` lives in the project-local `utils` module; it is
# applied to each timestamp individually because its implementation is not
# visible here (NOTE(review): consider moving this import to the top cell).
from utils import is_russian_2017_holiday

# For every date column add three derived features:
#   <col>_isholiday  - result of is_russian_2017_holiday for that date
#   <col>_dayofweek  - day of the week, Monday=0 ... Sunday=6
#   <col>_dayofyear  - ordinal day within the year, starting at 1
# The calendar features use the vectorized `.dt` accessor rather than a
# per-row lambda; the values are identical (the integer width of the
# resulting columns may differ between pandas versions).
for col in ['activation_date', 'date_from', 'date_to']:
    df['{}_isholiday'.format(col)] = df[col].apply(is_russian_2017_holiday)
    df['{}_dayofweek'.format(col)] = df[col].dt.dayofweek
    df['{}_dayofyear'.format(col)] = df[col].dt.dayofyear

In [44]:
# Show the frame again, now including the generated feature columns.
df

Unnamed: 0,item_id,activation_date,date_from,date_to,days_to_publish,days_online,activation_date_isholiday,date_from_isholiday,date_to_isholiday,activation_date_dayofweek,activation_date_dayofyear,date_from_dayofweek,date_from_dayofyear,date_to_dayofweek,date_to_dayofyear
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16,29,1,0,0,0,1,45,2,74,3,75
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18,27,3,0,0,0,3,47,2,74,5,77
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28,14,13,0,0,0,2,60,2,74,1,87
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28,0,9,0,0,0,6,78,6,78,1,87
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28,1,13,0,0,0,1,73,2,74,1,87
5,47081d25b7cb,2017-02-19,2017-03-15,2017-03-20,24,5,0,0,0,6,50,2,74,0,79
6,5a328b4a8be1,2017-02-20,2017-03-15,2017-03-21,23,6,0,0,0,0,51,2,74,1,80
7,87292b76e4e1,2017-03-17,2017-03-17,2017-03-28,0,11,0,0,0,4,76,4,76,1,87
8,2ccefc8a940b,2017-02-19,2017-03-15,2017-03-21,24,6,0,0,0,6,50,2,74,1,80
9,ded64e61cd85,2017-03-13,2017-03-15,2017-03-28,2,13,0,0,0,0,72,2,74,1,87
