# Periods Data Feature Extraction

This module helps extract features from "periods_train.csv" or "periods_test.csv" (such as is_holiday, wday (weekday), etc.) for the final training of the model, given an item_id.

## Note: 
This is a practice notebook. For the result

In [15]:
import pandas as pd
import sqlite3
from sqlalchemy.engine import create_engine
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing
from time import time
from utils import featurize_date_col, is_russian_2017_holiday

# Turn off pandas' chained-assignment check for the whole notebook.
pd.set_option('mode.chained_assignment', None)
%matplotlib inline

In [2]:
# Read both period tables directly from CSV (the reads alone take ~30 s).
# An alternative that was tried: store the frames in SQLite
# (`to_sql` into data/avito.db) and load them back with
# `pd.read_sql(..., index_col='item_id')`; that route took ~120 s.
start_time = time()
train = pd.read_csv('data/periods_train.csv')
test = pd.read_csv('data/periods_test.csv')

elapsed_seconds = time() - start_time
print('it took {} seconds'.format(elapsed_seconds))

it took 29.799591779708862 seconds


In [40]:
# Persist rows 1,000,000-1,499,999 (positional) of the training periods as a smaller working file.
train.iloc[1000000:1500000].to_csv('data/periods_train_500000.csv')

In [3]:
# Preview the first five rows of the raw training periods.
train.head(n=5)

Unnamed: 0,item_id,activation_date,date_from,date_to
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28


In [37]:
# Work on an independent copy of rows 1,000,000-1,499,999 (positional).
# Later cells assign into `df`; taking an explicit copy makes those writes
# well-defined and guarantees they cannot write through to `train`.
df = train.iloc[1000000:1500000].copy()

In [38]:
# Rows with a missing activation_date fall back to their date_from value.
null_idx = df['activation_date'].isnull()
# Assign through a single .loc call on the frame: the chained form
# `df[col].loc[mask] = ...` writes into an intermediate Series and is not
# guaranteed to update `df` itself.
df.loc[null_idx, 'activation_date'] = df.loc[null_idx, 'date_from']

In [39]:
# Number of distinct item_id values in the working slice (missing values, if any, count as one).
df['item_id'].nunique(dropna=False)

497384

In [7]:
# Rows whose item_id has already appeared earlier in `df` (second and later occurrences).
temp = df[df.duplicated(subset='item_id')]

In [8]:
# Every row (first occurrence included) of the items that appear more than once.
dft = df.loc[df['item_id'].isin(temp['item_id'])]

In [9]:
# item_id values of the multi-period rows, kept for spot-checking aggregates later.
observe_ids = dft.loc[:, 'item_id']

## Initial preparation and sanity checks

In [10]:
# Convert the three date columns from 'YYYY-MM-DD' strings to datetimes.
for col in ('activation_date', 'date_from', 'date_to'):
    df[col] = df[col].pipe(pd.to_datetime, format='%Y-%m-%d')

In [11]:
# Sanity check: no period ends before it starts.
assert df['date_to'].ge(df['date_from']).all()

In [12]:
# Sanity check: no period starts before the item's activation date.
assert df['date_from'].ge(df['activation_date']).all()

## Feature generation

In [13]:
# Whole days between activation_date and date_from.
df['days_to_publish'] = df['date_from'].sub(df['activation_date']).dt.days

In [14]:
# Whole days between date_from and date_to.
df['days_online'] = df['date_to'].sub(df['date_from']).dt.days

In [16]:
# Run the project helper over each date column, keeping the source columns
# (remove_when_done=False). Judging by the df.head() output below, it adds
# <col>_isholiday, <col>_wday and <col>_yday -- see utils.featurize_date_col.
for col in ('activation_date', 'date_from', 'date_to'):
    df = featurize_date_col(df, col, remove_when_done=False)

In [17]:
# Preview the first five rows with the derived per-period features.
df.head(n=5)

Unnamed: 0,item_id,activation_date,date_from,date_to,days_to_publish,days_online,activation_date_isholiday,activation_date_wday,activation_date_yday,date_from_isholiday,date_from_wday,date_from_yday,date_to_isholiday,date_to_wday,date_to_yday
0,8f5caef7afb0,2017-02-14,2017-03-15,2017-03-16,29,1,0,1,45,0,2,74,0,3,75
1,66218ff526d1,2017-02-16,2017-03-15,2017-03-18,27,3,0,3,47,0,2,74,0,5,77
2,b237d9539b21,2017-03-01,2017-03-15,2017-03-28,14,13,0,2,60,0,2,74,0,1,87
3,80bf58082ad3,2017-03-19,2017-03-19,2017-03-28,0,9,0,6,78,0,6,78,0,1,87
4,67a9944a7373,2017-03-14,2017-03-15,2017-03-28,1,13,0,1,73,0,2,74,0,1,87


In [18]:
# One group per item; every aggregate below is computed per item_id.
grouped = df.groupby(by='item_id')

In [19]:
# Start the per-item feature table with the number of period rows per item.
base = grouped.size().to_frame(name='nlisted')

In [32]:
# Per-item aggregates of the period-level features.
# Single-bracket selection yields one Series per aggregate, which aligns on
# the item_id index when assigned as a column of `base`.
base['sum_days_online'] = grouped['days_online'].sum()
base['mean_days_online'] = grouped['days_online'].mean()
# NOTE(review): `last` follows the row order of `df`; no sort by date is
# applied in the visible cells -- confirm this is the intended "latest" period.
base['last_days_online'] = grouped['days_online'].last()
# Column name corrected from the misspelled 'sum_days_to_publishe' so that it
# matches 'mean_days_to_publish'.
base['sum_days_to_publish'] = grouped['days_to_publish'].sum()
base['mean_days_to_publish'] = grouped['days_to_publish'].mean()
base['median_date_to_isholiday'] = grouped['date_to_isholiday'].median()
base['median_date_to_wday'] = grouped['date_to_wday'].median()
base['median_date_to_yday'] = grouped['date_to_yday'].median()

# Earliest start and latest end over all of an item's periods. The helper is
# called with remove_when_done=True, so only its derived columns are expected
# to remain (the sample output shows start_date_*/end_date_* columns).
base['start_date'] = grouped['date_from'].min()
base['end_date'] = grouped['date_to'].max()
for col in ['start_date', 'end_date']:
    base = featurize_date_col(base, col, remove_when_done=True)

In [33]:
# Spot-check the aggregates for items that have more than one period.
base.loc[observe_ids].head(n=5)

Unnamed: 0_level_0,nlisted,sum_days_online,mean_days_online,start_date_isholiday,start_date_wday,start_date_yday,sum_days_to_publishe,mean_days_to_publish,end_date_isholiday,end_date_wday,end_date_yday,last_days_online,mean_date_to_isholiday,mean_date_to_wday,mean_date_to_yday
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8f5caef7afb0,2,12,6.0,0,2,74,29,14.5,0,1,87,11,0,2.0,81.0
b14eb046fefb,2,12,6.0,0,2,74,30,15.0,0,1,87,12,0,1.5,80.5
6da1b9c127e4,2,11,5.5,0,2,74,23,11.5,0,1,87,4,0,1.5,84.0
65461756475a,2,11,5.5,0,2,74,30,15.0,0,1,87,11,0,1.5,80.5
96319c6b00da,2,11,5.5,0,2,74,28,14.0,0,1,87,9,0,2.5,81.5
