In [None]:
##### PRELIMINARY FEATURE ENGINEERING AND SELECTION #####

In [None]:
### import libraries
import pandas as pd
import numpy as np
from datetime import datetime,date

In [None]:
# record the wall-clock start time; later cells subtract it from datetime.now()
# to report elapsed time
start_time = datetime.now()
print('Start time: ', start_time)

In [None]:
##### LOAD DATA
# progress message marking the start of the loading stage
print('Loading data...')

In [None]:
# read the merged session-level table (tab-separated, latin-1 encoded);
# the two timestamp columns are parsed to datetimes so the .dt accessor
# and timestamp arithmetic can be used on them later
df = pd.read_csv(
    '../data/processed_data/session_level_data_merged.tsv',
    sep='\t',
    low_memory=False,
    encoding='iso-8859-1',
    parse_dates=['hit_time_gmt', 'last_hit_time_gmt_visit'],
)

In [None]:
# progress log: time elapsed since start_time was recorded
print('Time passed since start: ', datetime.now() - start_time)

In [None]:
### ENCODE TARGET
# progress message marking the start of the target-encoding stage
print('Encoding target...')

In [None]:
# binary encode target to be either 1 or 0: 1 where purchase >= 1, else 0
# (vectorised comparison instead of a per-row Python lambda; missing values
# compare False and therefore still map to 0, exactly as before)
df['purchase'] = (df['purchase'] >= 1).astype('int64')

In [None]:
# progress log: time elapsed since start_time was recorded
print('Time passed since start: ', datetime.now() - start_time)

In [None]:
### ADD FEATURES
# progress message marking the start of the feature-engineering stage
print('Adding features...')

In [None]:
# add flag to indicate bounce: 1 where visit_page_num equals 1, else 0
# (vectorised equality test instead of a per-row Python lambda; missing
# values compare False and therefore map to 0, as before)
df['bounce'] = df['visit_page_num'].eq(1).astype('int64')

In [None]:
### add time features
# sort dataframe by visitor_id, visit_num, hit_time_gmt and last_hit_time_gmt_visit
# so that each visitor's rows are contiguous and in visit order; the shift(1)
# lag columns below depend on this ordering
df = df.sort_values(['visitor_id', 'visit_num', 'hit_time_gmt', 'last_hit_time_gmt_visit'], ascending=[True, True, True, True])

# day of week (pandas convention: Monday=0 ... Sunday=6)
df['day_of_week'] = df['hit_time_gmt'].dt.dayofweek

# hour of day
df['hour_of_day'] = df['hit_time_gmt'].dt.hour

# calculate visit duration in seconds
# total_seconds() is used rather than Timedelta.seconds: .seconds only returns
# the within-day component (0..86399), which silently drops whole days and
# wraps negative durations. The resulting column is float.
df['visit_duration_seconds'] = (df['last_hit_time_gmt_visit'] - df['hit_time_gmt']).dt.total_seconds()

# add lag columns for visitor_id and last_hit_time_gmt_visit
# (value from the previous row in the sorted frame)
df['visitor_id_lag'] = df['visitor_id'].shift(1)
df['last_hit_time_gmt_visit_lag'] = df['last_hit_time_gmt_visit'].shift(1)

# calculate days since last visit: whole days between the end of the previous
# row's visit and the start of this one, kept only where the previous row
# belongs to the same visitor; otherwise NaN. Vectorised, so it also works
# when no row has a predecessor (the row-wise version raised in that case).
df['days_since_last_visit'] = (
    (df['hit_time_gmt'] - df['last_hit_time_gmt_visit_lag'])
    .where(df['visitor_id'] == df['visitor_id_lag'])
    .dt.days
)

# TODO: days since last purchase (not implemented yet)

In [None]:
### buckets for past purchases, past visits, product views, page views
# TODO: only the 7-day recency flag below exists so far; the other buckets
# named in the heading are not implemented in this cell
# flag is 1 where days_since_last_visit lies in [1, 7], else 0; missing values
# fall outside the range and map to 0
# NOTE(review): a gap of 0 days is flagged 0 - confirm that this is intended
df['visit_in_last_7_days'] = df['days_since_last_visit'].between(1, 7).astype('int64')

In [None]:
# progress log: time elapsed since start_time was recorded
print('Time passed since start: ', datetime.now() - start_time)

In [None]:
### ENCODE CATEGORICAL FEATURES
# progress message marking the start of the categorical-encoding stage
print('Encoding categorical features...')

In [None]:
### encode categorical features
# discard the visitor identifier, the lag helper columns, the visit-end
# timestamp and the raw day gap before encoding
df = df.drop(columns=['visitor_id',
                      'visitor_id_lag',
                      'last_hit_time_gmt_visit',
                      'last_hit_time_gmt_visit_lag',
                      'days_since_last_visit'])

# one-hot encode every object-dtype column, omitting the first level of each
object_cols = df.select_dtypes(include=['object']).columns.tolist()
dummies = pd.get_dummies(df[object_cols], drop_first=True)

# replace the raw object columns with their dummy columns
df = pd.concat([df.drop(columns=object_cols), dummies], axis=1)

In [None]:
# progress log: time elapsed since start_time was recorded
print('Time passed since start: ', datetime.now() - start_time)

In [None]:
### WRITE DATA TO FILE

In [None]:
# persist the engineered frame as a tab-separated, latin-1 encoded file
# without the index column
df.to_csv(
    '../data/processed_data/session_level_data_final.tsv',
    sep='\t',
    encoding='iso-8859-1',
    index=False,
)

In [None]:
# final log: total time elapsed since start_time was recorded
print('Total execution time: ', datetime.now() - start_time)