# Imports

In [46]:
import numpy as np
import pandas as pd
from recsys.config import Config

In [47]:
from recsys.algorithms import utils

In [48]:
# Project configuration object; its `data_path` attribute is used below to locate
# the input CSV and the output CSV.
config = Config()

# Load Data

In [49]:
# Load test.csv from the configured data directory. Later cells rely on its
# user_id, session_id, action_type and reference columns.
df_test = pd.read_csv(config.data_path / 'test.csv')

# Get Labels

In [50]:
# Label rows: "clickout item" actions whose reference is present (not null).
labels = df_test[
    df_test['reference'].notnull()
    & (df_test['action_type'] == "clickout item")
]

### Filter clickouts

In [51]:
# Keep only the non-clickout rows; the session features below are built from these.
df_test = df_test[df_test['action_type'].ne("clickout item")]

# Extract Features

In [52]:
# NOTE(review): this repeats the clickout filter from the previous cell verbatim;
# when the cells run in order it leaves df_test unchanged.
df_test = df_test[df_test['action_type'] != "clickout item"]

# Session Features

#### Count

In [53]:
# Number of remaining rows per (user_id, session_id), stored in a 'count' column.
df_count = (
    df_test
    .groupby(['user_id', 'session_id'])
    .size()
    .reset_index(name='count')
)

#### References

In [54]:
# Gather each session's `reference` values into one Python list per (user_id, session_id).
df_grouped = (
    df_test
    .groupby(['user_id', 'session_id'], as_index=False)['reference']
    .apply(list)
)

In [55]:
# Move the group keys out of the index and name the list column 'references'.
# NOTE(review): reset_index(name=...) presumes df_grouped is a Series here —
# confirm for the installed pandas version.
df_references = df_grouped.reset_index(name='references')

In [56]:
def most_frequent(l):
    """Return the most common element of ``l``, or ``-1`` when ``l`` is empty.

    Ties are resolved in favour of the element that occurs first in ``l``, so the
    result is reproducible across runs (iterating a ``set`` of strings is not).
    Occurrences are tallied in a single pass instead of one ``list.count`` scan
    per distinct value.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if len(l) == 0:
        return -1
    # most_common orders equal counts by first appearance.
    return Counter(l).most_common(1)[0][0]
    
def last_element(l):
    """Return the final item of ``l``; an empty ``l`` yields the sentinel ``-1``."""
    if len(l) == 0:
        return -1
    return l[-1]

In [57]:
# Most frequently referenced hotel per session (-1 when the session has none).
# Only purely numeric string references are treated as hotels; isinstance() skips
# NaN and every other non-string value instead of relying on an exact `float` match.
df_references['mostfreq_ref_hotel'] = df_references['references'].apply(
    lambda refs: most_frequent([r for r in refs if isinstance(r, str) and r.isdigit()])
)

In [58]:
# Last hotel referenced in each session (-1 when the session has none).
# Only purely numeric string references are treated as hotels; isinstance() skips
# NaN and every other non-string value instead of relying on an exact `float` match.
df_references['last_ref_hotel'] = df_references['references'].apply(
    lambda refs: last_element([r for r in refs if isinstance(r, str) and r.isdigit()])
)

In [59]:
# How many hotel references each session contains (repeats included).
# Only purely numeric string references are treated as hotels; isinstance() skips
# NaN and every other non-string value instead of relying on an exact `float` match.
df_references['count_ref_hotel'] = df_references['references'].apply(
    lambda refs: sum(1 for r in refs if isinstance(r, str) and r.isdigit())
)

In [60]:
# How many distinct hotels each session references.
# Only purely numeric string references are treated as hotels; isinstance() skips
# NaN and every other non-string value instead of relying on an exact `float` match.
df_references['unique_ref_hotel'] = df_references['references'].apply(
    lambda refs: len({r for r in refs if isinstance(r, str) and r.isdigit()})
)

In [61]:
# The raw per-session lists are no longer needed once the features are derived.
df_references = df_references.drop(columns='references')

#### Actions

In [62]:
# Gather each session's `action_type` values into one Python list per (user_id, session_id).
df_grouped = (
    df_test
    .groupby(['user_id', 'session_id'], as_index=False)['action_type']
    .apply(list)
)

In [63]:
# Move the group keys out of the index and name the list column 'action_types'.
# NOTE(review): reset_index(name=...) presumes df_grouped is a Series here —
# confirm for the installed pandas version.
df_actions = df_grouped.reset_index(name='action_types')

In [64]:
# One-hot encode every action of a session and add the indicators up, giving a
# per-session count for each action type. The sessions are processed in slices
# (split points at row 5, then every 10000 rows) that are concatenated afterwards.
# `.groupby(level=0).sum()` replaces `.sum(level=0)`, which pandas 2.0 removed.
split_actions = [
    pd.get_dummies(chunk.apply(pd.Series).stack()).groupby(level=0).sum()
    for chunk in np.split(
        df_actions['action_types'],
        np.arange(5, len(df_actions['action_types']), 10000),
    )
]

In [65]:
# Stack the per-chunk count tables back into one frame. Action types missing from a
# chunk show up as NaN for that chunk's rows; sort=False leaves the column order unsorted.
df_actions = pd.concat(split_actions, axis=0, sort=False)

In [66]:
# Unchunked form of the same computation, kept for reference only:
# df_actions = pd.get_dummies(df_actions['action_types'].apply(pd.Series).stack()).sum(level=0)

#### Combine

In [67]:
# Column-wise join of the reference features, the row count and the action counts;
# pd.concat(axis=1) lines the rows up by index.
df_session_features = pd.concat([df_references, df_count[['count']], df_actions], axis=1)

In [68]:
# Replace every remaining NaN with 0.0 across all columns.
df_session_features = df_session_features.fillna(0.)

In [72]:
# (rows, columns) of the filtered test frame; the recorded output below is a shape tuple.
df_test.shape

(3253556, 12)

In [71]:
# (rows, columns) of the combined session feature table.
df_session_features.shape

(204957, 15)

# Save CSV

In [70]:
# Write the feature table into the same data directory the input was read from;
# index=False leaves the positional row index out of the file.
df_session_features.to_csv(config.data_path / 'session_features_test.csv', index=False)