# Imports

In [1]:
import numpy as np
import pandas as pd
from recsys.config import Config

In [2]:
from recsys.algorithms import utils

In [3]:
# Project configuration object; the cells below read ``config.data_path``
# from it to locate the input and output CSV files.
config = Config()

# Load Data

In [4]:
# Raw interaction log (one row per session step) and the item metadata table,
# both read from the configured data directory.
df_train = pd.read_csv(filepath_or_buffer=config.data_path / 'train.csv')
df_item_meta = pd.read_csv(filepath_or_buffer=config.data_path / 'item_metadata.csv')

# Item Features

In [6]:
# Replace every raw ``properties`` value with the result of
# utils.string_to_array (presumably parses the serialized string into a
# list-like of property names -- confirm against recsys.algorithms.utils).
df_item_meta['properties'] = df_item_meta['properties'].apply(utils.string_to_array)

In [7]:
# One-hot encode the item properties: spread each row's property collection
# over columns, stack it into one long Series indexed by (row, position),
# build indicator columns, then add the indicators back up per original row.
# ``groupby(level=0).sum()`` replaces ``.sum(level=0)``, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; it yields the same per-row totals.
df = pd.get_dummies(df_item_meta['properties'].apply(pd.Series).stack()).groupby(level=0).sum()

In [8]:
# Attach the one-hot property columns to the item metadata; pd.concat with
# axis=1 lines the two frames up on their index.
df_items = pd.concat([df_item_meta, df], axis=1)

In [14]:
# The raw ``properties`` column is no longer needed once it has been expanded
# into indicator columns.
df_items = df_items.drop(columns='properties')

# Session Features

#### Count

In [15]:
# Number of rows per (user_id, session_id) pair, returned as a flat frame
# with the group keys as columns and the size in a ``count`` column.
df_count = (
    df_train
    .groupby(['user_id', 'session_id'])
    .size()
    .reset_index(name='count')
)

#### References

In [16]:
# Collect the ``reference`` values of every (user_id, session_id) group into
# one Python list per group.
# ``as_index=False`` is deliberately not passed: with it, recent pandas
# versions return a DataFrame from ``apply`` here, and the
# ``reset_index(name=...)`` call in the next cell is only valid on a Series.
# With the default ``as_index=True`` the result is a Series indexed by the
# group keys on every pandas version.
df_grouped = df_train.groupby(['user_id', 'session_id'])['reference'].apply(list)

In [17]:
# Turn the grouped result into a flat frame: the index levels become columns
# and the values land in a column named ``references``.
# NOTE(review): ``reset_index(name=...)`` is a Series method, so this assumes
# ``df_grouped`` is a Series indexed by the group keys -- confirm.
df_references = df_grouped.reset_index(name='references')

In [18]:
def most_frequent(l):
    """Return the element that occurs most often in ``l``.

    When several elements share the highest count, the one that appears
    first in ``l`` is returned, so the result is deterministic across runs.

    Args:
        l: A sized iterable of hashable elements.

    Returns:
        The most frequent element, or ``-1`` when ``l`` is empty.
    """
    # Local import keeps this cell independent of the notebook's import cell.
    from collections import Counter

    if len(l) == 0:
        return -1

    # Counting once is O(n); the previous ``max(set(l), key=l.count)`` rescanned
    # the list for every distinct element and broke ties by set iteration order.
    counts = Counter(l)
    # Counter keeps first-insertion order and max() returns the first maximum,
    # which gives the "first seen wins" tie-break.
    return max(counts, key=counts.get)
    
def last_element(l):
    """Return the final item of ``l``, or ``-1`` if ``l`` has no items."""
    # Guard clause: an empty sequence has no last item, so use the sentinel.
    if len(l) == 0:
        return -1
    return l[-1]

In [19]:
# Most frequent all-digit reference per session (presumably hotel/item ids);
# most_frequent returns -1 when a session has none.
# Non-string entries (e.g. NaN produced by an empty CSV field) are skipped
# instead of raising AttributeError on ``.isdigit()``.
df_references['mostfreq_ref_hotel'] = df_references['references'].apply(
    lambda refs: most_frequent([r for r in refs if isinstance(r, str) and r.isdigit()])
)

In [20]:
# Last all-digit reference per session (presumably hotel/item ids);
# last_element returns -1 when a session has none.
# Non-string entries (e.g. NaN produced by an empty CSV field) are skipped
# instead of raising AttributeError on ``.isdigit()``.
df_references['last_ref_hotel'] = df_references['references'].apply(
    lambda refs: last_element([r for r in refs if isinstance(r, str) and r.isdigit()])
)

In [21]:
# Number of all-digit references per session (presumably hotel/item ids),
# counting repeats.
# Non-string entries (e.g. NaN produced by an empty CSV field) are skipped
# instead of raising AttributeError on ``.isdigit()``.
df_references['count_ref_hotel'] = df_references['references'].apply(
    lambda refs: sum(1 for r in refs if isinstance(r, str) and r.isdigit())
)

In [22]:
# Number of distinct all-digit references per session (presumably hotel/item
# ids).
# Non-string entries (e.g. NaN produced by an empty CSV field) are skipped
# instead of raising AttributeError on ``.isdigit()``.
df_references['unique_ref_hotel'] = df_references['references'].apply(
    lambda refs: len({r for r in refs if isinstance(r, str) and r.isdigit()})
)

In [None]:
# Discard the raw per-session reference lists; only the features derived
# from them are kept.
df_references = df_references.drop(columns='references')

#### Actions

In [23]:
# Collect the ``action_type`` values of every (user_id, session_id) group
# into one Python list per group.
# ``as_index=False`` is deliberately not passed: with it, recent pandas
# versions return a DataFrame from ``apply`` here, and the
# ``reset_index(name=...)`` call in the next cell is only valid on a Series.
# With the default ``as_index=True`` the result is a Series indexed by the
# group keys on every pandas version.
df_grouped = df_train.groupby(['user_id', 'session_id'])['action_type'].apply(list)

In [24]:
# Turn the grouped result into a flat frame: the index levels become columns
# and the values land in a column named ``action_types``.
# NOTE(review): ``reset_index(name=...)`` is a Series method, so this assumes
# ``df_grouped`` is a Series indexed by the group keys -- confirm.
df_actions = df_grouped.reset_index(name='action_types')

In [25]:
# One-hot encode the per-session action lists chunk by chunk and sum the
# indicators per session, giving one count column per action type that
# occurs in the chunk. The chunk boundaries match the original
# ``np.split(..., np.arange(5, n, 10000))`` call: rows [0, 5), then slices of
# 10000 rows, with the last slice ending at n.
#
# Two compatibility fixes relative to the earlier version:
# * positional ``iloc`` slices replace ``np.split`` on a Series, which goes
#   through the deprecated ``Series.swapaxes``;
# * ``groupby(level=0).sum()`` replaces ``.sum(level=0)``, which was
#   deprecated in pandas 1.3 and removed in pandas 2.0.
split_actions = [
    pd.get_dummies(
        df_actions['action_types'].iloc[start:stop].apply(pd.Series).stack()
    ).groupby(level=0).sum()
    for start, stop in zip(
        [0, *range(5, len(df_actions), 10000)],
        [*range(5, len(df_actions), 10000), len(df_actions)],
    )
]

In [26]:
# Stack the per-chunk frames back into one frame (row-wise). Columns missing
# from a chunk become NaN there; sort=False leaves the column order unsorted.
df_actions = pd.concat(split_actions, axis='index', sort=False)

In [27]:
# Disabled single-pass variant of the one-hot encoding above (no chunking);
# kept for reference only.
#df_actions = pd.get_dummies(df_actions['action_types'].apply(pd.Series).stack()).sum(level=0)

#### Combine

In [42]:
# Put the reference features, the per-session row count and the action-type
# counts side by side; pd.concat with axis=1 aligns the frames on their index.
df_session_features = pd.concat([df_references, df_count[['count']], df_actions], axis=1)

In [46]:
# Replace every missing value in the combined frame with 0.0.
df_session_features = df_session_features.fillna(value=0.0)

# Save CSV

In [51]:
# Write the item feature table to the data directory, without the index column.
df_items.to_csv(path_or_buf=config.data_path / 'item_features.csv', index=False)

In [52]:
# Write the session feature table to the data directory, without the index column.
df_session_features.to_csv(path_or_buf=config.data_path / 'session_features.csv', index=False)