In [1]:
import math
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Create and register a new `tqdm` instance with `pandas`
# (can use tqdm_gui, optional kwargs, etc.)
# This is what makes `DataFrame.progress_apply` available; it is used further
# down to show a progress bar during the row-wise re-ranking.
tqdm.pandas()

In [3]:
# Action types that count as an interaction with an item. For these action
# types the `reference` column holds the item ID, so rows of these types can
# be counted per item.
interaction_action_types = {
    'clickout item',
    'interaction item rating',
    'interaction item info',
    'interaction item image',
    'interaction item deals',
    'search for item'
}

In [4]:
# Load the test set indexed by (session_id, step) so a whole session can be
# selected with `df.loc[session_id]`.
# `reference` is read explicitly as text: it is later matched against the item
# IDs split out of the pipe-separated `impressions` string, which are strings.
# Leaving the dtype to inference risks numeric values that never match them.
df = pd.read_csv(
    '../data/test.csv',
    dtype={'reference': str},
).set_index(keys=['session_id', 'step'])

In [5]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,action_type,reference,platform,city,device,current_filters,impressions,prices
session_id,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1d688ec168932,1,004A07DM0IDW,1541555614,interaction item image,2059240,CO,"Santa Marta, Colombia",mobile,,,
1d688ec168932,2,004A07DM0IDW,1541555614,interaction item image,2059240,CO,"Santa Marta, Colombia",mobile,,,
1d688ec168932,3,004A07DM0IDW,1541555696,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
1d688ec168932,4,004A07DM0IDW,1541555707,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
1d688ec168932,5,004A07DM0IDW,1541555717,clickout item,1050068,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
1d688ec168932,6,004A07DM0IDW,1541555792,clickout item,3241426,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
1d688ec168932,7,004A07DM0IDW,1541555799,clickout item,,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
f05ab0de907e2,1,009RGHI3G9A3,1541570935,interaction item info,7065316,IN,"Nathdwara, India",mobile,,,
f05ab0de907e2,2,009RGHI3G9A3,1541570940,clickout item,,IN,"Nathdwara, India",mobile,,10884872|7065316,64|28
26b6d294d66e7,1,00Y1Z24X8084,1541651766,clickout item,3843244,PH,"Iloilo City, Philippines",mobile,,2714480|4476010|3843244|3833012|9017890|198100...,74|14|22|38|55|44|28|34|23|27|12|108|19|21|36|...


In [6]:
def get_submission_target(df):
    """Select the rows that have to be predicted.

    A target row is a ``clickout item`` action whose ``reference`` is missing.
    The returned frame keeps the index and columns of ``df``.
    """

    is_clickout = df["action_type"] == "clickout item"
    reference_missing = df["reference"].isnull()

    return df[reference_missing & is_clickout]


def string_to_array(s):
    """Convert a pipe-separated string to a list of its parts.

    Args:
        s: A string such as ``"1|2|3"``, or a missing value (``nan`` or
            ``None``).

    Returns:
        The list of substrings between ``|`` separators, or an empty list
        when ``s`` is a missing value.

    Raises:
        ValueError: If ``s`` is neither a string nor a missing value.
    """

    if isinstance(s, str):
        return s.split("|")

    # Treat None as a missing value as well.
    if s is None:
        return []

    # `math.isnan` rejects non-numeric input with TypeError; treat such
    # values as "not nan" so they reach the ValueError below.
    try:
        is_nan = math.isnan(s)
    except TypeError:
        is_nan = False

    if is_nan:
        return []

    raise ValueError("Value must be either string or nan")

In [7]:
# Rows to predict: click-outs whose reference is missing.
df_target = get_submission_target(df)
# Preview only the first rows instead of rendering the whole frame.
df_target.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,action_type,reference,platform,city,device,current_filters,impressions,prices
session_id,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1d688ec168932,7,004A07DM0IDW,1541555799,clickout item,,CO,"Santa Marta, Colombia",mobile,,2059240|2033381|1724779|127131|399441|103357|1...,70|46|48|76|65|65|106|66|87|43|52|44|60|61|50|...
f05ab0de907e2,2,009RGHI3G9A3,1541570940,clickout item,,IN,"Nathdwara, India",mobile,,10884872|7065316,64|28
26b6d294d66e7,2,00Y1Z24X8084,1541651823,clickout item,,PH,"Iloilo City, Philippines",mobile,,2714480|4476010|3843244|3833012|9017890|198100...,74|14|22|38|55|44|28|34|23|27|12|108|19|21|36|...
07628a0f5be0b,5,01V3WDTDM5CU,1541575643,clickout item,,PL,"Wisla, Poland",mobile,Sort by Price,3565720|2947584|4115018|2039671|3836538|801409...,16|18|20|21|22|22|28|28|28|30|30|33|33|35|35|3...
4a01c3afbc224,46,02AOAVF9PVYH,1541681278,clickout item,,JP,"Yokohama, Japan",desktop,Hotel|Resort|Sort by Price,1451247|559056|1045096|1963879|693596|1967173|...,80|81|81|82|82|82|83|83|83|85|85|88|88|91|92|9...
89171d441a304,36,0339C84S24ET,1541615683,clickout item,,TR,"Antalya, Turkey",mobile,,13361|5647680|116764|898719|8276346|9168|19325...,185|84|30|19|46|77|123|23|25|25|26|39|73|56|96...
e09591d07cdef,2,0386OH8JDE1Q,1541620536,clickout item,,UK,"John o' Groats, United Kingdom",desktop,,1193320|5488246|3858774|4552034|10620372|22696...,103|88|100|134|109|138|126|86
7663406cf586c,4,03LTH89QY623,1541554183,clickout item,,CA,"Koloa, USA",desktop,,241961|906477|991561|353701|1149665|77258|4943...,287|300|261|197|163|263|262|188|540|283|211|22...
725e8adf70e86,23,03VT0ODUTZB0,1541632490,clickout item,,UK,"Warrington, United Kingdom",desktop,,109938|164193|632366|1362450|1070666|164220|11...,45|67|78|60|58|57|86|68|57|55|148|92|66|55|61|...
73f4c417ff730,176,03XH0JWCWHAM,1541566143,clickout item,,MX,"Puebla, Mexico",mobile,Sort By Popularity,42692|5116230|42876|4342578|42864|3148690|2123...,53|181|60|45|96|60|80|59|41|122|50|43|113|48|5...


In [8]:
# index test dataset by (session_id, step)
# 
# for each action which has to be predicted: apply
#     find all interaction actions before the step of the action to predict
#     compute frequencies of items based on how many interaction actions reference them
#     sort the frequencies in descending order
#     explode the target action's impressions list
#     sort the impressions list by descending frequency, where the default frequency is zero (the sort is stable)
#     combine the sorted impressions list into a string again
# pick only submission columns
# save csv

def sort_array_by_frequencies(array, frequencies):
    """Return the items of ``array`` ordered by descending frequency.

    Args:
        array: Iterable of hashable items to order.
        frequencies: Mapping from item to its frequency (e.g. a ``Counter``
            or a plain ``dict``). Items missing from the mapping count as
            frequency zero.

    Returns:
        A new list. The sort is stable, so items with equal frequency keep
        their relative order from ``array``.
    """
    # `.get(..., 0)` instead of `frequencies[x]` so that a plain dict works
    # as well as a Counter.
    return sorted(array, key=lambda item: frequencies.get(item, 0), reverse=True)

def sort_impressions_by_most_interactions(target_action):
    """Re-rank a target row's impressions by earlier interactions in its session.

    Reads the notebook-level ``df`` (indexed by ``(session_id, step)``) and
    ``interaction_action_types``.

    Args:
        target_action: Row-like object providing ``session_id``, ``step`` and
            ``impressions`` (a pipe-separated string of item IDs).

    Returns:
        The impressions as one space-separated string, items referenced by
        more earlier interaction actions first. Items with equal counts keep
        their original order.
    """
    # All rows of the target's session, with `step` turned back into a column.
    session_rows = df.loc[target_action['session_id'], :].reset_index()

    # Keep only actions that happened before the target and reference something.
    happened_before = session_rows['step'] < target_action['step']
    has_reference = session_rows['reference'].notnull()
    earlier_rows = session_rows[happened_before & has_reference]

    # Of those, keep only the action types that count as item interactions.
    is_interaction = earlier_rows['action_type'].isin(interaction_action_types)
    interaction_rows = earlier_rows[is_interaction]

    impressions = string_to_array(target_action['impressions'])
    interaction_counts = Counter(interaction_rows['reference'])
    ranked_impressions = sort_array_by_frequencies(
        array=impressions,
        frequencies=interaction_counts,
    )

    return ' '.join(ranked_impressions)

In [9]:
# Turn the (session_id, step) index back into columns and keep only the
# columns needed for the re-ranking and the submission.
df_submission = df_target.reset_index().loc[
    :,
    ["user_id", "session_id", "timestamp", "step", "impressions"],
]

In [11]:
# Re-rank the impressions of every target row. `progress_apply` (registered by
# `tqdm.pandas()`) shows a progress bar for the row-wise apply.
df_submission['new_impressions'] = df_submission.progress_apply(sort_impressions_by_most_interactions, axis=1)
# Preview only the first rows instead of rendering the whole frame.
df_submission.head(10)

HBox(children=(IntProgress(value=0, max=253573), HTML(value='')))

Unnamed: 0,user_id,session_id,timestamp,step,impressions,new_impressions
0,004A07DM0IDW,1d688ec168932,1541555799,7,2059240|2033381|1724779|127131|399441|103357|1...,1050068|2059240|3241426|2033381|1724779|127131...
1,009RGHI3G9A3,f05ab0de907e2,1541570940,2,10884872|7065316,7065316|10884872
2,00Y1Z24X8084,26b6d294d66e7,1541651823,2,2714480|4476010|3843244|3833012|9017890|198100...,3843244|2714480|4476010|3833012|9017890|198100...
3,01V3WDTDM5CU,07628a0f5be0b,1541575643,5,3565720|2947584|4115018|2039671|3836538|801409...,2817590|3565720|2947584|4115018|2039671|383653...
4,02AOAVF9PVYH,4a01c3afbc224,1541681278,46,1451247|559056|1045096|1963879|693596|1967173|...,7304020|1177554|1451247|559056|1045096|1963879...
5,0339C84S24ET,89171d441a304,1541615683,36,13361|5647680|116764|898719|8276346|9168|19325...,37717|4957186|13361|5647680|116764|898719|8276...
6,0386OH8JDE1Q,e09591d07cdef,1541620536,2,1193320|5488246|3858774|4552034|10620372|22696...,5488246|1193320|3858774|4552034|10620372|22696...
7,03LTH89QY623,7663406cf586c,1541554183,4,241961|906477|991561|353701|1149665|77258|4943...,59566|241961|906477|991561|353701|1149665|7725...
8,03VT0ODUTZB0,725e8adf70e86,1541632490,23,109938|164193|632366|1362450|1070666|164220|11...,1070666|322491|109938|164193|632366|1362450|16...
9,03XH0JWCWHAM,73f4c417ff730,1541566143,176,42692|5116230|42876|4342578|42864|3148690|2123...,42692|6298428|42858|3148690|42618|6452022|5762...


In [12]:
# Write the submission file: drop the original impressions and publish the
# re-ranked list under the column name `item_recommendations`.
(
    df_submission
    .drop(columns=['impressions'])
    .rename(columns={'new_impressions': 'item_recommendations'})
    .to_csv('../data/submission_most_interactions_2.csv', index=False)
)

# Hypothesis: Users interact most with the items which they afterwards click out

Thus, **interaction** with items correlates with them being clicked out. We can measure **interaction** in two ways:
1. The number of actions which reference an item.
2. The time duration spent interacting with an item.

## Action types with item interaction - the reference value is the item ID

- `clickout item`: user makes a click-out on the item and gets forwarded to a partner website. Other items that were displayed to the user and their associated prices are listed under the ‘impressions’ and ‘prices’ column for this action.
- `interaction item rating`: user interacts with a rating or review of an item.
- `interaction item info`: user interacts with item information.
- `interaction item image`: user interacts with an image of an item.
- `interaction item deals`: user clicks on the view more deals button.
- `search for item`: user searches for an accommodation.