In [1]:
import pandas as pd
import numpy as np
import swifter
import gzip
import json
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from functools import partial, reduce
from src.models import ndcg_score
from collections import Counter
from datetime import datetime



In [2]:
path = '../'
train_filename = 'data/interim/train_dataset_features.parquet'
cols_load_train = ['item_bought', 'domain_id_item_bought']
cols_feat_domain = []

cols_item = [f'{i}_viewed_item_{j}' for i in ['most','last'] for j in range(1,3)]
cols_load_train.extend(cols_item)

cols_item_domain = [f'domain_id_{c}' for c in cols_item]
cols_load_train.extend(cols_item_domain)
cols_feat_domain.extend(cols_item_domain)

cols_domain = [f'most_viewed_domain_{i}' for i in range(1,3)]
cols_load_train.extend(cols_domain)
cols_feat_domain.extend(cols_domain)

cols_ngram_domain = [f'domain_id_most_searched_ngram_{i}' for i in range(1,3)]
cols_load_train.extend(cols_ngram_domain)
cols_feat_domain.extend(cols_ngram_domain)

cols_searched_domain = [f'domain_id_last_searched_{i}' for i in range(1,3)]
cols_load_train.extend(cols_searched_domain)
cols_feat_domain.extend(cols_searched_domain)

In [3]:
cols_load_train

['item_bought',
 'domain_id_item_bought',
 'most_viewed_item_1',
 'most_viewed_item_2',
 'last_viewed_item_1',
 'last_viewed_item_2',
 'domain_id_most_viewed_item_1',
 'domain_id_most_viewed_item_2',
 'domain_id_last_viewed_item_1',
 'domain_id_last_viewed_item_2',
 'most_viewed_domain_1',
 'most_viewed_domain_2',
 'domain_id_most_searched_ngram_1',
 'domain_id_most_searched_ngram_2',
 'domain_id_last_searched_1',
 'domain_id_last_searched_2']

In [5]:
df_train = pd.read_parquet(os.path.join(path, train_filename), columns=cols_load_train)

In [6]:
df_train

Unnamed: 0,item_bought,domain_id_item_bought,most_viewed_item_1,most_viewed_item_2,last_viewed_item_1,last_viewed_item_2,domain_id_most_viewed_item_1,domain_id_most_viewed_item_2,domain_id_last_viewed_item_1,domain_id_last_viewed_item_2,most_viewed_domain_1,most_viewed_domain_2,domain_id_most_searched_ngram_1,domain_id_most_searched_ngram_2,domain_id_last_searched_1,domain_id_last_searched_2
0,394965,MLB-PRINTER_INKS,626904.0,1833895.0,1.0,1.0,MLB-NETWORK_PLUGS,MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,MLB-BOOTS_AND_BOOTIES,MLB-BOOTS_AND_BOOTIES,MLB-TOOLS,MLB-HATS_AND_CAPS,,,,
1,331081,MLM-BAR_CODE_SCANNERS,331081.0,768359.0,768359.0,11.0,MLM-BAR_CODE_SCANNERS,MLM-BAR_CODE_SCANNERS,MLM-BAR_CODE_SCANNERS,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,MLM-BAR_CODE_SCANNERS,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,,,,
2,1434557,MLB-WRENCH_SETS,1100593.0,74407.0,1021802.0,13.0,MLB-PLANTS,MLB-SPEAKERS,MLB-SPEAKERS,MLB-SPEAKERS,MLB-SPEAKERS,MLB-TV_STORAGE_UNITS,,,,
3,492271,MLM-CELLPHONE_SCREENS,15.0,106702.0,15.0,15.0,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-CAR_LIGHT_BULBS,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-CAR_LIGHT_BULBS,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,,,,
4,1540965,MLB-MUGS,860111.0,33.0,194780.0,33.0,MLB-BALL_PIT_BALLS,MLB-INFLATABLE_BALL_PITS,MLB-KIDS_TENTS,MLB-INFLATABLE_BALL_PITS,MLB-BALL_PIT_BALLS,MLB-INFLATABLE_BALL_PITS,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413158,2101186,MLB-INDUSTRIAL_AND_COMMERCIAL_EQUIPMENT,,,,,,,,,,,MLB-HOES,MLB-GLAZED_DISPLAY_CABINETS,,
413159,2101232,MLB-UNIVERSAL_REMOTE_CONTROLS,,,,,,,,,,,MLB-WIRELESS_CHARGERS,MLB-WIRELESS_CHARGERS,,
413160,2101773,MLB-FOOTBALL_SHIRTS,,,,,,,,,,,MLM-TV_SMPS,MLB-T_SHIRTS,,
413161,2102136,MLB-OPERATING_SYSTEMS,,,,,,,,,,,MLM-LASER_POINTERS,MLB-ANVILS,,


In [7]:
item_filename = 'data/interim/item_data.parquet'
cols_load_test = ['item_id', 'domain_id', 'domain_code']
df_item = pd.read_parquet(os.path.join(path, item_filename), columns=cols_load_test)

In [8]:
df_item

Unnamed: 0,item_id,domain_id,domain_code
0,111260,MLM-INDIVIDUAL_HOUSES_FOR_SALE,6105.0
1,871377,MLM-VIDEO_GAMES,7752.0
2,490232,MLM-SKIRTS,7273.0
3,1150706,MLM-GRAPHICS_CARDS,5890.0
4,934912,MLM-NOTEBOOKS,6705.0
...,...,...,...
2102272,1099649,MLB-BABY_STROLLERS,445.0
2102273,1482187,MLB-KITCHEN_SUPPLIES,2312.0
2102274,1118904,MLB-SOCKS,3526.0
2102275,237229,MLB-DISPOSABLE_BABY_DIAPERS,1290.0


# Feature

## Top most bought items for each domain

In [8]:
(df_train[['domain_id_item_bought','item_bought']]
 .reset_index()
 .groupby(by=['domain_id_item_bought','item_bought'])
 .count()
 .sort_values(by=['domain_id_item_bought','index'], ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,index
domain_id_item_bought,item_bought,Unnamed: 2_level_1
MLM-YOGA_MATS,854504,8
MLM-YOGA_MATS,1895261,5
MLM-YOGA_MATS,1174364,4
MLM-YOGA_MATS,778139,1
MLM-XENON_KITS,316218,5
...,...,...
MLB-ABDOMINAL_TONING_BELTS,583001,1
MLB-ABDOMINAL_TONING_BELTS,1243213,1
MLB-ABDOMINAL_TONING_BELTS,1414850,1
MLB-3D_PRINTERS,588264,1


## Top 30 domains with the most purchases

In [9]:
(df_train[['domain_id_item_bought','item_bought']]
.groupby(by='domain_id_item_bought')
.count()
.sort_values(by='item_bought', ascending=False)).head(30)

Unnamed: 0_level_0,item_bought
domain_id_item_bought,Unnamed: 1_level_1
MLB-CELLPHONES,25070
MLB-SNEAKERS,14608
MLB-SUPPLEMENTS,9562
MLB-HEADPHONES,9053
MLB-SMARTWATCHES,7963
MLB-DOLLS,4915
MLB-VIDEO_GAMES,4637
MLM-HEADPHONES,4603
MLB-BOOTS_AND_BOOTIES,4325
MLB-STREAMING_MEDIA_DEVICES,4104


# Model

## First heuristic

### Introduction

Based on the domains from the most viewed and last viewed itens, return a mixed suggestion list.

Ex:


In [8]:
df_train.loc[0, cols_feat_domain]

domain_id_most_viewed_item_1                            MLB-NETWORK_PLUGS
domain_id_most_viewed_item_2       MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS
domain_id_last_viewed_item_1                        MLB-BOOTS_AND_BOOTIES
domain_id_last_viewed_item_2                        MLB-BOOTS_AND_BOOTIES
most_viewed_domain_1                                            MLB-TOOLS
most_viewed_domain_2                                    MLB-HATS_AND_CAPS
domain_id_most_searched_ngram_1                                      None
domain_id_most_searched_ngram_2                                      None
domain_id_last_searched_1                                            None
domain_id_last_searched_2                                            None
Name: 0, dtype: object

In this case the suggested items will be divided among MLB-BOOTS_AND_BOOTIES, MLB-NETWORK_PLUGS and MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS. So three items from each domain +1 from one of them, to make up 10 items total.

The first item to be recommended from each domain will come from its respective item feature (`last_viewed`, `most_viewed` or `most_searched`, if available).

In [11]:
df_train.loc[0, 'last_viewed_item_1']

1.0

In this example, the `last_viewed_1` was `item_id` number 1 and that would be out recommended item.

Now we still need to pick two more items from this domain. We'll pick the two most bought from the given domain (MLB-BOOTS_AND_BOOTIES in this example).

We also need to be careful with the order of the recommendations.

The recomendation order will be determined by the amount of sold items in each domain. For instance, we've seen above that MLB-CELLPHONES, MLB-SNEAKERS, MLB-SUPPLEMENTS and MLB-HEADPHONES are the top 4 domains in terms of sold items. In an event which these domains appear in the `domain_id_last_viewed_x` and `domain_id_most_viewed_y` features, in any order, we'd pick the items from MLB-CELLPHONES first, MLB-SNEAKERS next, MLB-SUPPLEMENTS and then finally MLB-HEADPHONES, when composing our recommendations.

We'll also default to recommending the top bought items overall if all else fails.

Another important point is to avoid repeating item recommendations since duplicate items are discarded during scoring.

### Implementation

In [8]:
df_train_split, df_test_split = train_test_split(df_train, random_state=7)

In [9]:
df_most_bought = (df_train_split[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

In [11]:
df_most_bought.join(df_most_bought
 .reset_index()[['domain_id_item_bought','index']]
 .groupby(by='domain_id_item_bought')
 .sum()
 .sort_values(by='index', ascending=False),
                   how='left', rsuffix='_sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,index,index_sum
domain_id_item_bought,item_bought,Unnamed: 2_level_1,Unnamed: 3_level_1
MLM-YOGA_MATS,854504,6,14
MLM-YOGA_MATS,1895261,4,14
MLM-YOGA_MATS,1174364,3,14
MLM-YOGA_MATS,778139,1,14
MLM-XENON_KITS,316218,5,15
...,...,...,...
MLB-ABDOMINAL_TONING_BELTS,1717031,2,85
MLB-ABDOMINAL_TONING_BELTS,71856,1,85
MLB-ABDOMINAL_TONING_BELTS,1243213,1,85
MLB-ABDOMINAL_TONING_BELTS,1414850,1,85


In [15]:
cols = ['MLM-YOGA_MATS','MLM-XENON_KITS']
[i for items in (zip(*[df_most_bought.loc[c].head(10).index.values for c in cols])) for i in items]

[854504, 316218, 1895261, 1766808, 1174364, 522546, 778139, 861906]

In [16]:
available_domains = (df_most_bought
                     .reset_index()
                     ['domain_id_item_bought']
                     .unique())
most_bought_items = [i[0] 
                     for i in 
                     (df_most_bought
                      .sort_values(by='index', ascending=False)
                      .head(10).values)]

df_most_bought = (df_train_split[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))
# Add information about the number of items bought per domain
df_most_bought = df_most_bought.join(df_most_bought
                                     .reset_index()[['domain_id_item_bought','index']]
                                     .groupby(by='domain_id_item_bought')
                                     .sum()
                                     .sort_values(by='index', ascending=False),
                                     how='left', rsuffix='_sum')

In [10]:
# No ordering on the domains/items
def predict_simple(row:pd.Series, cols_domain:list,
                cols_item:list, df_most_bought:pd.DataFrame,
                available_domains:list,
                most_bought_items:list)->list:
    """
    It's important that `cols_item` and `cols_domain` are aligned.
    i.e. if the first elem from `cols_item` is `last_viewed_1`
    then the first elem from `cols_domain` should be `domain_id_last_viewed_1`.
    """
    valid_domains = [d for d in row[cols_domain].unique()
                     if d in available_domains]
    
    pred_list = list(set(reduce(lambda x, y: x + [row[y]]
                                if not np.isnan(row[y]) else x,
                                cols_item, [])))
    
    # Interleave top 10 items from each viewed/searched domain
    # and then flatten. We use this in order to recommend the
    # top items from the viewed/searched domains.
    top_items = [i 
                 for items in 
                 zip(*[df_most_bought.loc[c]
                       .head(10).index.values
                       for c in valid_domains])
                 for i in items]
    num_missing_items = 10 - len(pred_list)
    pred_list.extend(top_items[:num_missing_items])
    
    # In case we have not reached 10 items in our recomendation
    # list, we just return the top bought items overall.
    num_missing_items = 10 - len(pred_list)
    pred_list.extend(most_bought_items[:num_missing_items])
    
    pred_list = [int(x) for x in pred_list]
    
    return pred_list

In [6]:
# No ordering on the domains/items
# With voting
def predict_vote(row:pd.Series, cols_domain:list,
                cols_item:list, df_most_bought:pd.DataFrame,
                available_domains:list,
                most_bought_items:list)->list:
    """
    It's important that `cols_item` and `cols_domain` are aligned.
    i.e. if the first elem from `cols_item` is `last_viewed_1`
    then the first elem from `cols_domain` should be `domain_id_last_viewed_1`.
    """
    valid_domains = [d for d in row[cols_domain]
                     if d in available_domains]
    try:
        top_domain = Counter(valid_domains).most_common(1)[0][0]
    except IndexError as e:
        top_domain = 'MLB-CELLPHONES'
    
    pred_list = list(set(reduce(lambda x, y: x + [row[y]]
                                if not np.isnan(row[y]) else x,
                                cols_item, [])))
    
    # Interleave top 10 items from each viewed/searched domain
    # and then flatten. We use this in order to recommend the
    # top items from the viewed/searched domains.
    top_items = (df_most_bought.loc[top_domain]
                 .head(10).index.values)

    num_missing_items = 10 - len(pred_list)
    pred_list.extend(top_items[:num_missing_items])
    
    # In case we have not reached 10 items in our recomendation
    # list, we just return the top bought items overall.
    num_missing_items = 10 - len(pred_list)
    pred_list.extend(most_bought_items[:num_missing_items])
    
    pred_list = [int(x) for x in pred_list]
    
    return pred_list

In [12]:
# Order domain/items by domains with most sold items
def predict_ordered(row:pd.Series, cols_domain:list,
            cols_item:list, df_most_bought:pd.DataFrame,
            available_domains:list,
            most_bought_items:list)->list:
    """
    It's important that `cols_item` and `cols_domain` are aligned.
    i.e. if the first elem from `cols_item` is `last_viewed_1`
    then the first elem from `cols_domain` should be `domain_id_last_viewed_1`.
    """
    valid_domains = [d for d in row[cols_domain].unique()
                     if d in available_domains]
    num_bought_domain = [df_most_bought.loc[v,'index_sum'].values[0]
                         for v in valid_domains]
    
    sorted_items = sorted(zip(row[cols_item],num_bought_domain),
                          key=lambda t: t[1], reverse=True)
    pred_list = list(filter(lambda i: not np.isnan(i),
                            set([x[0] for x in sorted_items])))
    
    # Interleave top 10 items from each viewed/searched domain
    # and then flatten. We use this in order to recommend the
    # top items from the viewed/searched domains.
    
    sorted_domains = [d[0]
                      for d in
                      sorted(zip(valid_domains, num_bought_domain),
                             key=lambda t: t[1], reverse=True)]
    top_items = [i 
                 for items in 
                 zip(*[df_most_bought.loc[c]
                       .head(10).index.values
                       for c in valid_domains])
                 for i in items]
    num_missing_items = 10 - len(pred_list)
    pred_list.extend(top_items[:num_missing_items])
    
    # In case we have not reached 10 items in our recomendation
    # list, we just return the top bought items overall.
    num_missing_items = 10 - len(pred_list)
    pred_list.extend(most_bought_items[:num_missing_items])
    
    pred_list = [int(x) for x in pred_list]
    
    return pred_list

# Experiments

In [54]:
#Auxiliary function from ML
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [55]:
path = '../'
item_data = jl_to_list(os.path.join(path,'data/raw/item_data.jl.gz'))

## All domain features

### Simple

In [20]:
predict_ = partial(predict_simple, cols_domain=cols_feat_domain,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [21]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 3min 17s, sys: 290 ms, total: 3min 18s
Wall time: 3min 19s


In [22]:
y_pred[:10]

array([list([702360, 1609412, 1198853, 1969824, 501048, 624374, 1818325, 100484, 88545, 331585]),
       list([1354549, 9643, 1944909, 37223, 1567936, 1628496, 1433166, 649361, 1197718, 615995]),
       list([882697, 1710131, 197884, 859574, 2028013, 1371799, 301390, 1332849, 795772, 1098739]),
       list([1769201, 1140650, 2082809, 1344440, 37718, 1563480, 1843174, 1297894, 362475, 2042014]),
       list([236729, 1839702, 1055848, 1628484, 1017181, 1494195, 2071025, 347256, 1713316, 1145230]),
       list([1417407, 1781650, 2058598, 1263983, 457734, 443430, 1673835, 1151909, 1020256, 766628]),
       list([533194, 451314, 1533900, 340102, 116771, 1385320, 583139, 754274, 1506037, 29021]),
       list([1699907, 430252, 1887359, 1404005, 1779261, 100838, 717628, 585850, 1142113, 2047913]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 64837, 868959, 441418, 2037039, 412, 292, 291, 259])],
      dtype=obj

In [25]:
y_true = df_test_split['item_bought'].values

In [26]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.77 s, sys: 332 ms, total: 4.11 s
Wall time: 4.12 s


In [27]:
score

0.23140759403311448

### Vote

In [28]:
predict_ = partial(predict_vote, cols_domain=cols_feat_domain,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [29]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 3min 12s, sys: 279 ms, total: 3min 13s
Wall time: 3min 13s


In [30]:
y_pred[:10]

array([list([702360, 1609412, 1198853, 1969824, 501048, 88545, 987327, 657474, 999089, 929184]),
       list([1354549, 9643, 1944909, 37223, 1567936, 649361, 1600440, 120284, 863572, 412]),
       list([882697, 1710131, 197884, 859574, 1371799, 1332849, 1098739, 882697, 98853, 119703]),
       list([1769201, 1140650, 2082809, 1344440, 1563480, 1297894, 2042014, 896262, 1244919, 1803825]),
       list([236729, 1839702, 1055848, 1017181, 2071025, 1713316, 1039418, 149802, 1841823, 1694827]),
       list([1417407, 1781650, 2058598, 1263983, 457734, 1151909, 1617443, 412, 292, 291]),
       list([533194, 451314, 1533900, 116771, 583139, 1506037, 1336864, 122562, 195578, 315761]),
       list([1699907, 430252, 1887359, 1779261, 585850, 2076567, 573279, 200082, 469802, 1202417]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 64837, 868959, 441418, 351968, 2069194, 1331091, 1705893, 1044110])],
      dtype=obje

In [31]:
y_true = df_test_split['item_bought'].values

In [32]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.28 s, sys: 256 ms, total: 3.54 s
Wall time: 3.55 s


In [33]:
score

0.24133640993122316

### Ordered

In [35]:
predict_ = partial(predict_ordered, cols_domain=cols_feat_domain,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [36]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 5min 48s, sys: 270 ms, total: 5min 48s
Wall time: 5min 50s


In [37]:
y_pred[:10]

array([list([702360, 1609412, 1198853, 1969824, 501048, 624374, 1818325, 100484, 88545, 331585]),
       list([9643, 1944909, 37223, 1567936, 1628496, 1433166, 649361, 1197718, 615995, 1600440]),
       list([882697, 1710131, 859574, 2028013, 1371799, 301390, 1332849, 795772, 1098739, 1523773]),
       list([1769201, 1140650, 1344440, 37718, 1563480, 1843174, 1297894, 362475, 2042014, 163051]),
       list([236729, 1839702, 1055848, 1628484, 1017181, 1494195, 2071025, 347256, 1713316, 1145230]),
       list([1781650, 2058598, 1263983, 457734, 443430, 1673835, 1151909, 1020256, 766628, 412]),
       list([533194, 451314, 1533900, 340102, 116771, 1385320, 583139, 754274, 1506037, 29021]),
       list([1699907, 430252, 1887359, 1404005, 1779261, 100838, 717628, 585850, 1142113, 2047913]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([64837, 868959, 441418, 2037039, 412, 292, 291, 259, 256, 255])],
      dtype=object)

In [38]:
y_true = df_test_split['item_bought'].values

In [39]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.55 s, sys: 266 ms, total: 3.82 s
Wall time: 3.83 s


In [40]:
score

0.22479317017843778

## No last searched feature

In [64]:
cols_feat_domain_no_last = [c for c in cols_feat_domain
                            if c not in cols_searched_domain]

### Simple

In [65]:
predict_ = partial(predict_simple, cols_domain=cols_feat_domain_no_last,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [66]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 3min 11s, sys: 284 ms, total: 3min 11s
Wall time: 3min 12s


In [67]:
y_pred[:10]

array([list([702360, 1609412, 1198853, 1969824, 501048, 624374, 1818325, 100484, 88545, 331585]),
       list([1354549, 9643, 1944909, 37223, 1567936, 1628496, 1433166, 649361, 1197718, 615995]),
       list([882697, 1710131, 197884, 859574, 2028013, 1371799, 301390, 1332849, 795772, 1098739]),
       list([1769201, 1140650, 2082809, 1344440, 37718, 1563480, 1843174, 1297894, 362475, 2042014]),
       list([236729, 1839702, 1055848, 1628484, 1017181, 1494195, 2071025, 347256, 1713316, 1145230]),
       list([1417407, 1781650, 2058598, 1263983, 457734, 443430, 1673835, 1151909, 1020256, 766628]),
       list([533194, 451314, 1533900, 340102, 116771, 1385320, 583139, 754274, 1506037, 29021]),
       list([1699907, 430252, 1887359, 1404005, 1779261, 100838, 717628, 585850, 1142113, 2047913]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 64837, 868959, 441418, 2037039, 412, 292, 291, 259])],
      dtype=obj

In [68]:
y_true = df_test_split['item_bought'].values

In [69]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.73 s, sys: 266 ms, total: 4 s
Wall time: 4.01 s


In [70]:
score

0.23140759403311448

### Vote

In [71]:
predict_ = partial(predict_vote, cols_domain=cols_feat_domain_no_last,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [72]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 2min 52s, sys: 246 ms, total: 2min 52s
Wall time: 2min 52s


In [73]:
y_pred[:10]

array([list([702360, 1609412, 1198853, 1969824, 501048, 88545, 987327, 657474, 999089, 929184]),
       list([1354549, 9643, 1944909, 37223, 1567936, 649361, 1600440, 120284, 863572, 412]),
       list([882697, 1710131, 197884, 859574, 1371799, 1332849, 1098739, 882697, 98853, 119703]),
       list([1769201, 1140650, 2082809, 1344440, 1563480, 1297894, 2042014, 896262, 1244919, 1803825]),
       list([236729, 1839702, 1055848, 1017181, 2071025, 1713316, 1039418, 149802, 1841823, 1694827]),
       list([1417407, 1781650, 2058598, 1263983, 457734, 1151909, 1617443, 412, 292, 291]),
       list([533194, 451314, 1533900, 116771, 583139, 1506037, 1336864, 122562, 195578, 315761]),
       list([1699907, 430252, 1887359, 1779261, 585850, 2076567, 573279, 200082, 469802, 1202417]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 64837, 868959, 441418, 351968, 2069194, 1331091, 1705893, 1044110])],
      dtype=obje

In [74]:
y_true = df_test_split['item_bought'].values

In [75]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.54 s, sys: 228 ms, total: 3.76 s
Wall time: 3.78 s


In [76]:
score

0.24133640993122316

## No "last" feature

In [77]:
cols_item_no_last = cols_item[2:]

### Simple

In [78]:
predict_ = partial(predict_simple, cols_domain=cols_feat_domain_no_last,
                   cols_item=cols_item_no_last,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [79]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 3min 41s, sys: 284 ms, total: 3min 42s
Wall time: 3min 42s


In [80]:
y_pred[:10]

array([list([1969824, 1198853, 501048, 624374, 1818325, 100484, 88545, 331585, 1043551, 545004]),
       list([1354549, 37223, 1567936, 1628496, 1433166, 649361, 1197718, 615995, 1600440, 459433]),
       list([197884, 859574, 2028013, 1371799, 301390, 1332849, 795772, 1098739, 1523773, 882697]),
       list([2082809, 1140650, 1344440, 37718, 1563480, 1843174, 1297894, 362475, 2042014, 163051]),
       list([1839702, 1055848, 1628484, 1017181, 1494195, 2071025, 347256, 1713316, 1145230, 1039418]),
       list([1417407, 1263983, 457734, 443430, 1673835, 1151909, 1020256, 766628, 412, 292]),
       list([533194, 451314, 1533900, 340102, 116771, 1385320, 583139, 754274, 1506037, 29021]),
       list([1699907, 1887359, 1404005, 1779261, 100838, 717628, 585850, 1142113, 2047913, 2076567]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 441418, 2037039, 412, 292, 291, 259, 256, 255])],
      dtype=object)

In [81]:
y_true = df_test_split['item_bought'].values

In [82]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.61 s, sys: 272 ms, total: 3.89 s
Wall time: 3.91 s


In [83]:
score

0.22489347687363484

### Vote

In [84]:
predict_ = partial(predict_vote, cols_domain=cols_feat_domain_no_last,
                   cols_item=cols_item_no_last,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [85]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 4min 2s, sys: 272 ms, total: 4min 2s
Wall time: 4min 3s


In [86]:
y_pred[:10]

array([list([1969824, 1198853, 501048, 88545, 987327, 657474, 999089, 929184, 1027716, 104300]),
       list([1354549, 37223, 1567936, 649361, 1600440, 120284, 863572, 412, 292, 291]),
       list([197884, 859574, 1371799, 1332849, 1098739, 882697, 98853, 119703, 790888, 1595373]),
       list([2082809, 1140650, 1344440, 1563480, 1297894, 2042014, 896262, 1244919, 1803825, 626263]),
       list([1839702, 1055848, 1017181, 2071025, 1713316, 1039418, 149802, 1841823, 1694827, 80420]),
       list([1417407, 1263983, 457734, 1151909, 1617443, 412, 292, 291, 259, 256]),
       list([533194, 451314, 1533900, 116771, 583139, 1506037, 1336864, 122562, 195578, 315761]),
       list([1699907, 1887359, 1779261, 585850, 2076567, 573279, 200082, 469802, 1202417, 1246381]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 441418, 351968, 2069194, 1331091, 1705893, 1044110, 1339046, 321530])],
      dtype=object)

In [87]:
y_true = df_test_split['item_bought'].values

In [88]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.19 s, sys: 216 ms, total: 3.4 s
Wall time: 3.41 s


In [89]:
score

0.23743886787923346

## Results

Best one is: 

All features + voting heuristic.

Score: 0.24133640993122316

# Random Forest Classifier

Use a Random Forest Classifier to predict the domain of the bought item.

Use the results as a new feature (`domain_id_forest`).

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
df_train = pd.read_parquet(os.path.join(path, train_filename), columns=cols_load_train)

In [13]:
domain_mapper = {x[0]: int(x[1])
                 for x in (df_item
                           [['domain_id','domain_code']]
                           .dropna(how='all')
                           .values)}

In [14]:
for c in cols_feat_domain:
    df_train[f'encoded_{c}'] = df_train[c].map(domain_mapper).fillna(-1)

In [15]:
df_train_split, df_test_split = train_test_split(df_train, random_state=7)

In [16]:
df_most_bought = (df_train_split[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

In [17]:
domains_most_bought_filter = ((df_train_split
 [['domain_id_item_bought','item_bought']]
 .groupby(by='domain_id_item_bought')
 .count()
 .sort_values(by='item_bought', ascending=False)
 .cumsum()/len(df_train))
 .query('item_bought < 0.6')).index

In [18]:
df_train_proc = (df_train_split
                 [df_train_split['domain_id_item_bought']
                  .isin(domains_most_bought_filter)]
                 .reset_index(drop=True)
                 .fillna(-1))

In [19]:
df_test_proc = df_test_split.fillna(-1)

In [20]:
df_train_proc['domain_id_item_bought'] = df_train_proc['domain_id_item_bought'].map(domain_mapper).fillna(-1)
df_test_proc['domain_id_item_bought'] = df_test_proc['domain_id_item_bought'].map(domain_mapper).fillna(-1)

In [23]:
df_test_proc

Unnamed: 0,item_bought,domain_id_item_bought,most_viewed_item_1,most_viewed_item_2,last_viewed_item_1,last_viewed_item_2,domain_id_most_viewed_item_1,domain_id_most_viewed_item_2,domain_id_last_viewed_item_1,domain_id_last_viewed_item_2,...,encoded_domain_id_most_viewed_item_1,encoded_domain_id_most_viewed_item_2,encoded_domain_id_last_viewed_item_1,encoded_domain_id_last_viewed_item_2,encoded_most_viewed_domain_1,encoded_most_viewed_domain_2,encoded_domain_id_most_searched_ngram_1,encoded_domain_id_most_searched_ngram_2,encoded_domain_id_last_searched_1,encoded_domain_id_last_searched_2
194770,1755076,4044,702360.0,1609412.0,1969824.0,1198853.0,MLB-HOME_SUPPLIES,MLB-HOME_SUPPLIES,MLB-ANIMAL_AND_BALL_HOPPERS,MLB-AUTOMOTIVE_EMBLEMS,...,2066.0,2066.0,88.0,228.0,2066.0,4044.0,-1.0,-1.0,-1.0,-1.0
219922,1974062,886,1944909.0,9643.0,37223.0,1354549.0,MLB-SEAT_BELTS,MLB-SEAT_BELTS,MLB-PANTS,MLB-VEHICLE_ACCESSORIES,...,3422.0,3422.0,2937.0,3983.0,3422.0,3983.0,-1.0,-1.0,-1.0,-1.0
31979,2038562,2066,882697.0,1710131.0,197884.0,197884.0,MLB-CELLPHONES,MLB-CELLPHONES,MLB-AIR_CONDITIONERS,MLB-AIR_CONDITIONERS,...,886.0,886.0,46.0,46.0,886.0,46.0,-1.0,-1.0,-1.0,-1.0
338550,1140650,3374,1140650.0,1769201.0,1140650.0,2082809.0,MLB-SANDALS_AND_FLIP_FLOPS,MLB-SANDALS_AND_FLIP_FLOPS,MLB-SANDALS_AND_FLIP_FLOPS,MLB-SANDALS_AND_FLIP_FLOPS,...,3374.0,3374.0,3374.0,3374.0,3374.0,2037.0,-1.0,-1.0,-1.0,-1.0
299605,1296948,2779,236729.0,1839702.0,1839702.0,1839702.0,MLB-COFFEE_MAKERS,MLB-COFFEE_MAKERS,MLB-COFFEE_MAKERS,MLB-COFFEE_MAKERS,...,1017.0,1017.0,1017.0,1017.0,1017.0,1015.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381198,288129,7182,1854903.0,1854903.0,1854903.0,-1.0,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,-1,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,-1,...,5163.0,-1.0,5163.0,-1.0,5163.0,-1.0,-1.0,-1.0,-1.0,-1.0
159971,556280,888,1781.0,477683.0,1158701.0,984219.0,MLB-CELLPHONE_ACCESSORIES,MLB-CELLPHONE_ACCESSORIES,MLB-CELLPHONE_ACCESSORIES,MLB-CELLPHONE_ACCESSORIES,...,888.0,888.0,888.0,888.0,888.0,-1.0,-1.0,-1.0,-1.0,-1.0
157266,2066646,813,695928.0,967886.0,695928.0,967886.0,MLB-WRISTWATCHES,MLB-WRISTWATCHES,MLB-WRISTWATCHES,MLB-WRISTWATCHES,...,4193.0,4193.0,4193.0,4193.0,4193.0,-1.0,-1.0,-1.0,-1.0,-1.0
49161,391231,3522,391231.0,307115.0,307115.0,307115.0,MLB-SNEAKERS,MLB-SNEAKERS,MLB-SNEAKERS,MLB-SNEAKERS,...,3522.0,3522.0,3522.0,3522.0,3522.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [25]:
cols_feat_domain_encoded = [f'encoded_{c}' for c in cols_feat_domain]

X_train, y_train = (df_train_proc[cols_feat_domain_encoded].astype('int32'),
                    df_train_proc['domain_id_item_bought'].astype('int32'))

X_test, y_test = (df_test_proc[cols_feat_domain_encoded].astype('int32'),
                  df_test_proc['domain_id_item_bought'].astype('int32'))

In [29]:
clf = RandomForestClassifier(max_depth=12, 
                             n_estimators=200,
                             n_jobs=-1,
                             class_weight='balanced',
                             random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [30]:
clf.score(X_test, y_test)

0.30685151658905424

In [31]:
clf.score(X_train, y_train)

0.4076285374266293

In [33]:
y_pred_tree = clf.predict(X_test)

In [34]:
df_test_split['domain_id_tree'] = y_pred_tree

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split['domain_id_tree'] = y_pred_tree


In [39]:
domain_mapper_reverse = {value: key for (key,value) in domain_mapper.items()}

In [42]:
df_test_split['domain_id_tree'] = df_test_split['domain_id_tree'].map(domain_mapper_reverse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_split['domain_id_tree'] = df_test_split['domain_id_tree'].map(domain_mapper_reverse)


In [43]:
df_test_split

Unnamed: 0,item_bought,domain_id_item_bought,most_viewed_item_1,most_viewed_item_2,last_viewed_item_1,last_viewed_item_2,domain_id_most_viewed_item_1,domain_id_most_viewed_item_2,domain_id_last_viewed_item_1,domain_id_last_viewed_item_2,...,encoded_domain_id_most_viewed_item_2,encoded_domain_id_last_viewed_item_1,encoded_domain_id_last_viewed_item_2,encoded_most_viewed_domain_1,encoded_most_viewed_domain_2,encoded_domain_id_most_searched_ngram_1,encoded_domain_id_most_searched_ngram_2,encoded_domain_id_last_searched_1,encoded_domain_id_last_searched_2,domain_id_tree
194770,1755076,MLB-VINYL_ROLLS,702360.0,1609412.0,1969824.0,1198853.0,MLB-HOME_SUPPLIES,MLB-HOME_SUPPLIES,MLB-ANIMAL_AND_BALL_HOPPERS,MLB-AUTOMOTIVE_EMBLEMS,...,2066.0,88.0,228.0,2066.0,4044.0,-1.0,-1.0,-1.0,-1.0,MLB-HOME_SUPPLIES
219922,1974062,MLB-CELLPHONES,1944909.0,9643.0,37223.0,1354549.0,MLB-SEAT_BELTS,MLB-SEAT_BELTS,MLB-PANTS,MLB-VEHICLE_ACCESSORIES,...,3422.0,2937.0,3983.0,3422.0,3983.0,-1.0,-1.0,-1.0,-1.0,MLB-SCOOTERS
31979,2038562,MLB-HOME_SUPPLIES,882697.0,1710131.0,197884.0,197884.0,MLB-CELLPHONES,MLB-CELLPHONES,MLB-AIR_CONDITIONERS,MLB-AIR_CONDITIONERS,...,886.0,46.0,46.0,886.0,46.0,-1.0,-1.0,-1.0,-1.0,MLB-AIR_CONDITIONERS
338550,1140650,MLB-SANDALS_AND_FLIP_FLOPS,1140650.0,1769201.0,1140650.0,2082809.0,MLB-SANDALS_AND_FLIP_FLOPS,MLB-SANDALS_AND_FLIP_FLOPS,MLB-SANDALS_AND_FLIP_FLOPS,MLB-SANDALS_AND_FLIP_FLOPS,...,3374.0,3374.0,3374.0,3374.0,2037.0,-1.0,-1.0,-1.0,-1.0,MLB-SANDALS_AND_FLIP_FLOPS
299605,1296948,MLB-NAIL_FILES,236729.0,1839702.0,1839702.0,1839702.0,MLB-COFFEE_MAKERS,MLB-COFFEE_MAKERS,MLB-COFFEE_MAKERS,MLB-COFFEE_MAKERS,...,1017.0,1017.0,1017.0,1017.0,1015.0,-1.0,-1.0,-1.0,-1.0,MLB-COFFEE_CAPSULES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381198,288129,MLM-SCHOOL_AND_OFFICE_PAPERS,1854903.0,1854903.0,1854903.0,,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,,...,-1.0,5163.0,-1.0,5163.0,-1.0,-1.0,-1.0,-1.0,-1.0,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS
159971,556280,MLB-CELLPHONE_ACCESSORIES,1781.0,477683.0,1158701.0,984219.0,MLB-CELLPHONE_ACCESSORIES,MLB-CELLPHONE_ACCESSORIES,MLB-CELLPHONE_ACCESSORIES,MLB-CELLPHONE_ACCESSORIES,...,888.0,888.0,888.0,888.0,-1.0,-1.0,-1.0,-1.0,-1.0,MLB-CELLPHONE_ACCESSORIES
157266,2066646,MLB-CARD_PAYMENT_TERMINALS,695928.0,967886.0,695928.0,967886.0,MLB-WRISTWATCHES,MLB-WRISTWATCHES,MLB-WRISTWATCHES,MLB-WRISTWATCHES,...,4193.0,4193.0,4193.0,4193.0,-1.0,-1.0,-1.0,-1.0,-1.0,MLB-WRISTWATCHES
49161,391231,MLB-SNEAKERS,391231.0,307115.0,307115.0,307115.0,MLB-SNEAKERS,MLB-SNEAKERS,MLB-SNEAKERS,MLB-SNEAKERS,...,3522.0,3522.0,3522.0,3522.0,-1.0,-1.0,-1.0,-1.0,-1.0,MLB-SNEAKERS


In [48]:
df_most_bought = (df_train_split[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

# Add information about the number of items bought per domain
df_most_bought = df_most_bought.join(df_most_bought
                                     .reset_index()[['domain_id_item_bought','index']]
                                     .groupby(by='domain_id_item_bought')
                                     .sum()
                                     .sort_values(by='index', ascending=False),
                                     how='left', rsuffix='_sum')

most_bought_items = [i[0] 
                     for i in 
                     (df_most_bought
                      .sort_values(by='index', ascending=False)
                      .head(10).values)]

available_domains = (df_most_bought
                     .reset_index()
                     ['domain_id_item_bought']
                     .unique())

predict_ = partial(predict_vote, cols_domain=cols_feat_domain+['domain_id_tree'],
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [49]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 2min 57s, sys: 358 ms, total: 2min 57s
Wall time: 2min 57s


In [50]:
y_pred[:10]

array([list([702360, 1609412, 1198853, 1969824, 501048, 88545, 987327, 657474, 999089, 929184]),
       list([1354549, 9643, 1944909, 37223, 1567936, 649361, 1600440, 120284, 863572, 412]),
       list([882697, 1710131, 197884, 2028013, 301390, 795772, 1523773, 451393, 1951279, 641284]),
       list([1769201, 1140650, 2082809, 1344440, 1563480, 1297894, 2042014, 896262, 1244919, 1803825]),
       list([236729, 1839702, 1055848, 1017181, 2071025, 1713316, 1039418, 149802, 1841823, 1694827]),
       list([1417407, 1781650, 2058598, 1263983, 457734, 1151909, 1617443, 412, 292, 291]),
       list([533194, 451314, 1533900, 116771, 583139, 1506037, 1336864, 122562, 195578, 315761]),
       list([1699907, 430252, 1887359, 1404005, 717628, 2047913, 215683, 349404, 706521, 1009625]),
       list([1771696, 613068, 2065319, 1469766, 1735418, 7244, 393734, 777247, 1561930, 132800]),
       list([1317448, 1080242, 64837, 868959, 441418, 351968, 2069194, 1331091, 1705893, 1044110])],
      dtype=obj

In [51]:
y_true = df_test_split['item_bought'].values

In [56]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.8 s, sys: 109 ms, total: 3.9 s
Wall time: 3.9 s


In [57]:
score

0.24149850227976313

#### Test

Using only the tree predicted domain.

In [58]:
predict_ = partial(predict_vote, cols_domain=['domain_id_tree'],
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [59]:
%%time
y_pred = df_test_split.apply(predict_, axis=1).values

CPU times: user 1min 53s, sys: 157 ms, total: 1min 54s
Wall time: 1min 54s


In [60]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 3.27 s, sys: 116 ms, total: 3.39 s
Wall time: 3.39 s


In [61]:
score

0.2291374253956851

# Submission

## All Features + Voting

In [8]:
path = '../'
train_filename = 'data/interim/train_dataset_features.parquet'
test_filename = 'data/interim/test_dataset_features.parquet'

cols_load = ['item_bought', 'domain_id_item_bought']
cols_feat_domain = []

cols_item = [f'{i}_viewed_item_{j}' for i in ['most','last'] for j in range(1,3)]
cols_load.extend(cols_item)

cols_item_domain = [f'domain_id_{c}' for c in cols_item]
cols_load.extend(cols_item_domain)
cols_feat_domain.extend(cols_item_domain)

cols_domain = [f'most_viewed_domain_{i}' for i in range(1,3)]
cols_load.extend(cols_domain)
cols_feat_domain.extend(cols_domain)

cols_ngram_domain = [f'domain_id_most_searched_ngram_{i}' for i in range(1,3)]
cols_load.extend(cols_ngram_domain)
cols_feat_domain.extend(cols_ngram_domain)

cols_searched_domain = [f'domain_id_last_searched_{i}' for i in range(1,3)]
cols_load.extend(cols_searched_domain)
cols_feat_domain.extend(cols_searched_domain)

In [16]:
df_train = pd.read_parquet(os.path.join(path, train_filename), columns=cols_load)
df_test = pd.read_parquet(os.path.join(path, test_filename), columns=cols_load[2:]+['user_id'])

In [17]:
df_most_bought = (df_train[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

# Add information about the number of items bought per domain
df_most_bought = df_most_bought.join(df_most_bought
                                     .reset_index()[['domain_id_item_bought','index']]
                                     .groupby(by='domain_id_item_bought')
                                     .sum()
                                     .sort_values(by='index', ascending=False),
                                     how='left', rsuffix='_sum')

most_bought_items = [i[0] 
                     for i in 
                     (df_most_bought
                      .sort_values(by='index', ascending=False)
                      .head(10).values)]

available_domains = (df_most_bought
                     .reset_index()
                     ['domain_id_item_bought']
                     .unique())

predict_ = partial(predict_vote, cols_domain=cols_domain,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [18]:
df_test = df_test.set_index('user_id').sort_index()

In [19]:
y_pred = df_test.apply(predict_, axis=1).values

In [20]:
df_y_pred = pd.DataFrame(list(y_pred))

In [21]:
df_y_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,685656,1194894,1572239,1439187,1864599,350187,1006988,1717880,457681,758382
1,943786,831243,849692,361733,1140688,1056120,353783,1676401,1007213,1386246
2,1614538,1986443,991246,1219935,659207,1453414,1425924,1260633,251150,731397
3,859574,1371799,119703,1332849,882697,1098739,98853,790888,967194,1595373
4,1398419,1695773,868571,753509,914038,1197614,509032,795435,834557,1696062
...,...,...,...,...,...,...,...,...,...,...
177065,24604,88545,501048,987327,657474,999089,1027716,929184,1328040,1873494
177066,331472,1651522,1384867,1907343,1711350,558,384,371,348,344
177067,1061535,2033823,679654,460028,1395589,1550304,289602,1492333,1426798,359542
177068,1803169,762367,1802089,894197,2029390,877276,1116998,514188,170257,613975


In [22]:
from datetime import datetime

In [28]:
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model = input()
submission_filename = f'../data/submission/{model}_{now}.csv'
print(submission_filename)

 heuristic_vote


../data/submission/heuristic_vote_2020-11-30_17-23-38.csv


In [29]:
df_y_pred.to_csv(submission_filename, index=False, header=False)

## All Features + Tree + Voting

In [2]:
path = '../'
train_filename = 'data/interim/train_dataset_features.parquet'
test_filename = 'data/interim/test_dataset_features.parquet'
item_filename = 'data/interim/item_data.parquet'

cols_load_test = ['item_id', 'domain_id', 'domain_code']

cols_load_train = ['item_bought', 'domain_id_item_bought']
cols_feat_domain = []

cols_item = [f'{i}_viewed_item_{j}' for i in ['most','last'] for j in range(1,3)]
cols_load_train.extend(cols_item)

cols_item_domain = [f'domain_id_{c}' for c in cols_item]
cols_load_train.extend(cols_item_domain)
cols_feat_domain.extend(cols_item_domain)

cols_domain = [f'most_viewed_domain_{i}' for i in range(1,3)]
cols_load_train.extend(cols_domain)
cols_feat_domain.extend(cols_domain)

cols_ngram_domain = [f'domain_id_most_searched_ngram_{i}' for i in range(1,3)]
cols_load_train.extend(cols_ngram_domain)
cols_feat_domain.extend(cols_ngram_domain)

cols_searched_domain = [f'domain_id_last_searched_{i}' for i in range(1,3)]
cols_load_train.extend(cols_searched_domain)
cols_feat_domain.extend(cols_searched_domain)

In [3]:
df_train = pd.read_parquet(os.path.join(path, train_filename), columns=cols_load_train)
df_test = pd.read_parquet(os.path.join(path, test_filename), columns=cols_load_train[2:]+['user_id'])
df_item = pd.read_parquet(os.path.join(path, item_filename), columns=cols_load_test)

In [7]:
domain_mapper = {x[0]: int(x[1])
                 for x in (df_item
                           [['domain_id','domain_code']]
                           .dropna(how='all')
                           .values)}

domain_mapper_reverse = {value: key for (key,value) in domain_mapper.items()}

df_most_bought = (df_train[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

# Add information about the number of items bought per domain
df_most_bought = df_most_bought.join(df_most_bought
                                     .reset_index()[['domain_id_item_bought','index']]
                                     .groupby(by='domain_id_item_bought')
                                     .sum()
                                     .sort_values(by='index', ascending=False),
                                     how='left', rsuffix='_sum')

most_bought_items = [i[0] 
                     for i in 
                     (df_most_bought
                      .sort_values(by='index', ascending=False)
                      .head(10).values)]

available_domains = (df_most_bought
                     .reset_index()
                     ['domain_id_item_bought']
                     .unique())

predict_ = partial(predict_vote, cols_domain=cols_domain+['domain_id_tree'],
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [8]:
df_test = df_test.set_index('user_id').sort_index()

In [9]:
cols_feat_domain_encoded = [f'encoded_{c}' for c in cols_feat_domain]

for c in cols_feat_domain:
    df_train[f'encoded_{c}'] = df_train[c].map(domain_mapper).fillna(-1)
    
df_train['encoded_domain_id_item_bought'] = df_train['domain_id_item_bought'].map(domain_mapper).fillna(-1)

for c in cols_feat_domain:
    df_test[f'encoded_{c}'] = df_test[c].map(domain_mapper).fillna(-1)

In [10]:
df_train

Unnamed: 0,item_bought,domain_id_item_bought,most_viewed_item_1,most_viewed_item_2,last_viewed_item_1,last_viewed_item_2,domain_id_most_viewed_item_1,domain_id_most_viewed_item_2,domain_id_last_viewed_item_1,domain_id_last_viewed_item_2,...,encoded_domain_id_most_viewed_item_2,encoded_domain_id_last_viewed_item_1,encoded_domain_id_last_viewed_item_2,encoded_most_viewed_domain_1,encoded_most_viewed_domain_2,encoded_domain_id_most_searched_ngram_1,encoded_domain_id_most_searched_ngram_2,encoded_domain_id_last_searched_1,encoded_domain_id_last_searched_2,encoded_domain_id_item_bought
0,394965,MLB-PRINTER_INKS,626904.0,1833895.0,1.0,1.0,MLB-NETWORK_PLUGS,MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,MLB-BOOTS_AND_BOOTIES,MLB-BOOTS_AND_BOOTIES,...,1072.0,692.0,692.0,3838.0,2021.0,-1.0,-1.0,-1.0,-1.0,3192
1,331081,MLM-BAR_CODE_SCANNERS,331081.0,768359.0,768359.0,11.0,MLM-BAR_CODE_SCANNERS,MLM-BAR_CODE_SCANNERS,MLM-BAR_CODE_SCANNERS,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,...,4641.0,4641.0,5163.0,4641.0,5163.0,-1.0,-1.0,-1.0,-1.0,4641
2,1434557,MLB-WRENCH_SETS,1100593.0,74407.0,1021802.0,13.0,MLB-PLANTS,MLB-SPEAKERS,MLB-SPEAKERS,MLB-SPEAKERS,...,3554.0,3554.0,3554.0,3554.0,3951.0,-1.0,-1.0,-1.0,-1.0,4191
3,492271,MLM-CELLPHONE_SCREENS,15.0,106702.0,15.0,15.0,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-CAR_LIGHT_BULBS,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,...,4968.0,5811.0,5811.0,4968.0,5811.0,-1.0,-1.0,-1.0,-1.0,5017
4,1540965,MLB-MUGS,860111.0,33.0,194780.0,33.0,MLB-BALL_PIT_BALLS,MLB-INFLATABLE_BALL_PITS,MLB-KIDS_TENTS,MLB-INFLATABLE_BALL_PITS,...,2187.0,2275.0,2187.0,476.0,2187.0,-1.0,-1.0,-1.0,-1.0,2753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413158,2101186,MLB-INDUSTRIAL_AND_COMMERCIAL_EQUIPMENT,,,,,,,,,...,-1.0,-1.0,-1.0,-1.0,-1.0,2046.0,1879.0,-1.0,-1.0,2155
413159,2101232,MLB-UNIVERSAL_REMOTE_CONTROLS,,,,,,,,,...,-1.0,-1.0,-1.0,-1.0,-1.0,4169.0,4169.0,-1.0,-1.0,3967
413160,2101773,MLB-FOOTBALL_SHIRTS,,,,,,,,,...,-1.0,-1.0,-1.0,-1.0,-1.0,7675.0,3955.0,-1.0,-1.0,1778
413161,2102136,MLB-OPERATING_SYSTEMS,,,,,,,,,...,-1.0,-1.0,-1.0,-1.0,-1.0,6288.0,121.0,-1.0,-1.0,2872


In [11]:
domains_most_bought_filter = ((df_train
 [['domain_id_item_bought','item_bought']]
 .groupby(by='domain_id_item_bought')
 .count()
 .sort_values(by='item_bought', ascending=False)
 .cumsum()/len(df_train))
 .query('item_bought < 0.8')).index

In [12]:
df_train_filtered = df_train[df_train['domain_id_item_bought'].isin(domains_most_bought_filter)]

In [14]:
X_train, y_train = (df_train_filtered[cols_feat_domain_encoded].astype('int32'),
                    df_train_filtered['encoded_domain_id_item_bought'].astype('int32'))

X_test = df_test[cols_feat_domain_encoded].astype('int32')

In [15]:
clf = RandomForestClassifier(max_depth=12, 
                             n_estimators=200,
                             n_jobs=-1,
                             class_weight='balanced',
                             random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [16]:
y_pred_tree = clf.predict(X_test)

In [17]:
df_test['domain_id_tree'] = [domain_mapper_reverse.get(d) for d in y_pred_tree]

In [18]:
y_pred = df_test.apply(predict_, axis=1).values

In [19]:
df_y_pred = pd.DataFrame(list(y_pred))

In [20]:
df_y_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,685656,1194894,1572239,1439187,1864599,350187,1006988,1717880,457681,758382
1,943786,831243,849692,361733,1140688,1056120,353783,1676401,1007213,1386246
2,1614538,1986443,991246,1219935,659207,1453414,1425924,1260633,251150,731397
3,389079,1077564,1947905,1178383,1862072,617149,1677522,1979925,733341,1045291
4,1398419,1695773,868571,753509,914038,1197614,509032,795435,834557,1696062
...,...,...,...,...,...,...,...,...,...,...
177065,24604,88545,501048,987327,657474,999089,1027716,929184,1328040,1873494
177066,331472,1651522,1384867,1907343,1711350,558,384,371,348,344
177067,1061535,2033823,679654,460028,1395589,1550304,289602,1492333,1426798,359542
177068,1803169,762367,1802089,894197,2029390,877276,1116998,514188,170257,613975


In [21]:
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model = input()
submission_filename = f'../data/submission/{model}_{now}.csv'
print(submission_filename)

 heuristic_tree_vote


../data/submission/heuristic_tree_vote_2020-11-30_18-38-07.csv


In [22]:
df_y_pred.to_csv(submission_filename, index=False, header=False)

# Scores

- heuristic_ordered_2020-11-28.csv
    - 0.23168
- heuristic_unordered_2020-11-28.csv
    - 0.2339678621847048
- heuristic_vote
    - 0.2385192220688938 
- heuristic_tree_vote
    - 0.2389696814287167 