In [1]:
import pandas as pd
import numpy as np
import swifter
import gzip
import json
import os

from sklearn.model_selection import train_test_split
from functools import partial, reduce
from src.models import ndcg_score
from collections import Counter

In [2]:
path = '../'
train_filename = 'data/interim/train_dataset_features.parquet'
cols_load = ['item_bought', 'domain_id_item_bought',
        'most_viewed_1', 'most_viewed_2',
        'domain_id_most_viewed_1', 'domain_id_most_viewed_2',
        'last_viewed',
        'domain_id_last_viewed',
        'most_searched_ngram_1_domain', 'most_searched_ngram_2_domain']
df_train = pd.read_parquet(os.path.join(path, train_filename), columns=cols_load)

In [3]:
df_train

Unnamed: 0,item_bought,domain_id_item_bought,most_viewed_1,most_viewed_2,domain_id_most_viewed_1,domain_id_most_viewed_2,last_viewed,domain_id_last_viewed,most_searched_ngram_1_domain,most_searched_ngram_2_domain
0,394965,MLB-PRINTER_INKS,626904.0,1833895.0,MLB-NETWORK_PLUGS,MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,1.0,MLB-BOOTS_AND_BOOTIES,,
1,492271,MLM-CELLPHONE_SCREENS,15.0,106702.0,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,MLM-CAR_LIGHT_BULBS,15.0,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,,
2,1717880,MLB-HARD_DRIVES_AND_SSDS,381867.0,753781.0,MLB-CARPET_AND_UPHOLSTERY_CLEANERS,MLB-HARD_DRIVES_AND_SSDS,21.0,MLB-BUSES,,
3,33,MLB-INFLATABLE_BALL_PITS,5896.0,33.0,MLB-INFLATABLE_BALL_PITS,MLB-INFLATABLE_BALL_PITS,33.0,MLB-INFLATABLE_BALL_PITS,,
4,1049082,MLB-INFLATABLE_BALL_PITS,1181970.0,33.0,MLB-KIDS_TENTS,MLB-INFLATABLE_BALL_PITS,33.0,MLB-INFLATABLE_BALL_PITS,,
...,...,...,...,...,...,...,...,...,...,...
413158,2101186,MLB-INDUSTRIAL_AND_COMMERCIAL_EQUIPMENT,,,,,,,MLB-PHOTOGRAPHIC_REFLECTORS,MLB-WINDOWS
413159,2101232,MLB-UNIVERSAL_REMOTE_CONTROLS,,,,,,,MLB-COMPUTER_USB_LAMPS,MLB-COMPUTER_USB_LAMPS
413160,2101773,MLB-FOOTBALL_SHIRTS,,,,,,,MLB-STAPLERS,MLB-WINDSOCKS
413161,2102136,MLB-OPERATING_SYSTEMS,,,,,,,MLB-COMPUTER_USB_LAMPS,MLB-REAR_WHEEL_HUBS_BEARING_ASSEMBLY


In [4]:
item_filename = 'data/interim/item_data.parquet'
cols_load = ['item_id', 'domain_id']
df_item = pd.read_parquet(os.path.join(path, item_filename), columns=cols_load)

In [5]:
df_item

Unnamed: 0,item_id,domain_id
0,111260,MLM-INDIVIDUAL_HOUSES_FOR_SALE
1,871377,MLM-VIDEO_GAMES
2,490232,MLM-SKIRTS
3,1150706,MLM-GRAPHICS_CARDS
4,934912,MLM-NOTEBOOKS
...,...,...
2102272,1099649,MLB-BABY_STROLLERS
2102273,1482187,MLB-KITCHEN_SUPPLIES
2102274,1118904,MLB-SOCKS
2102275,237229,MLB-DISPOSABLE_BABY_DIAPERS


# Feature

## Top most bought items for each domain

In [20]:
(df_train[['domain_id_item_bought','item_bought']]
 .reset_index()
 .groupby(by=['domain_id_item_bought','item_bought'])
 .count()
 .sort_values(by=['domain_id_item_bought','index'], ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,index
domain_id_item_bought,item_bought,Unnamed: 2_level_1
MLM-YOGA_MATS,854504,8
MLM-YOGA_MATS,1895261,5
MLM-YOGA_MATS,1174364,4
MLM-YOGA_MATS,778139,1
MLM-XENON_KITS,316218,5
...,...,...
MLB-ABDOMINAL_TONING_BELTS,583001,1
MLB-ABDOMINAL_TONING_BELTS,1243213,1
MLB-ABDOMINAL_TONING_BELTS,1414850,1
MLB-3D_PRINTERS,588264,1


## Top 30 domains with the most purchases

In [10]:
(df_train[['domain_id_item_bought','item_bought']]
.groupby(by='domain_id_item_bought')
.count()
.sort_values(by='item_bought', ascending=False)).head(30)

Unnamed: 0_level_0,item_bought
domain_id_item_bought,Unnamed: 1_level_1
MLB-CELLPHONES,25070
MLB-SNEAKERS,14608
MLB-SUPPLEMENTS,9562
MLB-HEADPHONES,9053
MLB-SMARTWATCHES,7963
MLB-DOLLS,4915
MLB-VIDEO_GAMES,4637
MLM-HEADPHONES,4603
MLB-BOOTS_AND_BOOTIES,4325
MLB-STREAMING_MEDIA_DEVICES,4104


# Model

## First heuristic

### Introduction

Based on the domains from the most viewed and last viewed itens, return a mixed suggestion list.

Ex:


In [22]:
cols_feature = ['domain_id_last_viewed', 'domain_id_most_viewed_1', 
                'domain_id_most_viewed_2', 'most_searched_ngram_1_domain',
                'most_searched_ngram_2_domain']
# TODO: normalize these column names

df_train.loc[0, cols_feature]

domain_id_last_viewed                            MLB-BOOTS_AND_BOOTIES
domain_id_most_viewed_1                              MLB-NETWORK_PLUGS
domain_id_most_viewed_2         MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS
most_searched_ngram_1_domain                                      None
most_searched_ngram_2_domain                                      None
Name: 0, dtype: object

In this case the suggested items will be divided among MLB-BOOTS_AND_BOOTIES, MLB-NETWORK_PLUGS and MLB-COMPUTER_EQUIPMENT_AND_SPARE_PARTS. So three items from each domain (+1 from the last).

The first item from each domain will come from its respective item feature (`last_viewed`, `most_viewed` or `most_searched`, if available).

In [25]:
df_train.loc[0, 'last_viewed']

1.0

In this example, the `last_viewed` item was the one with `item_id` = 1.

We still need to pick two more items from this domain. We'll pick the two most bought from the given domain (MLB-BOOTS_AND_BOOTIES in this example).

### Implementation

In [6]:
df_train_split, df_test_split = train_test_split(df_train)

In [7]:
df_most_bought = (df_train_split[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

In [8]:
cols = ['MLM-YOGA_MATS','MLM-XENON_KITS']
[i for items in (zip(*[df_most_bought.loc[c].head(10).index.values for c in cols])) for i in items]

[854504, 316218, 1895261, 522546, 1174364, 861906, 778139, 1766808]

In [9]:
def predict(row:pd.Series, cols_domain:list,
            cols_item:list, df_most_bought:pd.DataFrame,
            available_domains:list,
            most_bought_items:list)->list:
    
    pred_list = reduce(lambda x, y: x + [row[y]]
                       if not np.isnan(row[y]) else x,
                       cols_item, [])

    # Interleave top 10 items from each viewed/searched domain
    # and then flatten
    domains = [c
               for c in row[cols_domain].values
               if c in available_domains]
    top_items = [i 
                 for items in 
                 zip(*[df_most_bought.loc[c]
                       .head(10).index.values
                       for c in domains])
                 for i in items]

    num_missing_items = 10 - len(pred_list)
    pred_list.extend(top_items[:num_missing_items])
    
    num_missing_items = 10 - len(pred_list)
    pred_list.extend(most_bought_items[:num_missing_items])
    
    pred_list = [int(x) for x in pred_list]
    
    return pred_list

In [10]:
cols_domain = ['domain_id_last_viewed', 'domain_id_most_viewed_1',
               'domain_id_most_viewed_2', 'most_searched_ngram_1_domain',
               'most_searched_ngram_2_domain']
cols_item = ['last_viewed', 'most_viewed_1',
             'most_viewed_2']
available_domains = (df_most_bought
                     .reset_index()
                     ['domain_id_item_bought']
                     .unique())
most_bought_items = [i[0] 
                     for i in 
                     (df_most_bought
                      .sort_values(by='index', ascending=False)
                      .head(10).values)]

predict_ = partial(predict, cols_domain=cols_domain,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [18]:
%%time
y_pred = df_test_split.head(5000).apply(predict_, axis=1).values

CPU times: user 12.3 s, sys: 39.7 ms, total: 12.4 s
Wall time: 12.5 s


In [19]:
y_pred[:10]

array([list([37589, 440253, 536586, 391833, 1703908, 1703908, 934745, 1453166, 1453166, 414]),
       list([602731, 975756, 5060, 185608, 1433166, 185608, 1736865, 615995, 1736865, 2031255]),
       list([1933468, 1933468, 962540, 976235, 976235, 1207157, 1918520, 1918520, 522466, 444122]),
       list([347293, 347293, 960978, 444075, 444075, 444075, 1905042, 1905042, 1905042, 1388781]),
       list([561839, 545369, 1114400, 2032612, 545369, 545369, 777316, 877640, 877640, 768694]),
       list([1555988, 1555988, 1737735, 1630796, 1630796, 1851910, 1851910, 781928, 781928, 1754180]),
       list([1147728, 895911, 1147728, 158235, 899713, 158235, 1147728, 830693, 1147728, 906052]),
       list([1518082, 590708, 1518082, 249793, 249793, 249793, 896559, 896559, 896559, 1858548]),
       list([1890272, 1890272, 414, 294, 280, 254, 248, 242, 241, 241]),
       list([426856, 426856, 742894, 426856, 426856, 426856, 117008, 117008, 117008, 811050])],
      dtype=object)

In [24]:
y_true = df_test_split.head(5000)['item_bought'].values

In [14]:
#Auxiliary function from ML
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [15]:
path = '../'
item_data = jl_to_list(os.path.join(path,'data/raw/item_data.jl.gz'))

In [25]:
%%time
score = ndcg_score(y_true, y_pred, item_data, n_predictions=10)

CPU times: user 1.56 s, sys: 123 ms, total: 1.68 s
Wall time: 1.68 s


In [26]:
score

0.21548472644015737

# First submission

In [27]:
df_most_bought = (df_train[['domain_id_item_bought','item_bought']]
                  .reset_index()
                  .groupby(by=['domain_id_item_bought','item_bought'])
                  .count()
                  .sort_values(by=['domain_id_item_bought','index'], ascending=False))

In [28]:
cols_domain = ['domain_id_last_viewed', 'domain_id_most_viewed_1',
               'domain_id_most_viewed_2', 'most_searched_ngram_1_domain',
               'most_searched_ngram_2_domain']
cols_item = ['last_viewed', 'most_viewed_1',
             'most_viewed_2']
available_domains = (df_most_bought
                     .reset_index()
                     ['domain_id_item_bought']
                     .unique())
most_bought_items = [i[0] 
                     for i in 
                     (df_most_bought
                      .sort_values(by='index', ascending=False)
                      .head(10).values)]

predict_ = partial(predict, cols_domain=cols_domain,
                   cols_item=cols_item,
                   df_most_bought=df_most_bought,
                   available_domains=available_domains,
                   most_bought_items=most_bought_items)

In [29]:
test_filename = 'data/interim/test_dataset_features.parquet'
cols_load = ['most_viewed_1', 'most_viewed_2',
             'domain_id_most_viewed_1', 'domain_id_most_viewed_2',
             'last_viewed',
             'domain_id_last_viewed',
             'most_searched_ngram_1_domain', 'most_searched_ngram_2_domain']
df_test = pd.read_parquet(os.path.join(path, test_filename), columns=cols_load)

In [30]:
df_test

Unnamed: 0,most_viewed_1,most_viewed_2,domain_id_most_viewed_1,domain_id_most_viewed_2,last_viewed,domain_id_last_viewed,most_searched_ngram_1_domain,most_searched_ngram_2_domain
0,4.0,937805.0,MLB-GAME_CONSOLES,MLB-T_SHIRTS,4.0,MLB-GAME_CONSOLES,,
1,1191481.0,1756634.0,MLB-SURVEILLANCE_CAMERAS,MLB-GAME_CONSOLES,4.0,MLB-GAME_CONSOLES,,
2,9.0,9.0,MLM-VIBRATORS,,9.0,MLM-VIBRATORS,,
3,1036005.0,701615.0,MLM-UNCLASSIFIED_PRODUCTS,MLM-SCHOOL_AND_OFFICE_SUPPLIES,11.0,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,,
4,1181970.0,33.0,MLB-KIDS_TENTS,MLB-INFLATABLE_BALL_PITS,33.0,MLB-INFLATABLE_BALL_PITS,,
...,...,...,...,...,...,...,...,...
177065,,,,,,,MLM-FABRICS,MLM-FABRICS
177066,,,,,,,MLB-LAMINATING_ROLLS_AND_POUCHES,MLB-ARTIFICIAL_FLOWERS
177067,,,,,,,MLB-MEDICAL_HOSES,MLB-MEDICAL_HOSES
177068,,,,,,,MLM-QUENAS,MLM-QUENAS


In [198]:
y_pred = df_test.apply(predict_, axis=1).values

In [199]:
df_y_pred = pd.DataFrame(y_pred)

In [215]:
df_y_pred = df_y_pred[0].apply(lambda x: ",".join(map(str, x)))

In [219]:
df_y_pred.to_csv('../data/submission/v0_2020-11-27.csv', index=False, header=False)

In [220]:
df_y_pred.apply(lambda x: len(x) < 5).sum()

0

In [231]:
(pd.DataFrame(list(df_y_pred.apply(lambda x: 
                 [int(y) for y in x.split(',')])))
 .to_csv("test.csv", index=False, header=False))

In [232]:
(pd.DataFrame(list(df_y_pred.apply(lambda x: 
                 [int(y) for y in x.split(',')])))
 .to_csv("../data/submission/v0_2020-11-27.csv", index=False, header=False))

In [233]:
df_y_pred

0         4,4,937805,1138806,1138806,1314932,596359,5963...
1         4,1191481,1756634,1138806,1158631,1138806,5963...
2         9,9,9,617470,617470,2080731,2080731,1696504,16...
3         11,1036005,701615,537022,1714590,1695066,12470...
4         33,1181970,33,1079796,867251,1079796,1049082,2...
                                ...                        
177065              558,384,371,348,344,330,326,324,322,322
177066    1011773,648047,2096865,303862,1756565,1218818,...
177067              558,384,371,348,344,330,326,324,322,322
177068              558,384,371,348,344,330,326,324,322,322
177069              558,384,371,348,344,330,326,324,322,322
Name: 0, Length: 177070, dtype: object