# H&M Data as Next Basket

    - source: https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/
    - transactions in: transactions_train.csv
    - customer information in: customers.csv
    - product descriptions in: articles.csv
    - product images are in: images/ folder by article-id
    
    - treat each customers 1-day interaction as baskets and predict the next basket

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import time

from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from utils import write_train_file, create_submission_file

In [3]:
data_dir = "/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation"

In [4]:
df_tr = pd.read_csv(os.path.join(data_dir, "transactions_train.csv"), dtype={'article_id': str})
df_tr['t_dat'] = pd.to_datetime(df_tr['t_dat'])

In [5]:
print(f"Total {len(df_tr['customer_id'].unique())} customers and {len(df_tr['article_id'].unique())} products")
df_tr['t_dat'].min(), df_tr['t_dat'].max()

Total 1362281 customers and 104547 products


(Timestamp('2018-09-20 00:00:00'), Timestamp('2020-09-22 00:00:00'))

In [7]:
tr_data = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-08-31')].copy()  # last 3 weeks
# tr_data = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-07-31')].copy()  # last 7 weeks
# tr_data = df_tr[df_tr['t_dat'] < pd.to_datetime('2020-08-31')].copy()  # everything before the last 3 weeks

print(f"Total {len(tr_data['customer_id'].unique())} customers and {len(tr_data['article_id'].unique())} products")

Total 196319 customers and 26583 products


In [8]:
df_prod = pd.read_csv(os.path.join(data_dir, "articles.csv"), dtype={'article_id': str, 'product_code': str})
df_prod = df_prod.set_index('article_id').T.to_dict()
print(f"Total {len(df_prod)} products")

Total 105542 products


In [9]:
dfg = tr_data.groupby("customer_id")
cust_dict, prod_dict, transactions = {}, {}, {}
count_cust, count_prod = 0, 0
num_products = []
for ckey, df_c in tqdm(dfg):
    df_c = df_c.sort_values(by=['t_dat'])
    num_days = len(df_c['t_dat'].unique())
    if num_days >= 2:
        count_cust += 1
        cust_dict[ckey] = count_cust
        transactions[ckey] = {'products': [], 'days': [], 'sessions': 0}
        df_cg = df_c.groupby("t_dat")
        for ii, df_ in df_cg:
            products = df_['article_id'].tolist()
            dates = df_['t_dat'].tolist()
            for p in products:
                if p in prod_dict:
                    prod_dict[p] += 1
                else:
                    prod_dict[p] = 1
            transactions[ckey]['products'].append(products)
            transactions[ckey]['days'].append(dates)
            transactions[ckey]['sessions'] += 1
            num_products.append(len(df_['article_id'].unique()))

print(f"Total {len(cust_dict)} customers with average {np.mean(num_products):.0f} products per day")
print(f"Total {len(prod_dict)} items")

100%|██████████| 196319/196319 [05:18<00:00, 616.50it/s]

Total 48709 customers with average 3 products per day
Total 21712 items





In [10]:
filtered_customers = list(cust_dict.keys())
filtered_customers[0]

'0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37e011580a479e80aa94'

In [11]:
def flatten_list(lst):
    return [item for sublist in lst for item in sublist]

In [12]:
filtered_customers = list(cust_dict.keys())

all_seqs = [flatten_list(transactions[cust]['products']) for cust in filtered_customers]
seq_prods = set(flatten_list(all_seqs))
print(f"{len(seq_prods)} items in filtered sequences")

# items solely based on the considered interactions - tighter bounds
filtered_items = set([k for k in prod_dict if k in seq_prods])
print(f"Original {len(prod_dict)} items, remaining {len(filtered_items)} items")

# get the item counts so that most frequent item gets lowest id
item_counts = [(k, prod_dict[k]) for k in prod_dict if k in filtered_items]
item_counts = sorted(item_counts, key=lambda x: x[1], reverse=True)
print(item_counts[:5])

# create the item-dict, starting from 1
item_dict = {k[0]: ii+1 for ii, k in enumerate(item_counts)}
item_ids = list(item_dict.values())
print(f"Item-ids range from {min(item_ids)}, ... {max(item_ids)}")

21712 items in filtered sequences
Original 21712 items, remaining 21712 items
[('0909370001', 1074), ('0751471001', 932), ('0915526001', 866), ('0751471043', 857), ('0915529003', 844)]
Item-ids range from 1, ... 21712


In [13]:
write_train_file(data_dir=data_dir, 
                 file_name="hnm_3w_sessionized_orig.txt",
                 customer_list=filtered_customers, 
                 transactions=transactions, 
                 item_dict=item_dict, 
                 prod_dict=df_prod,
                 write_session_info=True,
                 write_product_meta=True,
                 original_product_name=True)

100%|██████████| 48709/48709 [01:40<00:00, 483.11it/s]

Written 364695 lines in /recsys_data/RecSys/h_and_m_personalized_fashion_recommendation/hnm_3w_sessionized_orig.txt, 48709 users and 21712 items
Sequence length, min: 2, averag: 7.49 and max: 147





In [7]:
transactions_3w = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-08-31')].copy()  # last 3 weeks
transactions_2w = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-09-07')].copy()  # last 2 weeks
transactions_1w = df_tr[df_tr['t_dat'] >= pd.to_datetime('2020-09-15')].copy()  # one week before

In [8]:
purchase_dict_3w = {}

for i,x in enumerate(zip(transactions_3w['customer_id'], transactions_3w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_3w:
        purchase_dict_3w[cust_id] = {}
    
    if art_id not in purchase_dict_3w[cust_id]:
        purchase_dict_3w[cust_id][art_id] = 0
    
    purchase_dict_3w[cust_id][art_id] += 1
    
print(len(purchase_dict_3w))

dummy_list_3w = list((transactions_3w['article_id'].value_counts()).index)[:12]

196319


In [9]:
purchase_dict_2w = {}

for i,x in enumerate(zip(transactions_2w['customer_id'], transactions_2w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_2w:
        purchase_dict_2w[cust_id] = {}
    
    if art_id not in purchase_dict_2w[cust_id]:
        purchase_dict_2w[cust_id][art_id] = 0
    
    purchase_dict_2w[cust_id][art_id] += 1
    
print(len(purchase_dict_2w))

dummy_list_2w = list((transactions_2w['article_id'].value_counts()).index)[:12]

143455


In [10]:
purchase_dict_1w = {}

for i,x in enumerate(zip(transactions_1w['customer_id'], transactions_1w['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict_1w:
        purchase_dict_1w[cust_id] = {}
    
    if art_id not in purchase_dict_1w[cust_id]:
        purchase_dict_1w[cust_id][art_id] = 0
    
    purchase_dict_1w[cust_id][art_id] += 1
    
print(len(purchase_dict_1w))

dummy_list_1w = list((transactions_1w['article_id'].value_counts()).index)[:12]

75481


Create examples at basket levels?

In [61]:
inv_item_dict = {v: k for k, v in item_dict.items()}  # for smaller set of products

In [54]:
res_file = os.path.join(data_dir, 'seq_test_pred.txt')

In [99]:
seq2seq = submission[['customer_id']]
prediction_list = []

dummy_list = list((transactions_1w['article_id'].value_counts()).index)[:12]
dummy_pred = ' '.join(dummy_list)
count_ids = 0

res = []
res_dict = {}
with open(res_file, 'r') as fr:
    for ii, line in tqdm(enumerate(fr)):
        cid = filtered_customers[ii]
        pred = line.strip().split()
        pred = [inv_item_dict[int(p)] for p in pred if p not in ['<start>']]
        pred = pred[:12]
        if len(pred) < 12:
            if cust_id in purchase_dict_1w:
                s = ' '.join(pred + dummy_list_1w[:(12-len(pred))])
            elif cust_id in purchase_dict_2w:
                s = ' '.join(pred + dummy_list_2w[:(12-len(pred))])
            elif cust_id in purchase_dict_3w:
                s = ' '.join(pred + dummy_list_3w[:(12-len(pred))])
        else:
            s = " ".join(pred)
        res.append(s)
        res_dict[cid] = s

48709it [00:00, 172331.63it/s]


In [79]:
submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
submission['prediction'] = dummy_pred
submission.head()
# len(submission['customer_id'].unique())

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0924243002 0923758001 0918522001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0923758001 0918522001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0924243002 0923758001 0918522001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0923758001 0918522001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0923758001 0918522001 09...


In [89]:
smaller = pd.DataFrame({'customer_id': filtered_customers, 'prediction': res})
smaller.head()

Unnamed: 0,customer_id,prediction
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0923037002 0923037002 0923037002 0923037003 09...
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,0923037002 0923037002 0923037002 0923037003 09...
2,00040239317e877c77ac6e79df42eb2633ad38fcac09fc...,0923037002 0915459002 0894703001 0909093003 09...
3,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0573085028 0573085028 0573085028 0573085028 05...
4,0006bfcec82c6c132276c0a5549ae13645c9ab77b0243f...,0877769001 0586928001 0927751001 0927751001 08...


In [95]:
seq2seq = pd.concat([submission[~submission['customer_id'].isin(smaller['customer_id'])], smaller])

In [101]:
seq2seq.to_csv(os.path.join(data_dir, "submission_7.txt"), header=True, index=False)

In [102]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f /recsys_data/RecSys/h_and_m_personalized_fashion_recommendation/submission_7.txt -m "attempt-14"

100%|████████████████████████████████████████| 258M/258M [00:03<00:00, 73.1MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations

Total 1,371,980 customers
    - 196,319 present in the last 3 weeks
    - out of that only 48,709 have more than one purchase

In [100]:
seq2seq = submission[['customer_id']]
prediction_list = []

dummy_list = list((transactions_1w['article_id'].value_counts()).index)[:12]
dummy_pred = ' '.join(dummy_list)
count_ids = 0

for i, cust_id in tqdm(enumerate(submission['customer_id'].values.reshape((-1,)))):
    if cust_id in filtered_customers:
        s = res_dict[cust_id]  # from seq2seq model
    else:
        if cust_id in purchase_dict_1w:
            l = sorted((purchase_dict_1w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s = ' '.join(l[:12])
            else:
                s = ' '.join(l+dummy_list_1w[:(12-len(l))])
        elif cust_id in purchase_dict_2w:
            l = sorted((purchase_dict_2w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s = ' '.join(l[:12])
            else:
                s = ' '.join(l+dummy_list_2w[:(12-len(l))])
        elif cust_id in purchase_dict_3w:
            l = sorted((purchase_dict_3w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s = ' '.join(l[:12])
            else:
                s = ' '.join(l+dummy_list_3w[:(12-len(l))])
        else:
            s = dummy_pred
            count_ids += 1
    prediction_list.append(s)

seq2seq['prediction'] = prediction_list
print(seq2seq.shape)
print(count_ids, "customers not in the last 3 weeks")
seq2seq.head()

1371980it [22:42, 1007.18it/s]


(1371980, 2)
1175661 customers not in the last 3 weeks


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0751471001 0909370001 0915526001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0923758001 0918522001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0923758001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0923758001 0918522001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0923758001 0918522001 09...


In [17]:
1371980 - 1175661

196319