In [16]:
from datetime import datetime
import numpy as np
import pandas as pd
import os

from pathlib import Path

data_dir = "/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation"
data_path = Path('/recsys_data/RecSys/h_and_m_personalized_fashion_recommendation/')
res_dir = "/recsys_data/RecSys/TransformerRec"

In [2]:
transactions = pd.read_csv(
    data_path / 'transactions_train.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': str} 
)
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
transactions['month'] = transactions['t_dat'].dt.month

submission = pd.read_csv(data_path / 'sample_submission.csv')

In [3]:
print(transactions.shape)
transactions.head()

(31788324, 6)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,month
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,9
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,9
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,9
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,9
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,9


In [4]:
transactions['t_dat'].min(), transactions['t_dat'].max()

(Timestamp('2018-09-20 00:00:00'), Timestamp('2020-09-22 00:00:00'))

In [5]:
transactions['month'].value_counts()

6     3670709
7     3158996
5     2922134
4     2817336
9     2620223
10    2543812
8     2490722
11    2468652
1     2339825
3     2334502
12    2267142
2     2154271
Name: month, dtype: int64

In [6]:
# Filtering on date..inspired from this notebook https://www.kaggle.com/hengzheng/time-is-our-best-friend
# Only picking summer months as test period is 1 week after 22nd september
# transactions = transactions.loc[transactions['t_dat']>=datetime(2020, 9, 7)]
transactions = transactions.loc[transactions['month'].isin([4,5,6,7,8,9,10])]
transactions.reset_index(drop=True, inplace=True)
print(transactions.shape)

(20223932, 6)


In [7]:
purchase_dict = {}

for i,x in enumerate(zip(transactions['customer_id'], transactions['article_id'])):
    cust_id, art_id = x
    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}
    
    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    purchase_dict[cust_id][art_id] += 1
    
print(len(purchase_dict))

1123233


In [8]:
print(submission.shape)
submission.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [9]:
not_so_fancy_but_fast_benchmark = submission[['customer_id']]
prediction_list = []
dummy_list = list((transactions['article_id'].value_counts()).index)[:12]
dummy_pred = ' '.join(dummy_list)

for i, cust_id in enumerate(submission['customer_id'].values.reshape((-1,))):
    if cust_id in purchase_dict:
        l = sorted((purchase_dict[cust_id]).items(), key=lambda x: x[1], reverse=True)
        l = [y[0] for y in l]
        if len(l)>12:
            s = ' '.join(l[:12])
        else:
            s = ' '.join(l+dummy_list[:(12-len(l))])
    else:
        s = dummy_pred
    prediction_list.append(s)

not_so_fancy_but_fast_benchmark['prediction'] = prediction_list
print(not_so_fancy_but_fast_benchmark.shape)
not_so_fancy_but_fast_benchmark.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0797065001 0697138006 0607642008 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0811835004 0723529001 0351484002 0689898002 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001 0750424014 0870304002 0541518023 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001 0706016001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0399061015 0634249005 0677049001 0589440005 08...


In [12]:
not_so_fancy_but_fast_benchmark.to_csv(os.path.join(res_dir, "submission_nsfbfb.txt"), header=True, index=False)

In [13]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f /recsys_data/RecSys/TransformerRec/submission_nsfbfb.txt -m "attempt-5"

100%|████████████████████████████████████████| 258M/258M [00:05<00:00, 47.6MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations

Score = 0.006

Modify previous submission with updated default list

In [18]:
sub_file = os.path.join(res_dir, "submission_4.txt")

df_ss = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))
df_tr = pd.read_csv(os.path.join(data_dir, "transactions_train.csv"))
ss_custs = set(df_ss['customer_id'].unique())

extra_custs = ss_custs - set(df_tr['customer_id'].unique())
len(extra_custs)

9699

In [27]:
df_sub = pd.read_csv(sub_file)

In [29]:
df_sub[df_sub["customer_id"].isin(extra_custs)]['prediction'] = dummy_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
df_sub.to_csv(os.path.join(res_dir, "submission_4m.txt"), header=True, index=False)

In [31]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f /recsys_data/RecSys/TransformerRec/submission_4m.txt -m "attempt-6"

100%|████████████████████████████████████████| 258M/258M [00:06<00:00, 44.8MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations

Score = 0.004 (unchanged)