This is the file used for user-base recall debugging.

In [1]:
import pandas as pd
# read data into memory
transactions_train = pd.read_feather("./data/h-and-m-personalized-fashion-recommendations/transactions_train.feature")

First, we consider all the users. We use popularity factor to model the elapsed time transactions in.

In [3]:
#cut off the transactions in 16 weeks
near16 = transactions_train['t_dat'].max() - pd.Timedelta(weeks=16)
transactions_train = transactions_train[transactions_train['t_dat'] >= near16]
last_days = transactions_train['t_dat'].max()#find the latest purchase time
transactions_train["days_distance"] = (last_days - transactions_train["t_dat"]).dt.days#calculate every purchase's time distance to the latest

Calculate the popularity factor，in this block, we consider two ways
1.The popularity factor shows an inverse proportional attenuation
2.The popularity factor shows an exponential attenuation

In [4]:
import numpy as np
temperature = 3 # parameter of exponential decay
transactions_train["month"] = transactions_train["t_dat"].dt.month
transactions_train['pop_factor1'] = 1/(transactions_train["days_distance"]+1)
transactions_train['pop_factor2'] = np.exp(-(transactions_train["days_distance"]/temperature))
month_weights = [0.1,0.1,0.2,0.2,0.3,0.4,0.5,0.6,1,0.8,0.4,0.1] #weight of products bought in every month
transactions_train['pop_factor2']*=transactions_train["month"].apply(lambda x: month_weights[x-1])

In [5]:
transactions_train

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,days_distance,month,pop_factor1,pop_factor2
26679938,2020-06-02,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,0855834001,0.015831,1,112,6,0.00885,2.445678e-17
26679939,2020-06-02,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,0836130002,0.015831,1,112,6,0.00885,2.445678e-17
26679940,2020-06-02,0015f16aa2702e2ec13d2e38052f496b9b915d3c64e82c...,0832453006,0.016932,1,112,6,0.00885,2.445678e-17
26679941,2020-06-02,0015f16aa2702e2ec13d2e38052f496b9b915d3c64e82c...,0841260011,0.016932,1,112,6,0.00885,2.445678e-17
26679942,2020-06-02,001ef7c503e5407b6b836351b0415d3a226c587d4fb17b...,0822946002,0.026797,2,112,6,0.00885,2.445678e-17
...,...,...,...,...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,0929511001,0.059305,2,0,9,1.00000,1.000000e+00
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,0891322004,0.042356,2,0,9,1.00000,1.000000e+00
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,0918325001,0.043203,1,0,9,1.00000,1.000000e+00
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,0833459002,0.006763,1,0,9,1.00000,1.000000e+00


In [5]:
#now we can calculate the most popular 16 products for a;; the users
df_train_g = transactions_train.groupby("article_id").sum().reset_index() #sum weight for every product bought
df_train_sorted=df_train_g.sort_values(by="pop_factor1",ascending=False)
products1 = df_train_sorted["article_id"].to_numpy()[:12]
print(products1)
df_train_sorted=df_train_g.sort_values(by="pop_factor2",ascending=False)
products2 = df_train_sorted["article_id"].to_numpy()[:12]
print(products2)

['0751471001' '0448509014' '0918292001' '0924243001' '0918522001'
 '0706016001' '0866731001' '0915529003' '0714790020' '0924243002'
 '0850917001' '0751471043']
['0924243001' '0924243002' '0918522001' '0751471001' '0448509014'
 '0866731001' '0714790020' '0923758001' '0915529003' '0915529005'
 '0762846027' '0918292001']


In [12]:
#generate the submission file
from tqdm import tqdm
customers=pd.read_csv("./data/h-and-m-personalized-fashion-recommendations/customers.csv")
ALL_USERS=customers.customer_id
def submit():
    preds = []
    for user in tqdm(ALL_USERS):
            preds.append((user, ' '.join(products1.tolist()+products2.tolist())))
    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
    df_preds.to_feather("submissions_user_base.feather")
    return df_preds
df_preds = submit()
display(df_preds)

100%|██████████| 1371980/1371980 [00:01<00:00, 1074020.76it/s]


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0751471001 0448509014 0918292001 0924243001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0751471001 0448509014 0918292001 0924243001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0751471001 0448509014 0918292001 0924243001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471001 0448509014 0918292001 0924243001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0751471001 0448509014 0918292001 0924243001 09...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0751471001 0448509014 0918292001 0924243001 09...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0751471001 0448509014 0918292001 0924243001 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0751471001 0448509014 0918292001 0924243001 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0751471001 0448509014 0918292001 0924243001 09...


Now, we consider the popularity factor for every user.

In [None]:
#Because we consider every user, so we should group every customer
transactions_train_every=transactions_train.groupby(["customer_id","article_id"])["pop_factor1"].sum()
transactions_train_every=pd.DataFrame(transactions_train_every)
transactions_train_every.sort_values(by="pop_factor1",ascending=False,inplace=True)#sort by popularity factor
transactions_train_every.reset_index(inplace=True)
transactions_train_every["list"]=transactions_train_every.groupby("customer_id")["article_id"].transform(lambda x: " ".join([str(i) for i in x[:12]]))#convert the article_id list to string
transactions_train_every=transactions_train_every[["customer_id","list"]]