# Popularity Exponential Decay Baseline

Based on the work of https://www.kaggle.com/code/tarique7/hnm-exponential-decay-with-alternate-items

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import datetime


### Forming Train Set

In [2]:
data = pd.read_csv("./hmData/transactions_train.csv", 
                   dtype={'article_id':str}
                  )
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


We'll drop everything except the last few(up for experimentation) days. The info from previous months are not coming of much use. 
We'll keep 4 weeks as train and the last week as validation.

In [3]:
print("All Transactions Date Range: {} to {}".format(data['t_dat'].min(), data['t_dat'].max()))

data["t_dat"] = pd.to_datetime(data["t_dat"])
train1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,1)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,9,1))]
train4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,15)) & (data['t_dat'] < datetime.datetime(2020,8,23))]
train5 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,7)) & (data['t_dat'] < datetime.datetime(2020,8,15))]

val = data.loc[data["t_dat"] >= datetime.datetime(2020,9,16)]

All Transactions Date Range: 2018-09-20 to 2020-09-22


In [4]:
articles_df = pd.read_csv('./hmData/articles.csv', 
#                           dtype={'article_id': str}
                         )

### recommend last week most popular items as alternatives to current week

In [5]:
def get_alternate_most_popular(df_data, factor, return_orig=False):
    
    next_best_match = []
    
    df = df_data.copy()
    df['article_count'] = df.groupby('article_id')['customer_id'].transform('count')
    df['article_min_price'] = df.groupby('article_id')['price'].transform('min')
    count_df = df[['article_id', 'article_count', 'article_min_price']].drop_duplicates().reset_index(drop=True)
    
    del df
    
    for article in tqdm(count_df.article_id.tolist()):
        prodname = articles_df[articles_df.article_id==int(article)]['prod_name'].iloc[0]
        other_article_list = articles_df[articles_df.prod_name==prodname]['article_id'].tolist()
        other_article_list.remove(int(article))
        k = len(other_article_list)
        if k==1:
            next_best_match.append(other_article_list[0])
        if k>1:
            if len(count_df[np.in1d(count_df['article_id'], other_article_list)])!=0:
                next_best_match.append(count_df[np.in1d(count_df['article_id'], other_article_list)].sort_values('article_count', ascending=False)['article_id'].iloc[0])
            else:
                next_best_match.append(np.nan)
        if k==0:
            next_best_match.append(np.nan)

    count_df['next_best_article'] = next_best_match
    count_df['next_best_article'] = count_df['next_best_article'].fillna(0).astype(int)
    count_df['next_best_article'] = np.where(count_df['next_best_article']==0, count_df['article_id'], str(0)+count_df['next_best_article'].astype(str))

    right_df = count_df[['next_best_article']].copy().rename(columns={'next_best_article':'article_id'})

    next_best_count = []
    next_best_price = []
    for article in tqdm(right_df['article_id']):
        if len(count_df[count_df.article_id==article]['article_count'])>0:
            next_best_count.append(count_df[count_df.article_id==article]['article_count'].iloc[0])
            next_best_price.append(count_df[count_df.article_id==article]['article_min_price'].iloc[0])
        else:
            next_best_count.append(0)
            next_best_price.append(0)

    count_df['count_next_best'] = next_best_count
    count_df['next_best_min_price'] = next_best_price
        
    more_popular_alternatives = count_df[(count_df.article_min_price >= count_df.next_best_min_price) & 
                                         (count_df.count_next_best > factor *count_df.article_count)].copy().reset_index(drop=True)
    more_popular_alt_list = more_popular_alternatives.article_id.unique().tolist()
    
    if return_orig:
        return more_popular_alt_list, more_popular_alternatives, count_df
    else:
        return more_popular_alt_list, more_popular_alternatives

In [6]:
alt_list_1v, alt_df_1v = get_alternate_most_popular(train2, 2, return_orig=False)
alt_list_2v, alt_df_2v = get_alternate_most_popular(train3, 2, return_orig=False)
alt_list_3v, alt_df_3v = get_alternate_most_popular(train4, 2, return_orig=False)
alt_list_4v, alt_df_4v = get_alternate_most_popular(train5, 2, return_orig=False)

100%|██████████| 19573/19573 [03:17<00:00, 99.33it/s] 
100%|██████████| 19573/19573 [02:01<00:00, 161.25it/s]
100%|██████████| 21332/21332 [03:52<00:00, 91.81it/s] 
100%|██████████| 21332/21332 [02:35<00:00, 137.26it/s]
100%|██████████| 19954/19954 [03:32<00:00, 93.87it/s] 
100%|██████████| 19954/19954 [02:00<00:00, 165.63it/s]
100%|██████████| 20228/20228 [03:21<00:00, 100.45it/s]
100%|██████████| 20228/20228 [01:58<00:00, 170.04it/s]


In [7]:
alt_df_1v.shape, alt_df_2v.shape, alt_df_3v.shape, alt_df_4v.shape

((460, 6), (521, 6), (457, 6), (542, 6))

Items which an user has bought in our train set time.

In [8]:
# List of all purchases per user (has repetitions)
positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

Next we do exponential weighting based popularity for items. This leads to items bought more recently having more weight in the popularity list. In simple words, item A bought 5 times on the first day of the train period is inferior than item B bought 4 times on the last day of the train period.

In [9]:
train = pd.concat([train1, train2], axis=0)
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

train['pop_factor'].describe()

count    557958.000000
mean          0.200478
std           0.207752
min           0.066667
25%           0.083333
50%           0.125000
75%           0.200000
max           1.000000
Name: pop_factor, dtype: float64

### Moving on to Validation ...

In [10]:
def apk(actual, predicted, k=12):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

Items bought by users in the validation period. Similar as the one for train set.

In [11]:
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)

In [12]:
# creating validation set for metrics use case
val_users = positive_items_val.keys()
val_items = []

for i,user in tqdm(enumerate(val_users)):
    val_items.append(positive_items_val[user])
    
print("Total users in validation:", len(val_users))

68984it [00:00, 210132.82it/s]

Total users in validation: 68984





We'll now validate our algo on the validation set.

In [13]:
from collections import Counter
outputs = []
cnt = 0

popular_items = list(popular_items)

for user in tqdm(val_users):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_1v:
                al.append(alt_df_1v[alt_df_1v.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
        
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_2v:
                al.append(alt_df_2v[alt_df_2v.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
        
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_3v:
                al.append(alt_df_3v[alt_df_3v.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
        
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_4v:
                al.append(alt_df_4v[alt_df_4v.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
    
    user_output += list(popular_items[:12 - len(user_output)])    
    outputs.append(user_output)
    
print("mAP Score on Validation set:", mapk(val_items, outputs))

100%|██████████| 68984/68984 [00:08<00:00, 8297.70it/s]


mAP Score on Validation set: 0.024392270583646312


### Prediction on Test Set

In [14]:
train1 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,16)) & (data['t_dat'] < datetime.datetime(2020,9,23))]
train2 = data.loc[(data["t_dat"] >= datetime.datetime(2020,9,8)) & (data['t_dat'] < datetime.datetime(2020,9,16))]
train3 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,31)) & (data['t_dat'] < datetime.datetime(2020,9,8))]
train4 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,23)) & (data['t_dat'] < datetime.datetime(2020,8,31))]
train5 = data.loc[(data["t_dat"] >= datetime.datetime(2020,8,15)) & (data['t_dat'] < datetime.datetime(2020,8,23))]

In [15]:
alt_list_1, alt_df_1 = get_alternate_most_popular(train2, 2, return_orig=False)
alt_list_2, alt_df_2 = alt_list_1v, alt_df_1v
alt_list_3, alt_df_3 = alt_list_2v, alt_df_2v
alt_list_4, alt_df_4 = alt_list_3v, alt_df_3v

100%|██████████| 19333/19333 [03:10<00:00, 101.45it/s]
100%|██████████| 19333/19333 [01:48<00:00, 178.48it/s]


In [16]:
alt_df_1.shape, alt_df_2.shape, alt_df_3.shape, alt_df_4.shape

((481, 6), (460, 6), (521, 6), (457, 6))

In [17]:
positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

train = pd.concat([train1, train2], axis=0)
train['pop_factor'] = train['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,23) - x).days)
popular_items_group = train.groupby(['article_id'])['pop_factor'].sum()

_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

user_group = pd.concat([train1, train2, train3, train4], axis=0).groupby(['customer_id'])['article_id'].apply(list)

In [18]:
submission = pd.read_csv("./hmData/sample_submission.csv")
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [19]:
from collections import Counter
outputs = []
cnt = 0

for user in tqdm(submission['customer_id']):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_1:
                al.append(alt_df_1[alt_df_1.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
        
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_2:
                al.append(alt_df_2[alt_df_2.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
        
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_3:
                al.append(alt_df_3[alt_df_3.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al
        user_output += l[:12]
        
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        l = list(most_common_items_of_user.keys())
        al = []
        for j in range(0, len(l)):
            if l[j] in alt_list_4:
                al.append(alt_df_4[alt_df_4.article_id==l[j]]['next_best_article'].iloc[0])
        l = l + al        
        user_output += l[:12]
    
    user_output += list(popular_items[:12 - len(user_output)])
    outputs.append(user_output)
    
str_outputs = []
for output in outputs:
    str_outputs.append(" ".join([str(x) for x in output]))

100%|██████████| 1371980/1371980 [00:57<00:00, 24034.38it/s]


In [20]:
submission['prediction'] = str_outputs
submission.to_csv("submission.csv", index=False)

In [21]:
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0924243001 0924243002 0918522001 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0751471001 04...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0918522001 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0751471001 04...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0751471001 04...


Scored 0.0217 on Kaggle