In [1]:
%pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

import scipy

# Intended to fix the warning raised at implicit/cpu/als.py:95 by capping the
# BLAS thread pools at a single thread.
import threadpoolctl
threadpoolctl.threadpool_limits(1, "blas") 


# Register tqdm's pandas integration; the `progress_apply` call used on the
# groupby later in this notebook depends on it.
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
# Load the training events and index them by `event_id`.
train = (
    pd
    .read_csv('/kaggle/input/predicting-response/train.csv')
    # `verify_integrity` is a boolean flag: True makes pandas raise if any
    # event_id is duplicated. (The previous value 'unique' only worked because
    # a non-empty string is truthy.)
    .set_index('event_id', verify_integrity=True)
    # Drop the header-less column that pandas names 'Unnamed: 0'
    # (presumably a row index written when the CSV was saved).
    .drop(columns='Unnamed: 0')
)

display(train)

Unnamed: 0_level_0,session_id,recommendation_idx,timestamp,user_id,item_id,response
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,5709,1549316916000,0,10910,0
1,0,5709,1549316916000,0,25996,0
2,0,5709,1549316916000,0,13539,0
3,0,5709,1549316916000,0,28273,0
4,0,5709,1549316916000,0,11289,0
...,...,...,...,...,...,...
1414603,42152,112874,1555157492000,42152,11224,0
1414604,42152,112874,1555157492000,42152,6674,0
1414605,42152,112874,1555157492000,42152,28598,0
1414606,42152,112874,1555157492000,42152,4862,0


In [4]:
# Dense 0-based codes for every raw id seen in training, handed out in
# ascending raw-id order.
user_id_enc = {
    raw_uid: code
    for code, raw_uid in enumerate(np.sort(train['user_id'].unique()))
}
item_id_enc = {
    raw_iid: code
    for code, raw_iid in enumerate(np.sort(train['item_id'].unique()))
}

In [5]:
# Attach the dense codes next to the raw ids.
train = train.assign(
    user_id_encoded=train['user_id'].map(user_id_enc),
    item_id_encoded=train['item_id'].map(item_id_enc),
)
train

Unnamed: 0_level_0,session_id,recommendation_idx,timestamp,user_id,item_id,response,user_id_encoded,item_id_encoded
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,5709,1549316916000,0,10910,0,0,2431
1,0,5709,1549316916000,0,25996,0,0,5850
2,0,5709,1549316916000,0,13539,0,0,3033
3,0,5709,1549316916000,0,28273,0,0,6381
4,0,5709,1549316916000,0,11289,0,0,2523
...,...,...,...,...,...,...,...,...
1414603,42152,112874,1555157492000,42152,11224,0,11905,2507
1414604,42152,112874,1555157492000,42152,6674,0,11905,1466
1414605,42152,112874,1555157492000,42152,28598,0,11905,6460
1414606,42152,112874,1555157492000,42152,4862,0,11905,1071


In [6]:
# Total positive responses per (user, item) pair; pairs with no positive
# response are dropped.
clicks_cnt = (
    train
    # Select `response` before aggregating. The first positional argument of
    # GroupBy.sum is `numeric_only`, not a column name, so `.sum('response')`
    # summed every numeric column only to throw all but one away.
    .groupby(['user_id_encoded', 'item_id_encoded'])['response']
    .sum()
    .loc[lambda total: total > 0]
    .astype(float)
    .reset_index()
)
clicks_cnt

Unnamed: 0,user_id_encoded,item_id_encoded,response
0,0,655,1.0
1,0,1505,1.0
2,1,2915,1.0
3,1,2964,1.0
4,1,5256,1.0
...,...,...,...
85610,11905,1071,2.0
85611,11905,1466,1.0
85612,11905,4531,1.0
85613,11905,4736,1.0


In [7]:
# Users x items matrix of click counts: assembled from (value, (row, col))
# triplets in COO form, then converted to CSR. The shape is taken from the
# encoders so that every encoded id has a row / column.
user_item_sparse = scipy.sparse.coo_matrix(
    (
        clicks_cnt['response'],
        (clicks_cnt['user_id_encoded'], clicks_cnt['item_id_encoded']),
    ),
    shape=(len(user_id_enc), len(item_id_enc)),
).tocsr()
user_item_sparse

<11906x6539 sparse matrix of type '<class 'numpy.float64'>'
	with 85615 stored elements in Compressed Sparse Row format>

In [8]:
from implicit.als import AlternatingLeastSquares

# ALS model from `implicit`; all hyperparameters are hard-coded here and the
# random state is fixed.
model = AlternatingLeastSquares(
    factors=128,
    regularization=10,
    random_state=0,
    iterations=64,
)
# Same single-thread BLAS cap as in the imports cell, repeated before fitting.
threadpoolctl.threadpool_limits(1, "blas")

# Fit on the users x items click-count matrix built above.
model.fit(user_item_sparse)

  check_blas_config()


  0%|          | 0/64 [00:00<?, ?it/s]

In [9]:
def get_recs_for_slate(model, slate, user_item_sparse, rng=None):
    """Score every item of one recommendation slate for the slate's user.

    Parameters
    ----------
    model
        Fitted recommender exposing ``recommend(userid=, user_items=, items=,
        N=, filter_already_liked_items=)`` that returns ``(item_ids, scores)``.
    slate : pandas.DataFrame
        Rows of a single slate. Must contain the ``user_id_encoded`` and
        ``item_id_encoded`` columns; the user is read from the first row.
    user_item_sparse
        Sparse user x item matrix supporting ``getrow``.
    rng : optional
        Source of randomness for the fallback scores: any object with a
        ``normal(size=...)`` method (e.g. ``numpy.random.default_rng(seed)``).
        Defaults to NumPy's global ``np.random`` state.

    Returns
    -------
    pandas.DataFrame
        ``slate`` with an additional ``rank`` column holding each item's score.
        When the model cannot score the slate, the scores are independent
        standard-normal draws, one per item.
    """
    if rng is None:
        rng = np.random

    # Nothing to score; also avoids an IndexError from `.iloc[0]` below.
    if slate.empty:
        return slate.assign(rank=np.empty(0, dtype=float))

    slate_uid_enc = slate.user_id_encoded.iloc[0]
    try:
        rec_iid_enc, scores = model.recommend(
            userid=slate_uid_enc,
            user_items=user_item_sparse.getrow(slate_uid_enc),
            items=slate.item_id_encoded,
            N=len(slate),
            filter_already_liked_items=False,
        )
    except Exception:  # cold user or item; deliberately broad best-effort fallback
        rec_iid_enc = slate.item_id_encoded.to_numpy()
        # One random score per item. The previous `np.random.normal(n)` passed
        # the slate length as the distribution mean and returned one scalar,
        # so every item of a cold slate received the same score.
        scores = rng.normal(size=len(slate))

    res_df = (
        pd.DataFrame({
            'item_id_encoded': rec_iid_enc,
            'rank': scores,
        })
        # An item id repeated inside a slate would otherwise multiply rows in
        # the join below.
        .drop_duplicates('item_id_encoded')
        .set_index('item_id_encoded')
    )

    return slate.join(res_df, on='item_id_encoded')

In [10]:
# Load the events to be scored, indexed by `event_id`, with an empty
# `response` column added as a placeholder.
val = (
    pd
    .read_csv('/kaggle/input/predicting-response/test.csv')
    # `verify_integrity` is a boolean flag: True makes pandas raise if any
    # event_id is duplicated. (The previous value 'unique' only worked because
    # a non-empty string is truthy.)
    .set_index('event_id', verify_integrity=True)
    .assign(response=None)
)
val

Unnamed: 0_level_0,session_id,recommendation_idx,timestamp,user_id,item_id,response
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,149235,1549216052000,0,20638,
1,0,149235,1549216052000,0,26217,
2,0,149235,1549216052000,0,21102,
3,0,149235,1549216052000,0,14716,
4,0,149235,1549216052000,0,9522,
...,...,...,...,...,...,...
416555,42152,139262,1549649250000,42152,12436,
416556,42152,139262,1549649250000,42152,7569,
416557,42152,139262,1549649250000,42152,8369,
416558,42152,139262,1549649250000,42152,15983,


In [11]:
# Ids that occur only in `val` get fresh codes appended after the existing
# ones (users first, then items); every addition is reported.
for label, raw_ids, encoder in (
    ('new user_id:', val.user_id.unique(), user_id_enc),
    ('new item_id:', val.item_id.unique(), item_id_enc),
):
    for raw_id in raw_ids:
        if raw_id in encoder:
            continue
        encoder[raw_id] = len(encoder)
        print(label, raw_id, 'encoding as', encoder[raw_id])

# Attach the dense codes next to the raw ids.
val = val.assign(
    user_id_encoded=val.user_id.map(user_id_enc),
    item_id_encoded=val.item_id.map(item_id_enc),
)
val

new user_id: 529 encoding as 11906
new user_id: 571 encoding as 11907
new user_id: 608 encoding as 11908
new user_id: 693 encoding as 11909
new user_id: 1422 encoding as 11910
new user_id: 1739 encoding as 11911
new user_id: 4560 encoding as 11912
new user_id: 4878 encoding as 11913
new user_id: 5886 encoding as 11914
new user_id: 6850 encoding as 11915
new user_id: 6899 encoding as 11916
new user_id: 7530 encoding as 11917
new user_id: 7925 encoding as 11918
new user_id: 8001 encoding as 11919
new user_id: 8216 encoding as 11920
new user_id: 8567 encoding as 11921
new user_id: 8930 encoding as 11922
new user_id: 9309 encoding as 11923
new user_id: 9687 encoding as 11924
new user_id: 10373 encoding as 11925
new user_id: 11184 encoding as 11926
new user_id: 11225 encoding as 11927
new user_id: 11551 encoding as 11928
new user_id: 12298 encoding as 11929
new user_id: 12987 encoding as 11930
new user_id: 13933 encoding as 11931
new user_id: 15142 encoding as 11932
new user_id: 15235 encod

Unnamed: 0_level_0,session_id,recommendation_idx,timestamp,user_id,item_id,response,user_id_encoded,item_id_encoded
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,149235,1549216052000,0,20638,,0,4648
1,0,149235,1549216052000,0,26217,,0,5898
2,0,149235,1549216052000,0,21102,,0,4740
3,0,149235,1549216052000,0,14716,,0,3288
4,0,149235,1549216052000,0,9522,,0,2110
...,...,...,...,...,...,...,...,...
416555,42152,139262,1549649250000,42152,12436,,11905,2793
416556,42152,139262,1549649250000,42152,7569,,11905,1679
416557,42152,139262,1549649250000,42152,8369,,11905,1862
416558,42152,139262,1549649250000,42152,15983,,11905,3567


In [12]:
# Score the slates one `recommendation_idx` group at a time, with a progress bar.
pred = val.groupby('recommendation_idx').progress_apply(
    lambda slate: get_recs_for_slate(model, slate, user_item_sparse),
    include_groups=False,
)
pred

  0%|          | 0/33899 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,session_id,timestamp,user_id,item_id,response,user_id_encoded,item_id_encoded,rank
recommendation_idx,event_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
16620,105025,10499,1551563617000,10499,470,,2979,106,0.016531
16620,105026,10499,1551563617000,10499,18528,,2979,4161,0.002710
16620,105027,10499,1551563617000,10499,7494,,2979,1663,0.017831
16620,105028,10499,1551563617000,10499,26217,,2979,5898,0.073300
16620,105029,10499,1551563617000,10499,4281,,2979,946,0.014775
...,...,...,...,...,...,...,...,...,...
163699,371574,37637,1547450684000,37637,19462,,10640,4389,0.084925
163699,371575,37637,1547450684000,37637,25404,,10640,5717,-0.003750
163699,371576,37637,1547450684000,37637,28598,,10640,6460,0.783719
163699,371577,37637,1547450684000,37637,23099,,10640,5194,0.900549


In [13]:
# Keep only the score column, keyed and ordered by `event_id`, and write it out.
submission = pred.droplevel('recommendation_idx')[['rank']]
submission.sort_index().to_csv('submission.csv', index=True)

In [14]:
# Peek at the first lines of the submission file written above.
!head submission.csv

event_id,rank
0,0.05449310317635536
1,0.017722394317388535
2,0.017313756048679352
3,0.010619734413921833
4,0.052425384521484375
5,-0.0020661172457039356
6,0.016765877604484558
7,-0.005121493712067604
8,-0.0021899088751524687
