# Preparation

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
base_path_train = "~/shared/data/project/training"


items_df = pd.read_csv(os.path.join(base_path_train, "item_features.csv"))
purchase_df = pd.read_csv(os.path.join(base_path_train, "train_purchases.csv"))
session_df = pd.read_csv(os.path.join(base_path_train, "train_sessions.csv"))

In [3]:
items_df

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [4]:
items_df.item_id.nunique()

23691

In [5]:
purchase_df

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114
...,...,...,...
999995,4439986,2915,2021-05-13 11:56:37.464
999996,4439990,8786,2020-08-22 14:28:22.382
999997,4439994,21630,2020-11-27 20:10:28.961
999998,4439999,16962,2020-11-27 11:01:41.356


In [6]:
session_df

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
4743815,4440001,20409,2020-10-30 23:37:20.658
4743816,4440001,14155,2020-10-30 23:31:56.607
4743817,4440001,14303,2020-10-30 23:36:17.934
4743818,4440001,27852,2020-10-30 23:39:55.186


In [7]:
purchase_df_processed = purchase_df.copy()
purchase_df_processed["was_bought"] = True

session_df_processed = session_df.copy()
session_df_processed["was_bought"] = False
df_processed = pd.concat([purchase_df_processed, session_df_processed]).sort_values(["session_id", "date"])
df_processed

Unnamed: 0,session_id,item_id,date,was_bought
1,3,9655,2020-12-18 21:19:48.093,False
0,3,9655,2020-12-18 21:25:00.373,False
0,3,15085,2020-12-18 21:26:47.986,True
2,13,15654,2020-03-13 19:35:27.136,False
1,13,18626,2020-03-13 19:36:15.507,True
...,...,...,...,...
4743804,4440001,19539,2020-10-30 23:37:09.46,False
4743815,4440001,20409,2020-10-30 23:37:20.658,False
4743818,4440001,27852,2020-10-30 23:39:55.186,False
4743806,4440001,20449,2020-10-30 23:40:28.149,False


In [8]:
items_processed_df = items_df.pivot_table(values='feature_value_id', index='item_id', columns='feature_category_id').reset_index()
items_processed_df.index.names = ['index']
items_processed_df.columns = ["item_id"] + [f"item_feature_{x}" for x in list(range(73))]
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,,,,,,,394.0,,,...,,,,,351.0,885.0,,,75.0,
1,3,,,889.0,618.0,605.0,,452.0,,,...,,521.0,,,14.0,592.0,,,75.0,544.0
2,4,,,793.0,618.0,605.0,,837.0,,,...,,521.0,,,373.0,538.0,,,75.0,544.0
3,7,,,,,,,536.0,,,...,,,,,739.0,592.0,,,75.0,
4,8,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,351.0,592.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,106.0,805.0,,,75.0,544.0
23687,28140,,53.0,,,,,,,,...,80.0,,,349.0,351.0,,,226.0,,544.0
23688,28141,461.0,,889.0,719.0,605.0,,2.0,,,...,,,,,379.0,499.0,,,75.0,544.0
23689,28142,,,,,,,619.0,,,...,,610.0,,,895.0,740.0,,,75.0,91.0


In [9]:
df_processed = df_processed.merge(items_processed_df, how="left", on="item_id")
df_processed["was_bought"] = df_processed["was_bought"].astype(float)
df_processed

Unnamed: 0,session_id,item_id,date,was_bought,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
0,3,9655,2020-12-18 21:19:48.093,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
1,3,9655,2020-12-18 21:25:00.373,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
2,3,15085,2020-12-18 21:26:47.986,1.0,,53.0,,,,,...,,,,349.0,97.0,,,,,544.0
3,13,15654,2020-03-13 19:35:27.136,0.0,,,,618.0,,766.0,...,,521.0,,,351.0,780.0,,,219.0,
4,13,18626,2020-03-13 19:36:15.507,1.0,,,793.0,618.0,605.0,,...,,,,,739.0,805.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5743815,4440001,19539,2020-10-30 23:37:09.46,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743816,4440001,20409,2020-10-30 23:37:20.658,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743817,4440001,27852,2020-10-30 23:39:55.186,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0
5743818,4440001,20449,2020-10-30 23:40:28.149,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0


In [10]:
items_processed_df = items_processed_df.fillna(0)
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,0.0,0.0,0.0,0.0,0.0,0.0,394.0,0.0,0.0,...,0.0,0.0,0.0,0.0,351.0,885.0,0.0,0.0,75.0,0.0
1,3,0.0,0.0,889.0,618.0,605.0,0.0,452.0,0.0,0.0,...,0.0,521.0,0.0,0.0,14.0,592.0,0.0,0.0,75.0,544.0
2,4,0.0,0.0,793.0,618.0,605.0,0.0,837.0,0.0,0.0,...,0.0,521.0,0.0,0.0,373.0,538.0,0.0,0.0,75.0,544.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,536.0,0.0,0.0,...,0.0,0.0,0.0,0.0,739.0,592.0,0.0,0.0,75.0,0.0
4,8,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,351.0,592.0,0.0,0.0,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,106.0,805.0,0.0,0.0,75.0,544.0
23687,28140,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,0.0,349.0,351.0,0.0,0.0,226.0,0.0,544.0
23688,28141,461.0,0.0,889.0,719.0,605.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,379.0,499.0,0.0,0.0,75.0,544.0
23689,28142,0.0,0.0,0.0,0.0,0.0,0.0,619.0,0.0,0.0,...,0.0,610.0,0.0,0.0,895.0,740.0,0.0,0.0,75.0,91.0


In [11]:
item_id2index = dict(zip(items_processed_df.item_id, items_processed_df.index))

In [12]:
all_items = list(items_processed_df["item_id"])

In [13]:
items_processed_array = np.array(items_processed_df.drop("item_id",axis=1))
items_processed_array[item_id2index[2]]

array([  0.,   0.,   0.,   0.,   0.,   0., 394.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,  38.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0., 123.,   0.,   0.,   0., 802.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0., 123.,   0.,   0.,  76.,   0.,   0.,   6.,   0.,   0.,
       365.,   0.,   0.,   0.,   0., 462., 801.,   0.,   0.,   0.,   0.,
         0., 351., 885.,   0.,   0.,  75.,   0.])

In [14]:
candidate_items = list(pd.read_csv("candidate_items.csv")["item_id"])
candidate_items[:10]

[4, 8, 9, 19, 20, 26, 33, 40, 51, 54]

In [15]:
import pandas as pd
import os
import numpy as np

In [16]:
base_path_train = "~/shared/data/project/test"


test_df = pd.read_csv(os.path.join(base_path_train, "test_sessions.csv"))

In [17]:
test_df["rating"]=0.5
test_df = test_df.drop("date", axis=1)

In [18]:
test_sessions = test_df.session_id.unique()
len(test_sessions)

50000

In [19]:
df_processed.loc[df_processed.was_bought==True, "rating"] = 1
df_processed.loc[df_processed.was_bought==False, "rating"] = 0.5
df_processed = pd.concat([df_processed, test_df])

In [20]:
ratings_raw = df_processed[["session_id", "item_id", "rating"]]
ratings_raw

Unnamed: 0,session_id,item_id,rating
0,3,9655,0.5
1,3,9655,0.5
2,3,15085,1.0
3,13,15654,0.5
4,13,18626,1.0
...,...,...,...
197619,186456690,10471,0.5
197620,186456690,13385,0.5
197621,186456690,10471,0.5
197622,186456690,5382,0.5


In [21]:
itemIds = ratings_raw.item_id.unique()
itemIds.sort()
sessionIds = ratings_raw.session_id.unique()
sessionIds.sort()

m = sessionIds.size
n = itemIds.size
numRatings = len(ratings_raw)

print ("There are", m, "sessions,", n, "items and", numRatings, "ratings.")


## create internal ids for movies and users, that have consecutive indexes starting from 0
itemId_to_itemIDX = dict(zip(itemIds, range(0, itemIds.size)))
itemIDX_to_itemId = dict(zip(range(0, itemIds.size), itemIds))

sessionId_to_sessionIDX = dict(zip(sessionIds, range(0, sessionIds.size )))
sessionIDX_to_sessionId = dict(zip(range(0, sessionIds.size), sessionIds))

## drop timestamps
ratings = pd.concat([ratings_raw['session_id'].map(sessionId_to_sessionIDX), ratings_raw['item_id'].map(itemId_to_itemIDX), ratings_raw['rating']], axis=1)
ratings.columns = ['session', 'item', 'rating']

display(ratings.head())

There are 1049714 sessions, 23618 items and 5941444 ratings.


Unnamed: 0,session,item,rating
0,0,8092,0.5
1,0,8092,0.5
2,0,12661,1.0
3,1,13153,0.5
4,1,15667,1.0


In [22]:
test_sessions_idx = [sessionId_to_sessionIDX[s] for s in test_sessions]
candidate_items_idx = [itemId_to_itemIDX[i] for i in candidate_items if i in itemId_to_itemIDX.keys()]

In [23]:
import csv
import pandas as pd
import numpy as np
from scipy import sparse as sp
from scipy.sparse.linalg import norm
import sklearn.preprocessing as pp

In [24]:
R = sp.csr_matrix((ratings.rating, (ratings.session, ratings.item)))
R_dok = R.todok()

m = R.shape[0]
n = R.shape[1]
numRatings = R.count_nonzero()

print("There are", m, "sessions,", n, "items and", numRatings, "ratings.")

There are 1049714 sessions, 23618 items and 5284342 ratings.


In [25]:
item_sums = R.sum(axis=0).A1 ## matrix converted to 1-D array via .A1
item_cnts = (R != 0).sum(axis=0).A1
item_avgs = item_sums / item_cnts
print("item_avgs", item_avgs)

item_avgs [0.5        0.57803468 0.6381733  ... 0.64480874 0.5        0.61428571]


In [26]:
user_sums = R.sum(axis=1).A1 ## matrix converted to 1-D array via .A1
user_cnts = (R != 0).sum(axis=1).A1
user_avgs = user_sums / user_cnts
print("user_avgs", user_avgs)

user_avgs [1.    0.75  0.625 ... 0.5   0.9   0.5  ]


In [27]:
def compute_pairwise_user_similarity(u_id, v_id):
    u = R[u_id,:].copy()
    v = R[v_id,:].copy()
    
    # YOUR CODE HERE
    u.data = (u.data - np.mean(u.data))
    v.data = (v.data - np.mean(v.data))
    
    numerator = u.dot(v.T).A.item()
    denominator = norm(u) * norm(v)
    
    if denominator == 0:
        similarity = 0.;
    else:
        similarity = numerator/denominator
    
    return similarity

In [28]:
display(compute_pairwise_user_similarity(2, 6))

0.0

In [29]:
def compute_user_similarities(u_id):
    uU = np.empty((m,))

    # YOUR CODE HERE
    Rc = R.copy()
    
    # mean-centering
    mc = np.repeat(user_avgs, user_cnts)
    Rc.data -= mc
    
    # normalizing
    norms = np.array(np.sqrt(Rc.multiply(Rc).sum(axis=1)))
    norms[norms == 0.0] = 0.00001 # avoid dividing by 0
    norms = norms.reshape((m,1))
    
    nc = np.repeat(norms, user_cnts)
    nc = nc.reshape(mc.shape)

    Rc.data /= nc
    
    # extracting user similarities
    u = Rc[u_id,:]
    uU = np.array(Rc.dot(u.T).todense())
    uU = uU.reshape((m,))
    
    return uU

In [30]:
uU = compute_user_similarities(2)
display(uU[6])

0.0

In [31]:
## default values
k = 5
with_abs_sim = False

def create_user_neighborhood(u_id, i_id):
    nh = {} ## the neighborhood dict with (user id: similarity) entries
    ## nh should not contain u_id and only include users that have rated i_id; there should be at most k neighbors
    uU = compute_user_similarities(u_id)
    uU_copy = uU.copy() ## so that we can modify it, but also keep the original
    
    # YOUR CODE HERE
    uU_copy = np.nan_to_num(uU_copy)
    if with_abs_sim:
        uU_copy = np.absolute(uU_copy)

    # check which users have rated i_id
    have_rated_idx = np.array([v_id for v_id in range(m) if (v_id, i_id) in R_dok])
    
    # create new uU and fill it with -infinity and then fill it with uUnew for users having rated i_id
    uU_new = np.full(uU_copy.shape, -np.inf)
    uU_new[have_rated_idx] = uU_copy[have_rated_idx]
    # user u_id should not occur in the neighborhood
    uU_new[u_id] = -np.inf
    
    # sort in decreasing order and retrieve top k, which are greater than -infinity
    v_ids = np.argsort(uU_new)[::-1]
    v_ids = v_ids[:k]
    v_ids = v_ids[uU[v_ids] > -np.inf]
    
    # generating the neighborhood dictionary
    nh = {v_id : uU[v_id] for v_id in v_ids}
    
    return nh

In [32]:
k = 5
with_abs_sim = False
nh = create_user_neighborhood(0, 8)
print("with_abs_sim", with_abs_sim)
display(nh)
with_abs_sim = True
nh = create_user_neighborhood(0, 8)
print("with_abs_sim", with_abs_sim)
display(nh)

with_abs_sim False


{486674: 0.0, 594945: 0.0, 522336: 0.0, 306756: 0.0, 173635: 0.0}

with_abs_sim True


{486674: 0.0, 594945: 0.0, 522336: 0.0, 306756: 0.0, 173635: 0.0}

In [33]:
## a default value
with_deviations = True

def predict_rating(u_id, i_id):
    
    if (u_id, i_id) in R_dok:
        pass#print("user", u_id, "has rated item", i_id, "with", R[u_id, i_id])
    else:
        pass#print("user", u_id, "has not rated item", i_id)
    pass#print("k:", k, "with_deviations:", with_deviations, "with_abs_sim:", with_abs_sim)
    
    
    nh = create_user_neighborhood(u_id, i_id)
    
    neighborhood_weighted_avg = 0.

    # YOUR CODE HERE
    Rc = R.copy()
    
    if with_deviations:
        # mean-centering if with deviations
        mc = np.repeat(user_avgs, user_cnts)
        Rc.data -= mc
    
    # generating sparse matrix with w_uv values from user neighborhood
    row = np.array(list(nh.keys()))
    col = np.zeros(shape=(len(nh),))
    data = np.array(list(nh.values()))
    nh_vec = sp.csr_matrix((data, (row, col)), shape=(m,1)) 
    
    # calculating nominator and denominator
    nominator = nh_vec.T.dot(Rc[:,i_id]).A[0,0]
    denominator = np.sum(np.abs(data))
    
    neighborhood_weighted_avg = nominator / denominator
    
    if with_deviations:
        prediction = user_avgs[u_id] + neighborhood_weighted_avg
        #print(f'prediction {prediction:.4f} (user_avg {user_avgs[u_id]:.4f} offset {neighborhood_weighted_avg:.4f})')
    else:
        prediction = neighborhood_weighted_avg
        #print(f'prediction {prediction:.4f} (user_avg {user_avgs[u_id]:.4f})')
        
    return prediction

In [34]:
k = 50
with_abs_sim = True
with_deviations = False
predict_rating(100, 8)
with_deviations = True
predict_rating(3234, 8)

  neighborhood_weighted_avg = nominator / denominator


nan

In [35]:
predict_rating(100, 103)

  neighborhood_weighted_avg = nominator / denominator


nan

In [37]:
for x in range(5):
    for x in range(4):
        print(predict_rating(x, 10))

  neighborhood_weighted_avg = nominator / denominator


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


## Prediction

In [36]:
def predict_session(s):
    candidate_rank_dict = {}
    for i in candidate_items_idx:
        print(i)
        try:
            p = predict_rating(s, i)
        except:
            p = np.nan
            print("NAN exception")
        try:
            candidate_item = itemIDX_to_itemId[i]
            candidate_rank_dict[candidate_item] = p
        except:
            pass
    candidate_rank_df = pd.DataFrame(candidate_rank_dict.items(), columns = ["item_id", "score"])
    candidate_rank_df = candidate_rank_df.sort_values("score", ascending=False).head(100).reset_index(drop=True)
    candidate_rank_df["rank"] = candidate_rank_df.index + 1
    session_id = sessionIDX_to_sessionId[s]
    candidate_rank_df["session_id"] = session_id
    #candidate_rank_df = candidate_rank_df.drop("score", axis=1)[["session_id", "item_id", "rank"]]
    return candidate_rank_df

In [37]:
s = test_sessions_idx[0]
candidate_rank_df = predict_session(s)
candidate_rank_df

2


  neighborhood_weighted_avg = nominator / denominator


4
5
13
14
18
24
29
38
41
50
52
54
55
63
72
74
83
85
89
103
122
130
131
138
146
148
156
157
162
165
167
168
181
184
185
190
197
206
207
208
212
216
222
228
235
242
249
255
256
261
271
278
280
283
285
289
294
298
299
305
307
308
309
317
325
326
342
344
346
348
351
360
362
364
366
368
370
374
375
378
388
397
398
400
403
406
412
413
417
425
427
430
434
440
450
451
456
457
464
471
473
474
475
478
487
488
489
490
495
509
516
517
521
526
532
533
535
536
537
539
548
551
552
553
555
562
563
564
567
574
579
581
594
595
600
602
603
608
612
615
616
619
626
632
635
638
640
652
658
660
665
668
670
672
679
683
688
697
717
722
726
728
731
733
734
738
739
742
744
746
747
749
752
754
755
756
765
774
776
783
786
787
788
791
793
806
807
809
817
822
828
834
839
841
842
846
848
852
854
856
863
867
868
873
874
876
881
885
891
898
907
909
915
916
918
919
920
921
924
927
928
930
935
936
946
948
949
951
969
973
976
983
986
998
1005
1010
1011
1012
1013
1015
1016
1024
1029
1033
1041
1044
1049
1058
1061
1070
1091


7957
7962
7963
7969
7974
7981
7982
7985
7986
7993
7996
8011
8016
8018
8020
8026
8040
8047
8054
8065
8067
8073
8074
8075
8076
8078
8080
8083
8088
8095
8098
8099
8111
8112
8115
8120
8122
8126
8137
8139
8148
8155
8156
8159
8163
8166
8176
8181
8192
8194
8209
8210
8212
8217
8233
8234
8239
8244
8245
8254
8258
8260
8264
8268
8278
8280
8291
8292
8295
8306
8310
8313
8314
8316
8320
8323
8332
8337
8349
8352
8363
8367
8374
8380
8381
8386
8394
8398
8402
8406
8409
8418
8427
8429
8445
8459
8460
8473
8487
8489
8494
8495
8500
8502
8509
8519
8528
8530
8531
8532
8535
8538
8550
8555
8561
8566
8570
8572
8575
8578
8585
8589
8602
8606
8609
8617
8624
8625
8626
8629
8630
8633
8640
8650
8653
8655
8657
8662
8663
8669
8675
8677
8678
8682
8683
8689
8708
8711
8713
8720
8722
8726
8739
8747
8748
8753
8758
8765
8767
8768
8769
8780
8782
8787
8788
8791
8792
8793
8795
8796
8803
8808
8810
8811
8812
8813
8818
8820
8829
8838
8840
8842
8843
8844
8845
8851
8852
8853
8854
8856
8860
8864
8869
8884
8887
8896
8915
8916
8917
8919


14996
15020
15022
15037
15038
15043
15058
15063
15066
15069
15072
15074
15075
15077
15082
15086
15090
15091
15094
15095
15096
15099
15108
15112
15118
15121
15124
15127
15140
15161
15164
15172
15173
15175
15179
15188
15192
15195
15208
15210
15217
15219
15220
15225
15226
15228
15231
15234
15240
15242
15243
15249
15254
15263
15267
15270
15271
15274
15289
15292
15294
15296
15297
15312
15313
15320
15323
15324
15327
15330
15332
15333
15342
15343
15349
15356
15358
15379
15388
15395
15400
15413
15415
15418
15422
15423
15424
15431
15453
15460
15474
15476
15490
15492
15495
15496
15498
15501
15509
15510
15514
15522
15525
15536
15552
15554
15559
15560
15563
15567
15569
15571
15575
15581
15598
15600
15611
15613
15616
15625
15627
15633
15636
15644
15647
15651
15665
15667
15671
15675
15681
15683
15686
15688
15693
15700
15701
15703
15707
15711
15714
15726
15733
15735
15742
15744
15746
15747
15755
15757
15761
15764
15769
15784
15786
15787
15797
15799
15800
15805
15809
15812
15820
15824
15829
15833
1583

21689
21690
21694
21696
21713
21719
21737
21739
21741
21753
21755
21756
21764
21773
21774
21778
21786
21789
21794
21795
21800
21802
21810
21811
21813
21820
21824
21827
21828
21847
21848
21857
21858
21874
21889
21894
21899
21904
21912
21919
21920
21923
21927
21928
21931
21933
21936
21940
21941
21951
21954
21959
21962
21969
21977
21982
21984
21985
21989
21993
21998
21999
22005
22006
22007
22010
22011
22013
22015
22019
22020
22029
22030
22032
22044
22045
22048
22053
22054
22056
22060
22066
22069
22072
22078
22081
22090
22092
22093
22095
22096
22098
22099
22112
22117
22123
22131
22132
22138
22157
22162
22165
22174
22175
22180
22185
22188
22189
22193
22196
22200
22201
22202
22208
22212
22218
22221
22223
22225
22227
22231
22246
22255
22262
22266
22267
22268
22269
22273
22280
22282
22289
22291
22295
22296
22301
22308
22314
22315
22323
22326
22332
22337
22341
22353
22360
22362
22371
22372
22374
22376
22378
22382
22385
22390
22392
22400
22401
22402
22403
22407
22417
22420
22440
22441
22454
2245

Unnamed: 0,item_id,score,rank,session_id
0,4,,1,126
1,8,,2,126
2,9,,3,126
3,19,,4,126
4,20,,5,126
...,...,...,...,...
95,550,,96,126
96,551,,97,126
97,557,,98,126
98,558,,99,126


In [None]:
s = test_sessions_idx[1]
candidate_rank_df = predict_session(s)
candidate_rank_df

2


  neighborhood_weighted_avg = nominator / denominator


4
5
13
14
18
24
29
38
41
50
52
54
55
63
72
74
83
85
89
103
122
130
131
138
NAN exception
146
148
156
157
162
165
167
168
181
184
185
190
197
206
207
208
212
216
222
228
NAN exception
235
NAN exception
242
NAN exception
249
NAN exception
255
NAN exception
256
261
271
278
280
283
285
289
294
298
299
305


In [None]:
if False:
    candidate_rank_dfs = []
    for s in test_sessions_idx:
        candidate_rank_df = predict_session(s)
        candidate_rank_dfs.append(candidate_rank_df)

In [None]:
max(candidate_items_idx)