# Preparation

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
base_path_train = "~/shared/data/project/training"


items_df = pd.read_csv(os.path.join(base_path_train, "item_features.csv"))
purchase_df = pd.read_csv(os.path.join(base_path_train, "train_purchases.csv"))
session_df = pd.read_csv(os.path.join(base_path_train, "train_sessions.csv"))

In [3]:
items_df

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [4]:
items_df.item_id.nunique()

23691

In [5]:
purchase_df

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114
...,...,...,...
999995,4439986,2915,2021-05-13 11:56:37.464
999996,4439990,8786,2020-08-22 14:28:22.382
999997,4439994,21630,2020-11-27 20:10:28.961
999998,4439999,16962,2020-11-27 11:01:41.356


In [6]:
session_df

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211
...,...,...,...
4743815,4440001,20409,2020-10-30 23:37:20.658
4743816,4440001,14155,2020-10-30 23:31:56.607
4743817,4440001,14303,2020-10-30 23:36:17.934
4743818,4440001,27852,2020-10-30 23:39:55.186


In [7]:
purchase_df_processed = purchase_df.copy()
purchase_df_processed["was_bought"] = True

session_df_processed = session_df.copy()
session_df_processed["was_bought"] = False
df_processed = pd.concat([purchase_df_processed, session_df_processed]).sort_values(["session_id", "date"])
df_processed

Unnamed: 0,session_id,item_id,date,was_bought
1,3,9655,2020-12-18 21:19:48.093,False
0,3,9655,2020-12-18 21:25:00.373,False
0,3,15085,2020-12-18 21:26:47.986,True
2,13,15654,2020-03-13 19:35:27.136,False
1,13,18626,2020-03-13 19:36:15.507,True
...,...,...,...,...
4743804,4440001,19539,2020-10-30 23:37:09.46,False
4743815,4440001,20409,2020-10-30 23:37:20.658,False
4743818,4440001,27852,2020-10-30 23:39:55.186,False
4743806,4440001,20449,2020-10-30 23:40:28.149,False


In [8]:
items_processed_df = items_df.pivot_table(values='feature_value_id', index='item_id', columns='feature_category_id').reset_index()
items_processed_df.index.names = ['index']
items_processed_df.columns = ["item_id"] + [f"item_feature_{x}" for x in list(range(73))]
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,,,,,,,394.0,,,...,,,,,351.0,885.0,,,75.0,
1,3,,,889.0,618.0,605.0,,452.0,,,...,,521.0,,,14.0,592.0,,,75.0,544.0
2,4,,,793.0,618.0,605.0,,837.0,,,...,,521.0,,,373.0,538.0,,,75.0,544.0
3,7,,,,,,,536.0,,,...,,,,,739.0,592.0,,,75.0,
4,8,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,351.0,592.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,,,793.0,618.0,605.0,,798.0,,,...,,521.0,,,106.0,805.0,,,75.0,544.0
23687,28140,,53.0,,,,,,,,...,80.0,,,349.0,351.0,,,226.0,,544.0
23688,28141,461.0,,889.0,719.0,605.0,,2.0,,,...,,,,,379.0,499.0,,,75.0,544.0
23689,28142,,,,,,,619.0,,,...,,610.0,,,895.0,740.0,,,75.0,91.0


In [9]:
df_processed = df_processed.merge(items_processed_df, how="left", on="item_id")
df_processed["was_bought"] = df_processed["was_bought"].astype(float)
df_processed

Unnamed: 0,session_id,item_id,date,was_bought,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
0,3,9655,2020-12-18 21:19:48.093,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
1,3,9655,2020-12-18 21:25:00.373,0.0,,53.0,,,,,...,,,,349.0,393.0,,,,,544.0
2,3,15085,2020-12-18 21:26:47.986,1.0,,53.0,,,,,...,,,,349.0,97.0,,,,,544.0
3,13,15654,2020-03-13 19:35:27.136,0.0,,,,618.0,,766.0,...,,521.0,,,351.0,780.0,,,219.0,
4,13,18626,2020-03-13 19:36:15.507,1.0,,,793.0,618.0,605.0,,...,,,,,739.0,805.0,,,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5743815,4440001,19539,2020-10-30 23:37:09.46,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743816,4440001,20409,2020-10-30 23:37:20.658,0.0,,,,618.0,,,...,,,,,351.0,885.0,,,75.0,544.0
5743817,4440001,27852,2020-10-30 23:39:55.186,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0
5743818,4440001,20449,2020-10-30 23:40:28.149,0.0,,,,618.0,,778.0,...,,550.0,,,351.0,362.0,,,75.0,544.0


In [10]:
items_processed_df = items_processed_df.fillna(0)
items_processed_df

Unnamed: 0_level_0,item_id,item_feature_0,item_feature_1,item_feature_2,item_feature_3,item_feature_4,item_feature_5,item_feature_6,item_feature_7,item_feature_8,...,item_feature_63,item_feature_64,item_feature_65,item_feature_66,item_feature_67,item_feature_68,item_feature_69,item_feature_70,item_feature_71,item_feature_72
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,0.0,0.0,0.0,0.0,0.0,0.0,394.0,0.0,0.0,...,0.0,0.0,0.0,0.0,351.0,885.0,0.0,0.0,75.0,0.0
1,3,0.0,0.0,889.0,618.0,605.0,0.0,452.0,0.0,0.0,...,0.0,521.0,0.0,0.0,14.0,592.0,0.0,0.0,75.0,544.0
2,4,0.0,0.0,793.0,618.0,605.0,0.0,837.0,0.0,0.0,...,0.0,521.0,0.0,0.0,373.0,538.0,0.0,0.0,75.0,544.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,536.0,0.0,0.0,...,0.0,0.0,0.0,0.0,739.0,592.0,0.0,0.0,75.0,0.0
4,8,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,351.0,592.0,0.0,0.0,75.0,544.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,0.0,0.0,793.0,618.0,605.0,0.0,798.0,0.0,0.0,...,0.0,521.0,0.0,0.0,106.0,805.0,0.0,0.0,75.0,544.0
23687,28140,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,0.0,349.0,351.0,0.0,0.0,226.0,0.0,544.0
23688,28141,461.0,0.0,889.0,719.0,605.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,379.0,499.0,0.0,0.0,75.0,544.0
23689,28142,0.0,0.0,0.0,0.0,0.0,0.0,619.0,0.0,0.0,...,0.0,610.0,0.0,0.0,895.0,740.0,0.0,0.0,75.0,91.0


In [11]:
from sklearn.preprocessing import OneHotEncoder

to_encode = items_processed_df.drop("item_id", axis=1)

encoded = OneHotEncoder().fit_transform(to_encode)
print(to_encode.shape)
print(encoded.shape)

(23691, 73)
(23691, 1399)


In [12]:
items_encoded = np.array(encoded.toarray())
items_encoded

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.]])

In [13]:
item_id2index = dict(zip(items_processed_df.item_id, items_processed_df.index))

In [14]:
all_items = list(items_processed_df["item_id"])

In [15]:
items_processed_array = np.array(items_processed_df.drop("item_id",axis=1))
items_processed_array[item_id2index[2]]

array([  0.,   0.,   0.,   0.,   0.,   0., 394.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,  38.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0., 123.,   0.,   0.,   0., 802.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0., 123.,   0.,   0.,  76.,   0.,   0.,   6.,   0.,   0.,
       365.,   0.,   0.,   0.,   0., 462., 801.,   0.,   0.,   0.,   0.,
         0., 351., 885.,   0.,   0.,  75.,   0.])

In [16]:
candidate_items = list(pd.read_csv("candidate_items.csv")["item_id"])
candidate_items[:10]

[4, 8, 9, 19, 20, 26, 33, 40, 51, 54]

# Word2vec

In [17]:
session_ids_cand = df_processed[(df_processed.was_bought==1) & (df_processed.item_id.isin(candidate_items))].session_id

In [18]:
#df_w2v = df_processed[df_processed.session_id < 10000].sort_values(["session_id", "date"])[["session_id", "item_id", "was_bought"]]
df_w2v = df_processed[(df_processed.session_id.isin(session_ids_cand)) & (df_processed.session_id < 50000)].sort_values(["session_id", "date"])[["session_id", "item_id", "was_bought"]]
df_w2v

Unnamed: 0,session_id,item_id,was_bought
3,13,15654,0.0
4,13,18626,1.0
42,31,25972,0.0
43,31,16289,0.0
44,31,2069,0.0
...,...,...,...
64212,49975,2072,0.0
64213,49975,2072,0.0
64214,49975,23565,1.0
64224,49993,19048,0.0


In [19]:
item_list = list(df_w2v.item_id.sort_values().unique())
item_dict = {v: k for k, v in dict(zip(range(len(item_list)), item_list)).items()}
item_dict[4]

1

In [20]:
item_dict_rev = {v: k for k, v in item_dict.items()}

In [21]:
def get_values(x):
    x = x["item_id"]
    return [item_dict[x] for x in list(x.values.ravel())]

all_words = df_w2v.groupby('session_id').apply(get_values).to_list()

def get_values(x):
    x = x["item_id"]
    return [item_dict[x] for x in list(x.values.ravel())]

prev_words = df_w2v[(df_w2v.was_bought==0)].groupby('session_id').apply(get_values).to_list()

def get_values(x):
    x = x["item_id"]
    return item_dict[x.values.ravel()[0]]

next_words = df_w2v[(df_w2v.was_bought==1)].groupby('session_id').apply(get_values).to_list()

In [22]:
prev_words[:5]

[[3943],
 [6554, 4100, 519, 519, 6688, 63, 6983, 1105],
 [5219, 5698, 2622, 4410],
 [1614, 5014, 540, 3968, 5381, 4338],
 [5002, 2634, 4018, 5002, 2634, 4187, 5002, 4187]]

In [23]:
next_words[:5]

[4714, 2148, 2993, 5360, 6556]

In [24]:
unique_words = [item_dict[x] for x in df_w2v["item_id"].unique()]
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))
len(unique_words)

7130

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
max_seq_length = max([len(x) for x in prev_words])
input_seqs = np.array(pad_sequences(prev_words, maxlen=max_seq_length, padding='pre'))

print(max_seq_length)
print(input_seqs[:5])

90
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 3943]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0 6554 4100
   519  519 6688   63 6983 1105]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0 

In [27]:
from gensim.models.word2vec import Word2Vec

In [28]:
w2v = Word2Vec(all_words, min_count=1)

In [29]:
#list the vocabulary words
words = list(w2v.wv.index_to_key)

print(words[:5])

[6799, 2078, 5052, 2054, 5274]


In [30]:
my_dict = dict({})
for idx, key in enumerate(w2v.wv.index_to_key):
    my_dict[key] = w2v.wv[key]
    # Or my_dict[key] = model.wv.get_vector(key)
    # Or my_dict[key] = model.wv.word_vec(key, use_norm=False)

In [31]:
embeddings_matrix = np.zeros((len(words), 1399))

for i, word in enumerate(words):
    embedding_vector = my_dict[word]
    item_id = item_dict_rev[word]
    idx = items_processed_df[items_processed_df.item_id == item_id].index[0]
    embedding_vector = items_encoded[idx]
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [32]:
embeddings_matrix

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [33]:
embeddings_matrix.shape

(7130, 1399)

In [34]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD

In [35]:
model = tf.keras.Sequential(
    [tf.keras.layers.Embedding(input_dim = len(words), output_dim=1399, weights=[embeddings_matrix], input_length=max_seq_length, trainable=False),
     tf.keras.layers.LSTM(256),
     tf.keras.layers.Dropout(0.2),
     tf.keras.layers.Dense(128, activation='relu'),
     tf.keras.layers.Dense(len(words) , activation='softmax')
    ])

opt = SGD(lr=10**(-6))
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

2022-06-16 18:34:47.468023: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-16 18:34:47.468075: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-16 18:34:47.468098: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-e01304563): /proc/driver/nvidia/version does not exist
2022-06-16 18:34:47.468381: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [36]:
input_seqs.shape

(5042, 90)

In [37]:
next_words = np.array(next_words)

In [38]:
next_words.shape

(5042,)

In [39]:
next_words

array([4714, 2148, 2993, ...,  821, 5948, 6799])

In [40]:
outputs = tf.keras.utils.to_categorical(next_words, num_classes=len(words))
outputs.shape

(5042, 7130)

In [41]:
tf.config.run_functions_eagerly(True)

In [42]:
history = model.fit(input_seqs, outputs, epochs=10, validation_split=0.2, verbose=1, batch_size=256)

2022-06-16 18:34:48.003610: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
df_w2v_test = df_processed[(df_processed.session_id > 50000) & (df_processed.session_id < 51000)].sort_values(["session_id", "date"])[["session_id", "item_id", "was_bought"]]
df_w2v_test

Unnamed: 0,session_id,item_id,was_bought
64229,50002,15425,0.0
64230,50002,18324,0.0
64231,50002,20933,0.0
64232,50002,12742,0.0
64233,50002,15738,0.0
...,...,...,...
65525,50983,12179,0.0
65526,50983,10126,0.0
65527,50983,23282,1.0
65528,50996,25990,0.0


In [44]:
def get_values(x):
    x = x["item_id"]
    return [item_dict[x] for x in list(x.values.ravel()) if x in item_dict.keys()]

prev_words_test = df_w2v_test[(df_w2v_test.was_bought==0)].groupby('session_id').apply(get_values).to_list()

In [45]:
prev_words_test

[[3868,
  4643,
  5324,
  3208,
  3968,
  4802,
  4690,
  4690,
  6668,
  6405,
  6668,
  3749,
  2072,
  3749,
  1201,
  5575],
 [6917],
 [916],
 [2873],
 [2248, 6015, 2248, 2248, 6405, 6180, 2801, 2982, 2982, 3504],
 [18, 6113],
 [],
 [345, 4895, 709],
 [2591],
 [1344],
 [3599, 3599, 3599],
 [3013, 5062, 448, 665, 1770],
 [1535, 257],
 [2712, 4422, 1886, 2989, 5147, 4682, 285],
 [3359],
 [5843,
  5688,
  2666,
  2649,
  186,
  2232,
  4096,
  5185,
  885,
  5503,
  2656,
  5770,
  3771,
  2212,
  6835,
  2895,
  6902,
  5915,
  2750,
  2666,
  1609,
  2212],
 [],
 [],
 [1113, 1711],
 [6183],
 [3985, 3985, 3985],
 [],
 [5392],
 [2627, 982, 4907, 1980, 2068, 153, 153],
 [],
 [3643],
 [3615],
 [],
 [3563, 4038, 1687, 1687, 2787, 5092, 3052, 1930, 3301],
 [6906],
 [5955],
 [1173,
  2989,
  6331,
  5505,
  1574,
  67,
  1025,
  3054,
  1464,
  5296,
  1979,
  6437,
  780,
  7003,
  1035,
  3273,
  1035,
  6718,
  4356,
  1979],
 [4790, 5639, 548, 548, 5639],
 [2448],
 [4751, 2803, 2803, 6

In [46]:
input_seqs_test = np.array(pad_sequences(prev_words_test, maxlen=max_seq_length, padding='pre'))

In [47]:
input_seqs_test

array([[   0,    0,    0, ..., 3749, 1201, 5575],
       [   0,    0,    0, ...,    0,    0, 6917],
       [   0,    0,    0, ...,    0,    0,  916],
       ...,
       [   0,    0,    0, ...,    0,    0, 3832],
       [   0,    0,    0, ...,    0, 5962, 3072],
       [   0,    0,    0, ...,    0,    0,    0]], dtype=int32)

In [48]:
preds = model.predict(input_seqs_test)



In [49]:
preds.shape

(223, 7130)

In [50]:
preds

array([[0.00014118, 0.00013514, 0.00013757, ..., 0.00013797, 0.00014673,
        0.00014414],
       [0.00014181, 0.00013367, 0.00013819, ..., 0.00013818, 0.00014492,
        0.00014376],
       [0.00014252, 0.00013392, 0.0001373 , ..., 0.00013725, 0.00014602,
        0.00014273],
       ...,
       [0.00014242, 0.00013321, 0.00013714, ..., 0.00013777, 0.00014584,
        0.00014274],
       [0.00014243, 0.00013421, 0.00013763, ..., 0.00013711, 0.00014547,
        0.00014298],
       [0.0001435 , 0.00013281, 0.00013907, ..., 0.00013688, 0.00014593,
        0.00014271]], dtype=float32)

In [51]:
input_seqs_test.shape

(223, 90)

In [52]:
pd.Series(np.argmax(preds, axis=-1)).unique()

array([4272, 2929, 1540, 1785,  935, 3519, 6247, 6292, 2787, 4797, 1466,
       2494, 4534, 6974])

In [53]:
preds.argsort()[:,-100:]

array([[6833, 3780, 2645, ..., 4798, 6974, 4272],
       [1171, 4264, 5509, ..., 1466,  935, 2929],
       [3299, 4344, 5052, ..., 2929, 1540, 4272],
       ...,
       [ 810, 4270, 4604, ..., 2929, 4272, 1540],
       [2855, 6549, 1024, ..., 1540, 4272, 2929],
       [6668, 6472, 2701, ..., 4955, 4272, 2929]])

In [54]:
arr = preds.argsort().astype("float32")

In [55]:
arr

array([[2523., 5470., 1975., ..., 4798., 6974., 4272.],
       [6671., 5891., 2428., ..., 1466.,  935., 2929.],
       [2428., 4832., 6671., ..., 2929., 1540., 4272.],
       ...,
       [2428., 6671., 5891., ..., 2929., 4272., 1540.],
       [6671., 2428., 5891., ..., 1540., 4272., 2929.],
       [6671., 2428., 6322., ..., 4955., 4272., 2929.]], dtype=float32)

In [56]:
len(candidate_items)

4990

In [57]:
candidate_items_keys = [item_dict[x] for x in candidate_items if x in item_dict.keys()]
candidate_items_keys[:4]

[1, 2, 4, 5]

In [58]:
len(candidate_items_keys)

3903

In [59]:
cond = np.isin(arr.astype(int), candidate_items_keys)
cond

array([[False, False,  True, ..., False,  True,  True],
       [False,  True,  True, ...,  True, False, False],
       [ True,  True, False, ..., False,  True,  True],
       ...,
       [ True, False,  True, ..., False,  True,  True],
       [False,  True,  True, ...,  True,  True, False],
       [False,  True, False, ..., False,  True, False]])

In [60]:
arr[~cond] = np.nan
arr

array([[  nan,   nan, 1975., ...,   nan, 6974., 4272.],
       [  nan, 5891., 2428., ..., 1466.,   nan,   nan],
       [2428., 4832.,   nan, ...,   nan, 1540., 4272.],
       ...,
       [2428.,   nan, 5891., ...,   nan, 4272., 1540.],
       [  nan, 2428., 5891., ..., 1540., 4272.,   nan],
       [  nan, 2428.,   nan, ...,   nan, 4272.,   nan]], dtype=float32)

In [61]:
df_w2v_test.session_id.unique()[:3]

array([50002, 50013, 50016])

In [62]:
arrlist = arr.tolist()
candidate_rank_dfs = []

for i in range(3):
    rank_dict = {}
    session_id  = df_w2v_test.session_id.unique()[i]
    
    my_list = arrlist[i]
    y = [x for x in my_list if x>=0 and x in item_dict.keys()]
    ranked_list = [item_dict_rev[x] for x in y[-100:]]
    
    for num, item in enumerate(ranked_list):
        rank_dict[item] = len(ranked_list) - num
    candidate_rank_df = pd.DataFrame(rank_dict.items(), columns = ["item_id", "rank"]).sort_values("rank")
    candidate_rank_df["session_id"] = session_id
    candidate_rank_dfs.append(candidate_rank_df)
    
candidate_rank_df = pd.concat(candidate_rank_dfs)
candidate_rank_df = candidate_rank_df[["session_id", "item_id", "rank"]].reset_index(drop=True)
candidate_rank_df

Unnamed: 0,session_id,item_id,rank
0,50002,24914,1
1,50002,5907,2
2,50002,13638,3
3,50002,15610,4
4,50002,1020,5
...,...,...,...
295,50016,19960,96
296,50016,25806,97
297,50016,27738,98
298,50016,25947,99


# Test data

In [63]:
import pandas as pd
import os
import numpy as np

base_path_test = "~/shared/data/project/test"


test_df = pd.read_csv(os.path.join(base_path_test, "test_sessions.csv"))

test_df.session_id.max()

186479748

In [64]:
session_id_limits = list(range(0, test_df.session_id.max() + 10000000, 10000000))

In [65]:
test_dfs = []
for i in range(len(session_id_limits)-1):
    session_id_limit1 = session_id_limits[i]
    session_id_limit2 = session_id_limits[i+1]
    test_df_small = test_df[(test_df.session_id >= session_id_limit1) & (test_df.session_id < session_id_limit2)]
    test_dfs.append(test_df_small)

In [66]:
len(test_df)

197624

In [67]:
def get_values(x):
    x = x["item_id"]
    return [item_dict[x] for x in list(x.values.ravel()) if x in item_dict.keys()]

In [68]:
candidate_rank_dfs = []
for test_df_small in test_dfs:
    
    prev_words_test = test_df_small.groupby('session_id').apply(get_values).to_list()
    
    input_seqs_test = np.array(pad_sequences(prev_words_test, maxlen=max_seq_length, padding='pre'))

    input_seqs_test

    preds = model.predict(input_seqs_test)

    preds.shape

    preds

    input_seqs_test.shape

    np.argmax(preds, axis=-1)

    preds.argsort()[:,-100:]

    arr = preds.argsort().astype("float32")

    arr

    cond = np.isin(arr.astype(int), candidate_items_keys)
    cond

    arr[~cond] = np.nan
    arr

    test_df.session_id.unique()[:3]

    arrlist = arr.tolist()

    
    for i in range(len(test_df_small.session_id.unique())):
        rank_dict = {}
        session_id  = test_df_small.session_id.unique()[i]

        my_list = arrlist[i]
        y = [x for x in my_list if x>=0 and x in item_dict.keys()]
        ranked_list = [item_dict_rev[x] for x in y[-100:]]

        for num, item in enumerate(ranked_list):
            rank_dict[item] = len(ranked_list) - num
        candidate_rank_df = pd.DataFrame(rank_dict.items(), columns = ["item_id", "rank"]).sort_values("rank")
        candidate_rank_df["session_id"] = session_id
        candidate_rank_dfs.append(candidate_rank_df)
    #display(len(pd.concat(candidate_rank_dfs).session_id.unique()))
    #display(pd.concat(candidate_rank_dfs))

candidate_rank_df = pd.concat(candidate_rank_dfs)
candidate_rank_df = candidate_rank_df[["session_id", "item_id", "rank"]].reset_index(drop=True)
candidate_rank_df



Unnamed: 0,session_id,item_id,rank
0,126,5907,1
1,126,5581,2
2,126,2652,3
3,126,17969,4
4,126,1640,5
...,...,...,...
4999995,186479748,25166,96
4999996,186479748,25537,97
4999997,186479748,14236,98
4999998,186479748,4598,99


In [69]:
candidate_rank_df

Unnamed: 0,session_id,item_id,rank
0,126,5907,1
1,126,5581,2
2,126,2652,3
3,126,17969,4
4,126,1640,5
...,...,...,...
4999995,186479748,25166,96
4999996,186479748,25537,97
4999997,186479748,14236,98
4999998,186479748,4598,99


In [70]:
len(candidate_rank_df.session_id.unique())

50000

In [71]:
(~candidate_rank_df.item_id.isin(candidate_items)).sum()

0

In [72]:
len(test_df.session_id.unique())

50000

In [73]:
candidate_rank_df.to_csv("results_lstm_with_contents.csv", index=False)

In [74]:
candidate_rank_df[candidate_rank_df["rank"] == 1].item_id.unique()

array([ 5907, 24914,  1640,  5581, 15610,  2652,  1020,  3582, 25593,
        8345, 13638, 16909,  1778, 20901, 19509, 26982, 13676,  4853,
       23269,  7624])