In [1]:
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix

## Load and View

In [2]:
df_view = pd.read_csv('data/train-item-views.csv', delimiter=';', usecols=[0,2])
df_click = pd.read_csv('data/train-clicks.csv', delimiter=';', usecols=[0,2])
df_purchase = pd.read_csv('data/train-purchases.csv', delimiter=';', usecols=[0,5])
df_queries = pd.read_csv('data/train-queries.csv', delimiter=';', usecols=[0,1])

In [3]:
df_view

Unnamed: 0,sessionId,itemId
0,1,81766
1,1,31331
2,1,32118
3,1,9654
4,1,32627
...,...,...
1235375,600684,42906
1235376,600684,33312
1235377,600684,33312
1235378,600684,5227


In [4]:
df_purchase

Unnamed: 0,sessionId,itemId
0,150,25911
1,151,175874
2,156,35324
3,179,31233
4,246,34677
...,...,...
18020,600131,40804
18021,600250,33777
18022,600432,17176
18023,600432,13662


In [5]:
df_click # Missing 'sessionId' column - need to map queryId -> sessionId with df_queries

Unnamed: 0,queryId,itemId
0,1,24857
1,46255,30792
2,46689,8252
3,46731,33969
4,46748,7837
...,...,...
1127759,980493,42906
1127760,980493,33312
1127761,980493,33312
1127762,980493,5227


In [6]:
df_queries[df_queries['sessionId'] == 1]

Unnamed: 0,queryId,sessionId
0,1,1
46248,46255,1
46682,46689,1
46724,46731,1
46741,46748,1
46761,46768,1
108054,111737,1
127885,132852,1
187432,196431,1
213325,224052,1


## Correct and map indices

In [7]:
query_session = {query: session for query, session in df_queries.itertuples(index=False)} # Map queryId to sessionId
df_click['sessionId'] = df_click['queryId'].map(query_session) # Create sessionId column for 'click' data
df_click = df_click.drop(columns='queryId')

In [8]:
df_click # Now has 'sessionId' column

Unnamed: 0,itemId,sessionId
0,24857,1
1,30792,1
2,8252,1
3,33969,1
4,7837,1
...,...,...
1127759,42906,600684
1127760,33312,600684
1127761,33312,600684
1127762,5227,600684


In [9]:
item_ids = pd.concat((df_view['itemId'], df_click['itemId'], df_purchase['itemId']))
item_index = {item: index for index, item in enumerate(item_ids.unique())}

session_ids = pd.concat((df_view['sessionId'], df_click['sessionId'], df_purchase['sessionId']))
session_index = {session: index for index, session in enumerate(session_ids.unique())}

## Aggregate

In [10]:
# Adjust these values as needed
view_value = 1
click_value = 2
purchase_value = 3

df_view['rating'] = [view_value] * len(df_view)
df_click['rating'] = [click_value] * len(df_click)
df_purchase['rating'] = [purchase_value] * len(df_purchase)

In [11]:
df_all = pd.concat([df_view,df_click,df_purchase], ignore_index=True)

In [12]:
ratings_matrix = csr_matrix((df_all['rating'], (df_all['sessionId'].map(session_index), df_all['itemId'].map(item_index))),
                            shape=(len(df_all['sessionId'].unique()), len(item_index)))

In [13]:
sparsity = ratings_matrix.nnz / (ratings_matrix.shape[0] * ratings_matrix.shape[1])
print(f"{sparsity * 100} % non-zero")
print(f"{1 - sparsity * 100} % zero")

0.0028768047330218517 % non-zero
0.9971231952669781 % zero


Note from csr_matrix documentation: 'Duplicate entries are summed together.' That means if a user interacts with the same item multiple times within the same session, the values for the different actions are summed together.

For example, if a user views an item twice, then clicks, then purchases, the value for that session-item pair is:  
`(2 * view) + (1 * click) + (1 * purchase)`

In [14]:
ratings_matrix[0].data

array([ 9, 11,  1,  9,  1,  1,  1,  1,  1,  1,  2,  4,  2,  6,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  4,  2,  2,  4,  2,  4,
        2], dtype=int64)

## Saving (and loading)

In [15]:
sparse.save_npz("data/ratings_matrix.npz", ratings_matrix)
# ratings_matrix = sparse.load_npz("data/ratings_matrix.npz")

## Skip-Gram context/target caching
Generating the contexts and targets for skip-gram takes some time, which makes training take forever when calculating for every single sample.  
I decided to generate them here to cache for future use, instead of generating them during training.  
While I was doing this, I figured it would be simpler to also cache the train/test split matrices for non-skip-gram training runs as well.

In [16]:
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [17]:
sessions_train, sessions_test = train_test_split(ratings_matrix, test_size=0.2)
sparse.save_npz("data/sessions_train.npz", sessions_train)
sparse.save_npz("data/sessions_test.npz", sessions_test)

sessions_train.shape, sessions_test.shape

((295752, 128910), (73939, 128910))

In [18]:
def generate_context_target_pairs(session_row):
    num_items = session_row.shape[1]
    context_target_pairs = []

    session_array = session_row.toarray().flatten()

    for i, rating in enumerate(session_array):
        if rating != 0:
            context = np.zeros(num_items, dtype='int32')
            context[i] = rating

            target_indices = np.concatenate([np.arange(0, i), np.arange(i + 1, num_items)])
            target = np.zeros(num_items, dtype='int32')
            target[target_indices] = session_array[target_indices]

            context_target_pairs.append((context, target))

    return context_target_pairs

In [19]:
# Example session ratings row
session_row = csr_matrix([[1, 0, 2, 0, 3, 1, 0]])

# Generate context-target pairs
pairs = generate_context_target_pairs(session_row)

print("Original:", session_row.toarray()[0], "\n")
for context, target in pairs:
    print("Context:", context)
    print("Target:", target, "\n")

Original: [1 0 2 0 3 1 0] 

Context: [1 0 0 0 0 0 0]
Target: [0 0 2 0 3 1 0] 

Context: [0 0 2 0 0 0 0]
Target: [1 0 0 0 3 1 0] 

Context: [0 0 0 0 3 0 0]
Target: [1 0 2 0 0 1 0] 

Context: [0 0 0 0 0 1 0]
Target: [1 0 2 0 3 0 0] 



In [None]:
# Only need to do this for training data - testing data is embedded from its original state
session_contexts_train = lil_matrix((sessions_train.nnz, sessions_train.shape[1]), dtype='int32')
session_targets_train = lil_matrix((sessions_train.nnz, sessions_train.shape[1]), dtype='int32')

idx = 0
for i, session in tqdm(enumerate(sessions_train), total=sessions_train.shape[0]):
    pairs = generate_context_target_pairs(session)
    for context, target in pairs:
        session_contexts_train[idx] = context
        session_targets_train[idx] = target
        idx += 1

In [None]:
sparse.save_npz("data/session_train_contexts.npz", session_contexts_train.tocsr())
sparse.save_npz("data/session_train_targets.npz", session_targets_train.tocsr())