In [86]:
# Install the notebook's dependencies with the %pip magic so they land in the
# environment of the running kernel (a `!pip` shell escape may target a
# different Python installation).
%pip install implicit
%pip install ipywidgets



In [153]:
import pandas as pd
from scipy import sparse
import implicit
from sklearn.preprocessing import MinMaxScaler

# Load the raw interaction events and the item catalogue.
df = pd.read_csv("interactions.csv")
df_items = pd.read_csv("items.csv")

# Drop columns which are not needed for the recommender.
df = df.drop(columns=["TIMESTAMP", "DISCOUNT"])

# Confidence weight per event type: the later in the purchase funnel, the
# higher the weight.
event_type_confidence = {
   "View": 1.0,
   "AddToCart": 2.0,
   "ViewCart": 3.0,
   "StartCheckout": 4.0,
   "Purchase": 5.0,
}

# Series.map would silently turn an event type without a weight into NaN, so
# fail loudly first (the same outcome a plain dict lookup would give).
unknown_event_types = set(df["EVENT_TYPE"].unique()) - set(event_type_confidence)
if unknown_event_types:
    raise KeyError(
        f"No confidence weight defined for event types: {sorted(unknown_event_types, key=str)}"
    )
df["CONFIDENCE"] = df["EVENT_TYPE"].map(event_type_confidence)

# Collapse repeated events into one row per (item, user) pair whose CONFIDENCE
# is the sum of the individual event weights. The column is selected explicitly:
# GroupBy.sum's first positional argument is the `numeric_only` flag, not a
# column name.
grouped_df = df.groupby(["ITEM_ID", "USER_ID"], as_index=False)["CONFIDENCE"].sum()
grouped_df = grouped_df[["USER_ID", "ITEM_ID", "CONFIDENCE"]]  # re-order columns

# Map the raw ids onto contiguous 0-based integer codes that serve as the
# row (user) and column (item) positions of the sparse matrix.
grouped_df["USER_ID"] = grouped_df["USER_ID"].astype("category")
grouped_df["ITEM_ID"] = grouped_df["ITEM_ID"].astype("category")
print(f"Number of unique users: {grouped_df['USER_ID'].nunique()}")
print(f"Number of unique items: {grouped_df.ITEM_ID.nunique()}")
grouped_df["USER_IDX"] = grouped_df["USER_ID"].cat.codes
grouped_df["ITEM_IDX"] = grouped_df["ITEM_ID"].cat.codes
print(f"Min value user index: {grouped_df['USER_IDX'].min()}")
print(f"Max value user index: {grouped_df['USER_IDX'].max()}")
print(f"Min value item index: {grouped_df['ITEM_IDX'].min()}")
print(f"Max value item index: {grouped_df['ITEM_IDX'].max()}")

# Example: grouped_df[grouped_df.USER_ID == 168] lists one user's interactions.

# User x item confidence matrix. The shape is passed explicitly so it is tied
# to the number of categories rather than inferred from the largest index.
n_users = grouped_df["USER_ID"].cat.categories.size
n_items = grouped_df["ITEM_ID"].cat.categories.size
sparse_person_content = sparse.csr_matrix(
    (
        grouped_df["CONFIDENCE"].astype(float),
        (grouped_df["USER_IDX"], grouped_df["ITEM_IDX"]),
    ),
    shape=(n_users, n_items),
)
grouped_df

Number of unique users: 1000
Number of unique items: 2426
Min value user index: 0
Max value user index: 999
Min value item index: 0
Max value item index: 2425


Unnamed: 0,USER_ID,ITEM_ID,CONFIDENCE,USER_IDX,ITEM_IDX
0,33,00096972-5f6b-44df-917b-f7d21ae5644c,48.0,32,0
1,239,00096972-5f6b-44df-917b-f7d21ae5644c,50.0,238,0
2,379,00096972-5f6b-44df-917b-f7d21ae5644c,16.0,378,0
3,417,00096972-5f6b-44df-917b-f7d21ae5644c,33.0,416,0
4,729,00096972-5f6b-44df-917b-f7d21ae5644c,65.0,728,0
...,...,...,...,...,...
13575,146,ffdb2dc7-ea75-4197-9826-e81ccd42f578,99.0,145,2425
13576,289,ffdb2dc7-ea75-4197-9826-e81ccd42f578,34.0,288,2425
13577,371,ffdb2dc7-ea75-4197-9826-e81ccd42f578,64.0,370,2425
13578,414,ffdb2dc7-ea75-4197-9826-e81ccd42f578,12.0,413,2425


In [154]:
import random
import numpy as np

def make_train_test(sparse_matrix, pct_test=0.2, seed=42):
  """Hide a random share of the observed interactions to build a train/test split.

  Args:
    sparse_matrix: scipy sparse matrix of interaction values; rows are treated
      as users and columns as items. The input itself is not modified.
    pct_test: Fraction (between 0 and 1) of the non-zero user-item pairs to
      zero out in the training copy. The count is rounded up.
    seed: Seed of the private random generator used for sampling. The default
      (42) selects the same pairs as seeding the global ``random`` module
      with 42 would.

  Returns:
    A tuple ``(training_set, test_set, users_altered)`` where ``training_set``
    is a copy of the input with the sampled pairs removed, ``test_set`` is an
    unmodified copy of the input, and ``users_altered`` is the list of unique
    row indices that had at least one pair removed.

  Raises:
    ValueError: If ``pct_test`` is outside the range [0, 1].
  """
  if not 0 <= pct_test <= 1:
    raise ValueError(f"pct_test must be between 0 and 1, got {pct_test}")

  test_set = sparse_matrix.copy()  # untouched copy that keeps every interaction
  training_set = sparse_matrix.copy()  # copy that gets the sampled pairs masked out

  # All (row, column) positions that currently hold a non-zero value.
  row_inds, col_inds = training_set.nonzero()
  nonzero_pairs = list(zip(row_inds, col_inds))

  # Use a private generator so the caller's global `random` state is left alone.
  rng = random.Random(seed)
  # Round the number of pairs to hide UP to the next integer.
  num_samples = int(np.ceil(pct_test * len(nonzero_pairs)))
  print(f"Number of samples: {num_samples}")
  samples = rng.sample(nonzero_pairs, num_samples)  # sampling without replacement
  print(f"Length nonzero pairs: {len(nonzero_pairs)}")
  print(f"Length samples: {len(samples)}")
  print(f"user-item masked samples: {samples[0:10]}" )
  user_inds = [pair[0] for pair in samples]  # row (user) index of every masked pair
  item_inds = [pair[1] for pair in samples]  # column (item) index of every masked pair
  print(f"Length user index samples: {len(user_inds)}")
  print(f"Length item index samples: {len(item_inds)}")

  if samples:
    # Zero out the sampled pairs; skipped when nothing was sampled so that no
    # empty fancy-index assignment is attempted.
    training_set[user_inds, item_inds] = 0
  training_set.eliminate_zeros()  # drop the explicit zeros from sparse storage

  return (
      training_set,
      test_set,
      list(set(user_inds)),
  )  # unique rows (users) that had at least one interaction masked
  
# Split the interaction matrix: 20% of the observed user-item pairs are masked
# out of the training copy, the test copy keeps everything.
training_set, test_set, product_users_altered = make_train_test(
    sparse_person_content,
    pct_test=0.2,
)

Number of samples: 2716
Length nonzero pairs: 13580
Length samples: 2716
user-item masked samples: [(718, 402), (124, 450), (27, 1795), (832, 723), (307, 1846), (273, 870), (249, 381), (155, 2209), (826, 1110), (114, 1245)]
Length user index samples: 2716
Length item index samples: 2716


In [155]:
# Value handed to the ALS model's `alpha` argument.
alpha = 15
model = implicit.als.AlternatingLeastSquares(
    alpha=alpha,
    factors=20,
    regularization=0.1,
    iterations=50,
    random_state=42,  # seed the model's random initialisation so re-runs are comparable
)
# NOTE(review): despite their names these are the integer index columns of
# grouped_df, not latent vectors; nothing in the visible cells reads them.
user_vecs = grouped_df["USER_IDX"]
item_vecs = grouped_df["ITEM_IDX"]

# Cast the training matrix to float64 ('double') for fitting.
data = training_set.astype("double")

In [170]:
# Sparsity of the training matrix: the percentage of user-item cells that hold
# no interaction at all.
print(training_set.shape)
n_rows, n_cols = training_set.shape
matrix_size = n_rows * n_cols  # every possible user-item cell
filled_rows, _ = training_set.nonzero()
num_confidence_interactions = len(filled_rows)  # cells that actually hold a value
filled_fraction = num_confidence_interactions / matrix_size
sparsity = 100 * (1 - filled_fraction)
sparsity

(1000, 2426)


99.55218466611706

In [171]:
# Fit the ALS model on `data`, the float64 copy of the training matrix.
model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [172]:
# Popularity baseline: the 50 items that occur in the most rows of grouped_df
# (one row per user-item pair). Each one is printed with its description and
# its matrix column index is collected.
# NOTE: the list keeps its historical name although it holds 50 entries.
popular_items = grouped_df["ITEM_ID"].value_counts(sort=True).index[:50]
top_ten_popular_items = []
for popular_item_id in popular_items:
    catalogue_match = df_items["ITEM_ID"] == popular_item_id
    item_desc = df_items.loc[catalogue_match, "PRODUCT_DESCRIPTION"].iloc[0]
    interaction_match = grouped_df["ITEM_ID"] == popular_item_id
    item_index = grouped_df.loc[interaction_match, "ITEM_IDX"].iloc[0]
    print(f"Item ID: {popular_item_id}, Desc: {item_desc}")
    top_ten_popular_items.append(item_index)
top_ten_popular_items
    


Item ID: 0de9bba0-1149-40e9-b1a6-7dcecaf68194, Desc: Thick and fine smoothy, satisfying and healthy
Item ID: 5afced84-ed2d-4520-a06d-dcfeab382e52, Desc: Made with ginseng and joy
Item ID: aff05423-76e8-4339-a478-fc17d51ed985, Desc: 16oz fountain soda always hits the spot
Item ID: 0790267c-c708-424d-81f5-46903a9c8444, Desc: Slice of delicious pepperoni pizza
Item ID: 31eef6b5-781e-4353-a851-d73ff9c22f9a, Desc: Learn to cook Chinese cuisine with these easy recipes
Item ID: 53ce7597-bb59-45e0-a3a3-ca3ef6f7ce1c, Desc: Plan your next trip with this essential reference book on France
Item ID: ec02b332-05a5-40dd-ae9d-2b0672baaa6e, Desc: Plan your next trip with this essential reference book on Japan
Item ID: 62206a61-b821-47ea-be0c-c4fbf57e091e, Desc: Plan your next trip with this essential reference book on Mexico
Item ID: 9c1a2048-7aac-4565-b836-d8d4f726322c, Desc: Crunchy and yummy! Crumbs in your lap!
Item ID: 4adabef5-293b-42c0-b6d1-1cf853f6391d, Desc: Plan your next trip with this essen

[124,
 872,
 1682,
 69,
 461,
 783,
 2249,
 939,
 1508,
 708,
 908,
 1046,
 387,
 2061,
 1766,
 1176,
 2139,
 2273,
 746,
 919,
 630,
 1272,
 350,
 342,
 2375,
 1738,
 2415,
 1522,
 1510,
 1840,
 513,
 964,
 895,
 2398,
 643,
 1613,
 751,
 969,
 859,
 2133,
 666,
 86,
 831,
 274,
 1699,
 1708,
 1062,
 1751,
 886,
 791]

In [194]:
num_recomm = 20
user_idx = 155 # try these user-item indices [(718, 402), (124, 450), (27, 1795), (832, 723), (307, 1846), (273, 870), (249, 381), (155, 2209), (826, 1110), (114, 1245)]
# Ask the model for recommendations, passing the user's row of the *training*
# matrix. The result is indexed below as (item indices, scores).
train = model.recommend(user_idx, training_set[user_idx], N=num_recomm)
train_set_predictions = train[0][:num_recomm] # top-N item indices obtained with the training row

print(train)

print(f"User idx: {user_idx}")
field_indent = " " * 10  # padding placed in front of every printed field
# Walk over what was actually returned instead of indexing 0..num_recomm-1, so
# a shorter result list cannot raise an IndexError.
for rec_item_idx, rec_score in zip(train[0][:num_recomm], train[1][:num_recomm]):
    item_id = grouped_df.ITEM_ID.loc[grouped_df.ITEM_IDX == rec_item_idx].iloc[0]
    item_desc = df_items.PRODUCT_DESCRIPTION.loc[df_items.ITEM_ID == item_id].iloc[0]
    print(f" {field_indent}{item_id}, {field_indent}{rec_score}, {field_indent}{item_desc}")
    


(array([1922,  253, 1631, 1844,   47, 1134, 2309, 1326, 1406,  768, 1061,
       1151,  186, 1123, 1952, 1485, 1948, 2120,  163, 1130], dtype=int32), array([0.85376114, 0.76218545, 0.7443303 , 0.74205965, 0.73244005,
       0.72602624, 0.7238099 , 0.71864474, 0.6823832 , 0.6812058 ,
       0.67385745, 0.6716193 , 0.6557997 , 0.6528224 , 0.65126425,
       0.64499354, 0.6444813 , 0.6408257 , 0.6395861 , 0.63956565],
      dtype=float32))
User idx: 155
           c8956c20-9214-4e38-9398-26e8d05b4924,           0.8537611365318298,           This gray set cushion is a must-have for your house
           1bb74d0b-fe39-42f0-9870-d4195a90a32c,           0.7621854543685913,           Unsurpassed beige chair for your office
           aa5be513-89e8-4518-aba0-a40c8544adbc,           0.7443302869796753,           A must-have for your kitchen
           c0966977-5b71-4f5e-92c7-04a86dbd7d71,           0.7420596480369568,           This wall clock is a must-have
           05948514-25d3-49a7-ade2-03

In [185]:
# Re-fit the model. Note that this fits on the *training* matrix again (the
# same input as the earlier fit); test_set is not used for fitting here.
data = (training_set).astype('double')
model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [195]:
# Recommend again for the same user, this time passing the user's row of
# test_set (the unmasked copy of the matrix) instead of the training row.
# NOTE(review): in the recorded outputs item 253 is in the training-row list
# but missing from this one - presumably recommend() drops items present in
# the row it is given; confirm against implicit's `filter_already_liked_items`.
user_idx = 155 # try these user-item indices [(718, 402), (124, 450), (27, 1795), (832, 723), (307, 1846), (273, 870), (249, 381), (155, 2209), (826, 1110), (114, 1245)]
test = model.recommend(user_idx, test_set[user_idx], N=num_recomm)
test_set_predictions = test[0][:num_recomm] # top-N item indices obtained with the test row

print(test)

print(f"User idx: {user_idx}")
field_indent = " " * 10  # padding placed in front of every printed field
# Walk over what was actually returned instead of indexing 0..num_recomm-1, so
# a shorter result list cannot raise an IndexError.
for rec_item_idx, rec_score in zip(test[0][:num_recomm], test[1][:num_recomm]):
    item_id = grouped_df.ITEM_ID.loc[grouped_df.ITEM_IDX == rec_item_idx].iloc[0]
    item_desc = df_items.PRODUCT_DESCRIPTION.loc[df_items.ITEM_ID == item_id].iloc[0]
    print(f" {field_indent}{item_id}, {field_indent}{rec_score}, {field_indent}{item_desc}")
    



(array([1922, 1631, 1844,   47, 1134, 2309, 1326, 1406,  768, 1061, 1151,
        186, 1123, 1952, 1485, 1948, 2120,  163, 1130, 1814], dtype=int32), array([0.85376114, 0.7443303 , 0.74205965, 0.73244005, 0.72602624,
       0.7238099 , 0.71864474, 0.6823832 , 0.6812058 , 0.67385745,
       0.6716193 , 0.6557997 , 0.6528224 , 0.65126425, 0.64499354,
       0.6444813 , 0.6408257 , 0.6395861 , 0.63956565, 0.6343557 ],
      dtype=float32))
User idx: 155
           c8956c20-9214-4e38-9398-26e8d05b4924,           0.8537611365318298,           This gray set cushion is a must-have for your house
           aa5be513-89e8-4518-aba0-a40c8544adbc,           0.7443302869796753,           A must-have for your kitchen
           c0966977-5b71-4f5e-92c7-04a86dbd7d71,           0.7420596480369568,           This wall clock is a must-have
           05948514-25d3-49a7-ade2-0368a9b1d525,           0.7324400544166565,           This table lamp will enhance your home
           7684731e-eca7-4679-8425-ba8

In [196]:
def AverageReciprocalHitRank(train_set_predictions, test_set_predictions):
    '''
    Calculate the Average Reciprocal Hit Rank between two ranked item lists.

    Every item of ``test_set_predictions`` is looked up in
    ``train_set_predictions``. A hit at 1-based position ``r`` contributes
    ``1 / r``, a miss contributes 0, and the contributions are averaged over
    the number of items in ``test_set_predictions``.

    Parameters:
        train_set_predictions: iterable of hashable item ids, best first. When
            an id occurs more than once its first position counts.
        test_set_predictions: iterable of hashable item ids to look up.

    Returns:
        The average reciprocal hit rank rounded to 5 decimals; 0.0 when
        ``test_set_predictions`` is empty.
    '''
    # 1-based rank of the first occurrence of every item in the ranked list,
    # built once so each lookup below is a dictionary access.
    first_rank = {}
    for rank, train_set_item_id in enumerate(train_set_predictions, start=1):
        first_rank.setdefault(train_set_item_id, rank)

    summation = 0
    total = 0
    for test_set_item_id in test_set_predictions:
        hitRank = first_rank.get(test_set_item_id, 0)  # 0 means "not found"
        if hitRank > 0:
            summation += 1.0 / hitRank
        total += 1

    # Nothing to score: report 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0

    return round(summation / total, 5)

# Score the training-row recommendations against (a) the test-row
# recommendations and (b) the popularity baseline.
arhr_vs_test = AverageReciprocalHitRank(train_set_predictions, test_set_predictions)
print("Average Reciprocal Hit Rank: ", arhr_vs_test)

arhr_vs_popular = AverageReciprocalHitRank(train_set_predictions, top_ten_popular_items)
print("Average Reciprocal Hit Rank (Popular Items): ", arhr_vs_popular)


Average Reciprocal Hit Rank:  0.15489
Average Reciprocal Hit Rank (Popular Items):  0.0


In [1]:
# https://nbviewer.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb
# This notebook walks through an extensive, well-explained process for building implicit-feedback recommender systems.

In [10]:
# Install rank-eval with the %pip magic so it lands in the environment of the
# running kernel (a `!pip` shell escape may target a different Python).
%pip install rank-eval

Collecting rank-eval
  Downloading rank_eval-0.1.3-py3-none-any.whl (17 kB)
Installing collected packages: rank-eval
Successfully installed rank-eval-0.1.3


In [15]:
from rank_eval import Qrels, Run, evaluate

# Toy example for rank_eval: two queries with graded relevance judgements and
# one ranked result list per query, scored with MRR.

# query id -> (relevant documents, relevance judgements)
judgements = {
    "q_1": (["doc_12", "doc_25"], [5, 3]),
    "q_2": (["doc_11", "doc_2"], [6, 1]),
}
qrels = Qrels()
qrels.add_multi(
    q_ids=list(judgements),
    doc_ids=[docs for docs, _ in judgements.values()],
    scores=[grades for _, grades in judgements.values()],
)

# query id -> retrieved documents, best first
rankings = {
    "q_1": ["doc_12", "doc_23", "doc_25", "doc_36", "doc_32", "doc_35"],
    "q_2": ["doc_12", "doc_11", "doc_25", "doc_36", "doc_2",  "doc_35"],
}
run = Run()
run.add_multi(
    q_ids=list(rankings),
    doc_ids=list(rankings.values()),
    # every result list gets the same descending retrieval scores
    scores=[[0.9, 0.8, 0.7, 0.6, 0.5, 0.4] for _ in rankings],
)
evaluate(qrels, run, ["mrr"])


0.75