In [1]:
! pip install implicit
! pip install ipywidgets
! pip install xeus-python



In [2]:
import pandas as pd
from scipy import sparse
import implicit
from sklearn.preprocessing import MinMaxScaler

# Load the raw interaction events and the item catalogue.
df = pd.read_csv("interactions.csv")
df_items = pd.read_csv("items.csv")

# Drop columns this analysis does not use.
df = df.drop(columns=["TIMESTAMP", "DISCOUNT"])

# Weight assigned to each event type when building the confidence score.
event_type_confidence = {
    "View": 1.0,
    "AddToCart": 2.0,
    "ViewCart": 3.0,
    "StartCheckout": 4.0,
    "Purchase": 5.0,
}

# Map every event to its weight; fail fast on event types missing from the table
# instead of silently carrying NaN confidences forward.
df["CONFIDENCE"] = df["EVENT_TYPE"].map(event_type_confidence)
unknown_event_types = df.loc[df["CONFIDENCE"].isna(), "EVENT_TYPE"].unique()
if len(unknown_event_types) > 0:
    raise ValueError(f"Unmapped EVENT_TYPE values: {list(unknown_event_types)}")

# Collapse repeated (item, user) events into one row whose CONFIDENCE is the sum
# of the event weights. The column is selected explicitly: a positional argument
# to GroupBy.sum() is interpreted as `numeric_only`, not as a column name.
grouped_df = df.groupby(["ITEM_ID", "USER_ID"], as_index=False)["CONFIDENCE"].sum()
grouped_df = grouped_df[["USER_ID", "ITEM_ID", "CONFIDENCE"]]  # re-order columns

# Encode users and items as categoricals; their integer codes become the
# row / column indices of the sparse matrix.
grouped_df["USER_ID"] = grouped_df["USER_ID"].astype("category")
grouped_df["ITEM_ID"] = grouped_df["ITEM_ID"].astype("category")
print(f"Number of unique users: {grouped_df['USER_ID'].nunique()}")
print(f"Number of unique items: {grouped_df.ITEM_ID.nunique()}")
grouped_df["USER_IDX"] = grouped_df["USER_ID"].cat.codes
grouped_df["ITEM_IDX"] = grouped_df["ITEM_ID"].cat.codes
print(f"Min value user index: {grouped_df['USER_IDX'].min()}")
print(f"Max value user index: {grouped_df['USER_IDX'].max()}")
print(f"Min value item index: {grouped_df['ITEM_IDX'].min()}")
print(f"Max value item index: {grouped_df['ITEM_IDX'].max()}")

# users x items matrix of summed confidences. The shape is given explicitly so it
# always matches the number of categories rather than being inferred from the
# largest index present.
sparse_person_content = sparse.csr_matrix(
    (
        grouped_df["CONFIDENCE"].astype(float),
        (grouped_df["USER_IDX"], grouped_df["ITEM_IDX"]),
    ),
    shape=(
        len(grouped_df["USER_ID"].cat.categories),
        len(grouped_df["ITEM_ID"].cat.categories),
    ),
)
grouped_df

Number of unique users: 1000
Number of unique items: 2339
Min value user index: 0
Max value user index: 999
Min value item index: 0
Max value item index: 2338


Unnamed: 0,USER_ID,ITEM_ID,CONFIDENCE,USER_IDX,ITEM_IDX
0,375,00096972-5f6b-44df-917b-f7d21ae5644c,4.0,374,0
1,539,00096972-5f6b-44df-917b-f7d21ae5644c,19.0,538,0
2,657,00096972-5f6b-44df-917b-f7d21ae5644c,6.0,656,0
3,698,00096972-5f6b-44df-917b-f7d21ae5644c,6.0,697,0
4,74,0016fde3-0910-4cc1-8ef6-90e15f271073,3.0,73,1
...,...,...,...,...,...
8273,295,ffcc4cc8-a094-49ea-b9f2-8bf056261868,12.0,294,2337
8274,389,ffcc4cc8-a094-49ea-b9f2-8bf056261868,12.0,388,2337
8275,39,ffdb2dc7-ea75-4197-9826-e81ccd42f578,9.0,38,2338
8276,309,ffdb2dc7-ea75-4197-9826-e81ccd42f578,8.0,308,2338


In [3]:
import random
import numpy as np

def make_train_test(sparse_matrix, pct_test=0.2, seed=42):
    """Mask a random share of the stored interactions to build a training matrix.

    Args:
        sparse_matrix: sparse user x item matrix (rows are users, columns are
            items). It must support item assignment and ``eliminate_zeros``
            (the notebook passes a CSR matrix). It is not modified.
        pct_test: fraction (0..1) of the nonzero entries to zero out in the
            training copy. The count is rounded up with ``ceil``.
        seed: seed for the private random generator that picks the masked
            entries. The default reproduces the previously hard-coded seed.

    Returns:
        A tuple ``(training_set, test_set, users_altered)`` where
        ``training_set`` is a copy with the sampled entries removed,
        ``test_set`` is an unmodified copy of the input, and ``users_altered``
        is the list of unique user (row) indices that had at least one entry
        masked.

    Raises:
        ValueError: if ``pct_test`` is outside ``[0, 1]``.
    """
    if not 0 <= pct_test <= 1:
        raise ValueError(f"pct_test must be between 0 and 1, got {pct_test}")

    test_set = sparse_matrix.copy()  # untouched copy, kept as the test set
    training_set = sparse_matrix.copy()  # copy that gets entries masked out

    # (user, item) index pairs of every stored nonzero interaction.
    nonzero_inds = training_set.nonzero()
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))

    # Private generator: same sequence as seeding the global `random` module with
    # `seed`, but without disturbing global random state.
    rng = random.Random(seed)

    # Number of pairs to mask, rounded up.
    num_samples = int(np.ceil(pct_test * len(nonzero_pairs)))
    print(f"Number of samples: {num_samples}")
    samples = rng.sample(nonzero_pairs, num_samples)  # without replacement
    print(f"Length nonzero pairs: {len(nonzero_pairs)}")
    print(f"Length samples: {len(samples)}")
    print(f"user-item masked samples: {samples[0:10]}" )

    user_inds = [pair[0] for pair in samples]  # row (user) indices
    item_inds = [pair[1] for pair in samples]  # column (item) indices
    print(f"Length user index samples: {len(user_inds)}")
    print(f"Length item index samples: {len(item_inds)}")

    if samples:  # nothing to mask when the sample is empty
        training_set[user_inds, item_inds] = 0
        # Drop the explicit zeros just written so they are no longer stored.
        training_set.eliminate_zeros()

    # Unique user rows that were altered (previously the item columns were
    # returned here, contradicting the documented contract).
    return training_set, test_set, list(set(user_inds))
  
# Mask 20% of the stored interactions to obtain the training matrix; the test
# matrix and the list of altered indices come back alongside it.
(training_set, test_set, product_users_altered) = make_train_test(
    sparse_person_content,
    pct_test=0.2,
)


Number of samples: 1656
Length nonzero pairs: 8278
Length samples: 1656
user-item masked samples: [(207, 1061), (47, 20), (509, 866), (451, 584), (411, 2231), (259, 1460), (190, 127), (161, 2330), (783, 1170), (60, 1552)]
Length user index samples: 1656
Length item index samples: 1656


In [4]:
# Fit an ALS model on the masked training matrix.
alpha = 15
model_training = implicit.als.AlternatingLeastSquares(
    alpha=alpha,
    factors=20,
    regularization=0.1,
    iterations=50,
    random_state=42,  # fixed seed so the random factor initialisation is repeatable
)

training_data = training_set.astype("double")
model_training.fit(training_data)



  0%|          | 0/50 [00:00<?, ?it/s]

In [5]:
# Sparsity of the training matrix: percentage of user-item cells that hold no
# nonzero value.
print(training_set.shape)
n_matrix_rows, n_matrix_cols = training_set.shape
matrix_size = n_matrix_rows * n_matrix_cols  # number of possible user-item cells
interaction_rows = training_set.nonzero()[0]
num_confidence_interactions = len(interaction_rows)  # number of nonzero user-item cells
sparsity = 100 * (1 - (num_confidence_interactions / matrix_size))
sparsity

(1000, 2339)


99.7168875587858

In [6]:
# Get popular items (to use as baseline)
popular_items = grouped_df.ITEM_ID.value_counts(sort=True).keys()[:20]
top_n_popular_items = []
for item in popular_items:
    item_desc = df_items.PRODUCT_DESCRIPTION.loc[df_items.ITEM_ID == item].iloc[0]
    item_index = grouped_df.ITEM_IDX.loc[grouped_df.ITEM_ID == item].iloc[0]
    print(f"Item ID: {item}, Desc: {item_desc}")
    top_n_popular_items.append(item_index)
# top_n_popular_items
    


Item ID: aff05423-76e8-4339-a478-fc17d51ed985, Desc: 16oz fountain soda always hits the spot
Item ID: 0790267c-c708-424d-81f5-46903a9c8444, Desc: Slice of delicious pepperoni pizza
Item ID: 9c1a2048-7aac-4565-b836-d8d4f726322c, Desc: Crunchy and yummy! Crumbs in your lap!
Item ID: 24c62ad2-6977-4f69-be75-e37d897c1434, Desc: Warms you to your bowels and delivers vitamins to your organs
Item ID: 575c0ac0-5494-4c64-a886-a9c0cf8b779a, Desc: Lentils with potatos, carrots and spices; a joy to your palate and super healthy
Item ID: 0987bfa1-0a23-4b90-8882-8a6e9bd91e24, Desc: Juicy prawns with spicy sauce and rice
Item ID: b20ba076-58a7-4602-9b56-4bee46e98388, Desc: The best nachos north of Mexico
Item ID: 5afced84-ed2d-4520-a06d-dcfeab382e52, Desc: Made with ginseng and joy
Item ID: 4496471c-b098-4915-9a1a-8b9e60043737, Desc: The taste of summer and energy
Item ID: 0de9bba0-1149-40e9-b1a6-7dcecaf68194, Desc: Thick and fine smoothy, satisfying and healthy
Item ID: 25d7bbf6-7dd3-4912-93a7-4186e

In [40]:
def call_recommend(model, user_items, N=20, user_idx=27):
    """Print and return the top-N recommended item indices for one user.

    Args:
        model: object whose ``recommend(user_idx, user_items[user_idx], N=N)``
            returns a pair ``(item_indices, scores)``.
        user_items: user x item matrix; row ``user_idx`` is passed to the model.
        N: maximum number of recommendations to request and report.
        user_idx: row index of the user to recommend for. Defaults to 27, the
            user this notebook inspects.

    Returns:
        The first ``N`` recommended item indices, as returned by the model.

    Notes:
        Item ids and descriptions are looked up in the notebook-level
        ``grouped_df`` and ``df_items`` frames. If the model returns fewer than
        ``N`` items, only the returned ones are printed.
    """
    recommendations_raw = model.recommend(user_idx, user_items[user_idx], N=N)
    print(recommendations_raw)

    item_indices = recommendations_raw[0][:N]  # top-N predicted item indices
    item_scores = recommendations_raw[1][:N]

    print(f"User idx: {user_idx}")
    field_indent = " " * 15  # same column layout as the original report lines
    # Iterate over what the model actually returned instead of assuming N rows.
    for item_idx, score in zip(item_indices, item_scores):
        item_id = grouped_df.ITEM_ID.loc[grouped_df.ITEM_IDX == item_idx].iloc[0]
        item_desc = df_items.PRODUCT_DESCRIPTION.loc[df_items.ITEM_ID == item_id].iloc[0]
        print(f"{field_indent}{item_id},{field_indent}{score},{field_indent}{item_desc}")
    return item_indices

# Recommendations from the model fitted on the masked training matrix.
train_set_predictions = call_recommend(model=model_training, user_items=training_set)

(array([ 342, 2223, 1081,  884, 1938, 2038, 1874, 1748,  453, 1486, 2173,
       2238, 1714,  863, 1385, 1632,    9,  482, 1600,   96], dtype=int32), array([0.8645317 , 0.71201605, 0.7022458 , 0.6850225 , 0.6736598 ,
       0.6689093 , 0.6683763 , 0.6681689 , 0.66321975, 0.602503  ,
       0.5995737 , 0.5941973 , 0.5918359 , 0.5884895 , 0.5775304 ,
       0.5773646 , 0.5768895 , 0.57248485, 0.57224053, 0.57223445],
      dtype=float32))
User idx: 27
               26b2bda2-9397-4d0c-b5bb-9190dbba3acb,               0.8645316958427429,               This television is perfect for every room
               f1c73a48-b83e-45b7-b812-8b0b7f372208,               0.7120160460472107,               This pair of gray boots is sans pareil for the spring
               7397bcf5-06c0-4aee-922f-8d111c10e090,               0.7022457718849182,               Pair of strappy heels for women
               5ecc8c5f-16b7-421a-997b-136f40fecb5b,               0.6850224733352661,               Take this trus

In [41]:
# try the test set
model_testing = implicit.als.AlternatingLeastSquares(alpha=alpha, factors=20, regularization=0.1, iterations=50)

testing_data = (test_set).astype('double')
model_testing.fit(testing_data)


  0%|          | 0/50 [00:00<?, ?it/s]

In [42]:
# Recommendations from the model fitted on the test matrix.
test_set_predictions = call_recommend(model=model_testing, user_items=test_set)


(array([ 453, 1684,  342, 1081, 2223,  108, 1874, 1938, 1488, 1748, 1600,
         28,  884,  244,   96,  482, 1445, 1575, 1869, 1796], dtype=int32), array([0.8264533 , 0.80102104, 0.7680993 , 0.7656698 , 0.75157213,
       0.7370718 , 0.720719  , 0.70890903, 0.70353127, 0.68903124,
       0.66724336, 0.65598506, 0.6492932 , 0.64101374, 0.62479734,
       0.620091  , 0.6179529 , 0.6098746 , 0.6051255 , 0.603426  ],
      dtype=float32))
User idx: 27
               329a1a85-57dd-48c4-a00a-c5e7f6e9ea12,               0.8264533281326294,               Authoritative pair of headphones for listening to your music
               b4a323f8-0571-4bde-afe7-a27bc5394603,               0.8010210394859314,               Accurate pair of headphones for listening to your music
               26b2bda2-9397-4d0c-b5bb-9190dbba3acb,               0.768099308013916,               This television is perfect for every room
               7397bcf5-06c0-4aee-922f-8d111c10e090,               0.7656698226928711

In [43]:
from IPython.core.debugger import set_trace
def AverageReciprocalHitRank(train_set_predictions, test_set_predictions):
    '''
    Sum the reciprocal ranks at which two ranked prediction lists agree.

    The two lists are walked in parallel; whenever both hold the same item at
    the same (1-based) rank, ``1 / rank`` is added to the total. The walk stops
    at the end of the shorter list, so lists of different lengths are accepted.

    Args:
        train_set_predictions: ranked item ids predicted from the training set
            (the one with interactions masked out by make_train_test).
        test_set_predictions: ranked item ids predicted from the test set.

    Returns:
        The accumulated sum rounded to 5 decimal places (0 when no position
        matches or either list is empty).
    '''
    summation = 0
    paired_predictions = zip(train_set_predictions, test_set_predictions)
    for rank, (train_set_item_id, test_set_item_id) in enumerate(paired_predictions, start=1):
        # Only an identical item at the identical rank counts as a hit.
        if test_set_item_id == train_set_item_id:
            summation += 1.0 / rank

    return round(summation, 5)

def average_precision_at_k(y_true, y_pred, k_max=0):
    """Average precision of a ranked prediction list against a set of true items.

    For every predicted item that is also in ``y_true``, the precision at that
    rank (hits so far / rank, ranks starting at 1) is accumulated; the total is
    divided by ``len(y_true)``.

    Args:
        y_true: sequence of unique relevant items.
        y_pred: sequence of unique predicted items, best first.
        k_max: if non-zero, only the first ``k_max`` predictions are scored;
            0 (the default) scores all of them.

    Returns:
        The score rounded to 5 decimal places; 0.0 when ``y_true`` is empty.

    Raises:
        ValueError: if ``y_true`` or ``y_pred`` contains duplicates, or if
            ``k_max`` is negative.
    """
    if k_max is not None and k_max < 0:
        # A negative slice bound would silently drop items from the END of y_pred.
        raise ValueError("k_max must be non-negative")

    # Check if all elements in lists are unique
    relevant_items = set(y_true)
    if len(relevant_items) != len(y_true):
        raise ValueError("Values in y_true are not unique")

    if len(set(y_pred)) != len(y_pred):
        raise ValueError("Values in y_pred are not unique")

    if k_max != 0:
        y_pred = y_pred[:k_max]

    if len(y_true) == 0:
        # No relevant items: define the score as 0 instead of dividing by zero.
        return 0.0

    correct_predictions = 0
    running_sum = 0

    for k, yp_item in enumerate(y_pred, start=1):  # our rank starts at 1
        if yp_item in relevant_items:
            correct_predictions += 1
            running_sum += correct_predictions / k

    return round(running_sum / len(y_true), 5)

# Score the test-set list and the popularity baseline against the train-set list.
for label, ranked_items in (
    ("APK: ", test_set_predictions),
    ("APK (Popular Items): ", top_n_popular_items),
):
    print(label, average_precision_at_k(train_set_predictions, ranked_items))

APK:  0.40773
APK (Popular Items):  0.0


In [44]:
# Score the test-set list against itself, and the popularity baseline against it.
for label, ranked_items in (
    ("APK: ", test_set_predictions),
    ("APK (Popular Items): ", top_n_popular_items),
):
    print(label, average_precision_at_k(test_set_predictions, ranked_items))

APK:  1.0
APK (Popular Items):  0.0


In [45]:
# Ask the training model for its recommendations again and score against them.
check_predictions = call_recommend(model=model_training, user_items=training_set)

for label, ranked_items in (
    ("APK: ", test_set_predictions),
    ("APK (Popular Items): ", top_n_popular_items),
):
    print(label, average_precision_at_k(check_predictions, ranked_items))

(array([ 342, 2223, 1081,  884, 1938, 2038, 1874, 1748,  453, 1486, 2173,
       2238, 1714,  863, 1385, 1632,    9,  482, 1600,   96], dtype=int32), array([0.8645317 , 0.71201605, 0.7022458 , 0.6850225 , 0.6736598 ,
       0.6689093 , 0.6683763 , 0.6681689 , 0.66321975, 0.602503  ,
       0.5995737 , 0.5941973 , 0.5918359 , 0.5884895 , 0.5775304 ,
       0.5773646 , 0.5768895 , 0.57248485, 0.57224053, 0.57223445],
      dtype=float32))
User idx: 27
               26b2bda2-9397-4d0c-b5bb-9190dbba3acb,               0.8645316958427429,               This television is perfect for every room
               f1c73a48-b83e-45b7-b812-8b0b7f372208,               0.7120160460472107,               This pair of gray boots is sans pareil for the spring
               7397bcf5-06c0-4aee-922f-8d111c10e090,               0.7022457718849182,               Pair of strappy heels for women
               5ecc8c5f-16b7-421a-997b-136f40fecb5b,               0.6850224733352661,               Take this trus

In [12]:
# https://nbviewer.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb
# The notebook linked above shows an awesome, extensive process for implicit-feedback systems.

In [13]:
! pip install rank-eval



In [14]:
from rank_eval import Qrels, Run, evaluate

# Toy example: two queries, their judged-relevant documents, and one ranked run.
query_ids = ["q_1", "q_2"]

# Relevance judgements: for each query, the relevant documents and their grades.
relevant_doc_ids = [
    ["doc_12", "doc_25"],  # q_1 relevant documents
    ["doc_11", "doc_2"],  # q_2 relevant documents
]
relevance_grades = [
    [5, 3],  # q_1 relevance judgements
    [6, 1],  # q_2 relevance judgements
]
qrels = Qrels()
qrels.add_multi(
    q_ids=list(query_ids),
    doc_ids=relevant_doc_ids,
    scores=relevance_grades,
)

# Ranked results to evaluate: for each query, the returned documents and their scores.
ranked_doc_ids = [
    ["doc_12", "doc_23", "doc_25", "doc_36", "doc_32", "doc_35"],
    ["doc_12", "doc_11", "doc_25", "doc_36", "doc_2",  "doc_35"],
]
ranking_scores = [
    [0.9, 0.8, 0.7, 0.6, 0.5, 0.4],
    [0.9, 0.8, 0.7, 0.6, 0.5, 0.4],
]
run = Run()
run.add_multi(
    q_ids=list(query_ids),
    doc_ids=ranked_doc_ids,
    scores=ranking_scores,
)

evaluate(qrels, run, ["mrr"])


0.75