In [76]:
#from google.colab import drive
#drive.mount('/content/drive')
#PATH = '/content/drive/MyDrive/SkillFactory/DS_ML_data/'

In [77]:
#!pip install numpy
#!pip install pandas
#!pip install scipy
#!pip install scikit-learn
#!pip install --upgrade implicit
#!pip install pickle

In [78]:
import numpy as np
import pandas as pd
import os
import scipy
from scipy.sparse import coo_matrix
import implicit
from implicit.evaluation import mean_average_precision_at_k
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split
import pickle

In [79]:
PATH = f'../data/'

In [80]:
pwd

'/home/as/PycharmProjects/RecSys_ASL/model'

# data upload

In [81]:
events = pd.read_csv(PATH + 'events.csv')

In [82]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [83]:
events.head(3)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,


In [84]:
#object lables encoding
events.event.replace('view', '1', inplace=True)
events.event.replace('addtocart', '2', inplace=True)
events.event.replace('transaction', '3', inplace=True)
events['event'] = events.event.astype(int)

In [85]:
#definition of valid users, who have not only looked up the items, but also have bought
viewed_users = events[events['event'] == 1]['visitorid']
bought_users = events[events['event'] == 3]['visitorid']
users = viewed_users[viewed_users.isin(bought_users)].unique()
print(f' {len(users)}  actual users IDs which have not only viewed but also bought the items in the sesions of the dataset')

 11291  actual users IDs which have not only viewed but also bought the items in the sesions of the dataset


In [86]:
#filtered sessions by actual users ids
events = events[events['visitorid'].isin(users)]
items = events['itemid'].unique()
print(f' {len(items)} item IDs in the datasets which have been actioned by actual users')
print(f' {len(events)} actual datasets')

 36204 item IDs in the datasets which have been actioned by actual users
 229708 actual datasets


In [87]:
df = pd.DataFrame()

top_3 = list(events.itemid.value_counts()[:3].index)

user_ids = dict(list(enumerate(users)))
item_ids = dict(list(enumerate(items)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

df['user_id'] = events['visitorid'].map(user_map)
df['item_id'] = events['itemid'].map(item_map)

  top_3 = list(events.itemid.value_counts()[:3].index)


In [88]:
top_3

[461686, 119736, 312728]

# create coo_matrix (user x item) and csr matrix (user x item)

In [89]:
row = df['user_id'].values
col = df['item_id'].values
data = np.ones(df.shape[0])
coo_train = coo_matrix((data, (row, col)), shape=(len(users), len(items)))
coo_train

<11291x36204 sparse matrix of type '<class 'numpy.float64'>'
	with 229708 stored elements in COOrdinate format>

## check the coo_ matrix

In [90]:
%%time
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)



  0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 2.2 s, sys: 5.25 s, total: 7.45 s
Wall time: 554 ms


# model & validation

In [91]:
def to_user_item_coo(df):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(users), len(items)))
    return coo

In [92]:
def get_val_matrices(df, test_size= 0.2):
    """ Split into training and validation and create various matrices
        
        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    
    """
    df_train, df_val = train_test_split(df, test_size= test_size, shuffle=False)


    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val
          }

In [93]:
def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with MAP@3
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@3to a library that allows repeated items in prediction
    map3= mean_average_precision_at_k(model, csr_train, csr_val, K=3, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@3: {map3:6.5f}")
    return map3

In [94]:
matrices = get_val_matrices(df)

# tuning the parameters

In [95]:

%%time
best_map3 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map3 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map3 > best_map3:
                best_map3 = map3
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@3 found. Updating: {best_params}")




Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.00610
Best MAP@3 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}




Factors:  40 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00750
Best MAP@3 found. Updating: {'factors': 40, 'iterations': 12, 'regularization': 0.01}




Factors:  40 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00738




Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00740




Factors:  40 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00756
Best MAP@3 found. Updating: {'factors': 40, 'iterations': 20, 'regularization': 0.01}




Factors:  50 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.00792
Best MAP@3 found. Updating: {'factors': 50, 'iterations': 3, 'regularization': 0.01}




Factors:  50 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00779




Factors:  50 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00777




Factors:  50 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00796
Best MAP@3 found. Updating: {'factors': 50, 'iterations': 15, 'regularization': 0.01}




Factors:  50 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00822
Best MAP@3 found. Updating: {'factors': 50, 'iterations': 20, 'regularization': 0.01}




Factors:  60 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.00692




Factors:  60 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00760




Factors:  60 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00773




Factors:  60 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00765




Factors:  60 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00756




Factors: 100 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.00751




Factors: 100 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00800




Factors: 100 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00800




Factors: 100 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00800




Factors: 100 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00788




Factors: 200 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.00802




Factors: 200 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00817




Factors: 200 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00789




Factors: 200 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00787




Factors: 200 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00818




Factors: 500 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.01019
Best MAP@3 found. Updating: {'factors': 500, 'iterations': 3, 'regularization': 0.01}




Factors: 500 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00842




Factors: 500 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00828




Factors: 500 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00811




Factors: 500 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00804




Factors: 1000 - Iterations:  3 - Regularization: 0.010 ==> MAP@3: 0.00966




Factors: 1000 - Iterations: 12 - Regularization: 0.010 ==> MAP@3: 0.00698




Factors: 1000 - Iterations: 14 - Regularization: 0.010 ==> MAP@3: 0.00680




Factors: 1000 - Iterations: 15 - Regularization: 0.010 ==> MAP@3: 0.00696




Factors: 1000 - Iterations: 20 - Regularization: 0.010 ==> MAP@3: 0.00648
CPU times: user 1h 35min 13s, sys: 1h 18min 14s, total: 2h 53min 27s
Wall time: 11min 4s


In [96]:
best_params

{'factors': 500, 'iterations': 3, 'regularization': 0.01}

In [97]:
del matrices

## training over the full dataset

In [98]:
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()

In [99]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    return model

In [100]:
import json

# Serialize of scipy.sparse.csr_matrix into file:
#pickle.dump(csr_train, open(PATH + "csr_train.pkl","wb"))

# Read of scipy.sparse.csr_matrix from file:
#csr_train = pickle.load(open(PATH + "csr_train.pkl","rb"))

# Serialize list of valid users into file:
#with open(PATH + 'valid_users.npy', 'wb') as f:
  #np.save(f, users)

# Read list of valid users from file:
#with open(PATH + 'valid_users.npy', 'rb') as f:
  #users = np.load(f)

# Serialize list of items into file:
#with open(PATH + 'items.npy', 'wb') as f:
  #np.save(f, items)

# Read list of items from file:
#with open(PATH + 'items.npy', 'rb') as f:
  #items = np.load(f)

# Serialize top_3 items into file:
with open(PATH + 'top_3_items.npy', 'wb') as f:
  np.save(f, top_3)

# Read list of top_3 items from file:
with open(PATH + 'top_3_items.npy', 'rb') as f:
  top_3 = np.load(f)

# Serialize best_params into file:
#json.dump(best_params, open(PATH + "best_params.json", 'w' ) )

# Read best_params from file:
#best_params = json.load(open(PATH + "best_params.json" ) )

In [101]:
model = train(coo_train, **best_params)



  0%|          | 0/3 [00:00<?, ?it/s]

In [102]:
# Serialize model into file:
#with open(PATH + 'model.pkl', 'wb') as saved_model:
    #pickle.dump(model, saved_model)

# Read model from file:
#with open(PATH +'model.pkl', 'rb') as loaded_model:
    #model = pickle.load(loaded_model)

# submission

In [103]:
def submit(model, csr_train, submission_name="submissions.csv"):
    preds = []
    batch_size = 2000
    to_generate = np.arange(len(users))
    for startidx in range(0, len(to_generate), batch_size):
        batch = to_generate[startidx : startidx + batch_size]
        ids, scores = model.recommend(batch, csr_train[batch], N=3, filter_already_liked_items=False)
        for i, userid in enumerate(batch):
            customer_id = user_ids[userid]
            user_items = ids[i]
            article_ids = [item_ids[item_id] for item_id in user_items]
            preds.append((customer_id, ','.join(str(v) for v in article_ids)))

    df_preds = pd.DataFrame(preds, columns=['user_id', 'recommended_items'])
    df_preds.to_csv(PATH + submission_name, index=False)
    
    return df_preds

In [104]:
def recommends(_user_id):
      while True:
        #_user_id = int(input("Enter user id (0 for exit):\n").strip(" \n"))
        if _user_id == 0:
          print("Here is no user with user_id '0'")
          break

        elif _user_id in users:
          pred_df = submit(model, csr_train)
          return pred_df[pred_df['user_id']== _user_id]
        
        else:
          top_3 = list(events.itemid.value_counts()[:3].index)
          df_preds = pd.DataFrame({'user_id': [_user_id], 'recommended_items': [top_3] })
          return df_preds


In [105]:
recommends(820159)

Unnamed: 0,user_id,recommended_items
5,820159,398489369447309275


In [106]:
pd.read_csv(PATH + 'submissions.csv').head(3)

Unnamed: 0,user_id,recommended_items
0,1076270,36035262799269430
1,361387,43485352541409804
2,712443,346655445821349140


In [107]:
top_3

array([461686, 119736, 312728])

In [109]:
print(f' The average precision is {map3}')

 The average precision is 0.006475258420783712
