In [1]:
import pandas as pd
import numpy as np
import math
import os
import re
import itertools

In [2]:
# Dimensions of the dense rating matrix built by get_input_matrix:
# one row per user, one column per movie.
N_MOVIES = 1000
N_USERS = 10000

In [3]:
def get_input_matrix(path='../data/data_train.csv', n_users=None, n_movies=None):
    '''
    Load the training ratings and arrange them in a dense user x movie matrix.

    The CSV is expected to have an ``Id`` column of the form ``r<user>_c<movie>``
    (1-based indices) and a ``Prediction`` column holding the rating. Ids are
    assumed to be unique; the outcome for repeated ids is unspecified.

    Parameters
    ----------
    path : str, default '../data/data_train.csv'
        Location of the training CSV.

    n_users : int, optional
        Number of rows of the output matrices. Defaults to the module constant N_USERS.

    n_movies : int, optional
        Number of columns of the output matrices. Defaults to the module constant N_MOVIES.

    Return
    ----------
    (data, W): (np.array(n_users, n_movies), np.array(n_users, n_movies))
        The input array with the true ratings and Nan where no ratings were given and the
        array containing 1 where the entries are given and 0 otherwise.
    '''
    # Resolve the defaults lazily so the module constants are only needed when not overridden.
    n_users = N_USERS if n_users is None else n_users
    n_movies = N_MOVIES if n_movies is None else n_movies

    data_pd = pd.read_csv(path)

    # Ids are 1-based in the file; shift to 0-based (column 0: users, column 1: movies).
    # Slicing the columns (instead of squeezing) also works for a one-row file.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values

    data = np.full((n_users, n_movies), np.nan)
    W = np.full((n_users, n_movies), 0)

    # Vectorized scatter of all ratings at once; the mask is 1 only for non-missing ratings.
    data[users, movies] = predictions
    W[users, movies] = ~pd.isna(predictions)

    return (data, W)

In [4]:
X, W = get_input_matrix()

In [5]:
X.shape

(10000, 1000)

In [6]:
def compute_cosine_similarity(X, user=True):
    '''
    Compute the cosine similarity between every pair of users or items.

    Parameters
    ----------
    X : np.array(N_USERS, N_MOVIES)
        The matrix with ratings. Nan entries are treated as 0.

    user : bool, default True
           If True compare the rows of X (users), otherwise compare its columns (items).

    Return
    ----------
    similarity: np.array(N_USERS, N_USERS) if user = True else np.array(N_MOVIES, N_MOVIES):
                The similarity score (between -1 and 1 if X has negative values, between 0 and 1
                if X only has non-negative values, 0 meaning orthogonal vectors) for each pair.
                The returned matrix is symmetric. A vector with zero norm (no ratings at all)
                gets similarity 0 with every vector, itself included, instead of Nan.
    '''
    X = np.nan_to_num(X) # Replace Nan by 0 so that missing entries do not contribute to the dot products

    if not user:
        X = X.T

    # The norms are computed once; the original recomputed them for every row.
    norms = np.linalg.norm(X, axis=1)

    # Inverse norms, with 0 for zero vectors so they yield similarity 0 rather than 0/0 = Nan.
    inverse_norms = np.zeros(norms.shape, dtype=float)
    np.divide(1.0, norms, out=inverse_norms, where=norms != 0)

    # The cosine similarity is the dot product of the unit-normalized vectors.
    unit_rows = X * inverse_norms[:, np.newaxis]
    return unit_rows @ unit_rows.T

In [7]:
def compute_pearson_correlation_coefficient(X, user=True, statistic_to_use="mean"):
    '''
    Compute a Pearson-style correlation between every pair of users or items: each
    vector is centered on its own mean (or median) and the cosine similarity of the
    centered vectors is returned.

    Parameters
    ----------
    X : np.array(N_USERS, N_MOVIES)
        The matrix with ratings (Nan where no rating is available).

    user : bool, default True
           If True correlate the rows of X (users), otherwise its columns (items).

    statistic_to_use: String, either 'mean' or 'median', default 'mean'
                      The statistic subtracted from every vector to center it. It is computed
                      ignoring the Nan entries.

    Return
    ----------
    similarity: np.array(N_USERS, N_USERS) if user = True else np.array(N_MOVIES, N_MOVIES):
                The similarity score (between -1 and 1) for each user-user or item-item pair.
                The returned matrix is therefore symmetric.

    Raises
    ----------
    ValueError
        If statistic_to_use is neither 'mean' nor 'median'.
    '''
    # Work on rows: users as given, items after transposition.
    ratings = X if user else X.T

    if statistic_to_use not in ("mean", "median"):
        raise ValueError(f"{statistic_to_use} is not a valid statistic! Should be 'mean' or 'median'")
    center_of = np.nanmean if statistic_to_use == "mean" else np.nanmedian

    row_centers = center_of(ratings, axis=1)
    centered_ratings = ratings - row_centers.reshape(-1, 1)

    # The rows are already the vectors to compare, hence user=True whatever the caller asked for.
    return compute_cosine_similarity(centered_ratings, user=True)

In [8]:
def compute_SigRA(X, W, user=True):
    '''
    Compute the SiGra (https://ieeexplore.ieee.org/document/8250351) similarity between every pair of users or
    items. Note that this method already uses weighting and hence should not be followed by the
    similarity_weighting function.

    For a pair (u, v) with at least one co-rated entry the score is

        sigmoid((n_u + n_v) / (2 * n_common)) * mean over co-rated entries of min(r_u, r_v) / max(r_u, r_v)

    where n_u and n_v are the numbers of ratings of u and v and n_common is the number of co-rated entries.
    Pairs without co-rated entries get a score of 0.

    Parameters
    ----------
    X : np.array(N_USERS, N_MOVIES)
        The matrix with ratings.

    W : np.array(N_USERS, N_MOVIES):
        The mask containing 1 for rated items and 0 for unrated items.

    user : bool, default True
           If True compare the rows of X (users), otherwise compare its columns (items).

    Return
    ----------
    similarity: np.array(N_USERS, N_USERS) if user = True else np.array(N_MOVIES, N_MOVIES):
                The similarity score for each user-user or item-item pair.
                The returned matrix is symmetric.
    '''
    if not user:
        X = X.T
        W = W.T

    n_rows = X.shape[0]
    similarity = np.zeros((n_rows, n_rows))

    rated = np.asarray(W).astype(bool)
    number_ratings = np.sum(W, axis=1)

    for i in range(n_rows):
        # The score is symmetric: only compare row i with rows i..end and mirror the result.
        common_ratings = np.logical_and(rated[i:], rated[i])
        number_common_ratings = common_ratings.sum(axis=1)
        has_common = number_common_ratings > 0

        # min/max ratio on the co-rated entries only; every other entry stays at 0.
        ratios = np.zeros(common_ratings.shape, dtype=float)
        np.divide(np.minimum(X[i:], X[i]), np.maximum(X[i:], X[i]), out=ratios, where=common_ratings)
        ratios_sum = ratios.sum(axis=1)

        # Placeholder denominator of 1 for pairs without co-rated entries (their score is forced to 0 below).
        safe_common = np.where(has_common, number_common_ratings, 1)
        # number_common_ratings is in the denominator of the exponent, as in the paper.
        weight = 1.0/(1+np.exp(-(number_ratings[i] + number_ratings[i:])/(2*safe_common)))

        scores = np.where(has_common, weight*ratios_sum/safe_common, 0.0)
        similarity[i, i:] = scores
        similarity[i:, i] = scores

    return similarity

In [9]:
def similarity_weighting(similarity, W, method="weighting", threshold=10):
    '''
    Weight the similarity matrix based on the number of ratings of each entry. Without weighting, users having
    just few entries are often considered as closer, which this method tries to prevent.

    Parameters
    ----------
    similarity : np.array(N_USERS, N_USERS) or np.array(N_MOVIES, N_MOVIES)
                 The matrix with similarity between users or movies. Whether it is user or item based
                 is inferred from its size, which is ambiguous if W is square (it is then treated as
                 user based).

    W : np.array(N_USERS, N_MOVIES):
        The mask containing 1 for rated items and 0 for unrated items.

    method: String, either 'weighting', 'significance' or 'sigmoid', default 'weighting'
                      The method used to weight the similarity.
                      Weighting: weight = 2 * n_common / (n_u + n_v) (0 if both have no rating).
                      Significance: weight = min(n_common, threshold) / threshold, i.e. the importance is only
                      reduced when the number of common rated items is below the threshold.
                      Sigmoid: weight = 1 / (1 + exp(-n_common / 2)). It keeps most of the similarity
                      measure almost untouched and hence is the softest weighting method.

    threshold: int, default 10
               Only used if method is 'significance'. Minimum number of common rated items needed to not have
               a decrease in importance. Needs to be adapted depending on the number of common users/items. For
               user-based similarity, should be around 7, for item-based similarity, around 70.
               TODO: read paper to have more knowledge about what a good threshold is.

    Return
    ----------
    weighted_similarity: np.array(N_USERS, N_USERS) or np.array(N_MOVIES, N_MOVIES) depending of shape of similarity:
                The weighted similarity score (between -1 and 1) for each user-user or item-item pair.
                The returned matrix is therefore symmetric.

    Raises
    ----------
    ValueError
        If method is not one of the supported names.
    '''
    assert (similarity.shape[0] == W.shape[0] or similarity.shape[0] == W.shape[1]) and similarity.shape[0] == similarity.shape[1]

    # Validate once, before doing any work, instead of inside the loop.
    if method not in ("weighting", "significance", "sigmoid"):
        raise ValueError(f"{method} is not a valid method! Should be 'weighting', 'significance' or 'sigmoid'")

    if similarity.shape[0] != W.shape[0]:
        W=W.T # We were using the items and not the users for the similarity

    number_ratings = np.sum(W, axis=1)
    # 0/1 float mask: a matrix-vector product then counts the co-rated entries of one row with all rows.
    rated = np.asarray(W).astype(bool).astype(float)

    weighted_similarity = np.zeros_like(similarity)

    for i in range(rated.shape[0]):
        number_common_ratings = rated @ rated[i]

        if method == "weighting":
            total_ratings = number_ratings[i] + number_ratings
            weight = np.zeros(total_ratings.shape, dtype=float)
            # Pairs where neither side has any rating keep a weight of 0.
            np.divide(2*number_common_ratings, total_ratings, out=weight, where=total_ratings != 0)
        elif method == "significance":
            weight = np.minimum(number_common_ratings, threshold)/threshold
        else: # "sigmoid"
            weight = 1.0/(1+np.exp(-number_common_ratings/2))

        weighted_similarity[i, :] = weight*similarity[i, :]

    return weighted_similarity

In [10]:
similarity = compute_cosine_similarity(X)

In [11]:
similarity_items = compute_cosine_similarity(X, False)

In [12]:
similarity_users_pearson=compute_pearson_correlation_coefficient(X, statistic_to_use="median")

In [13]:
similarity_users_pearson

array([[ 1.        , -0.01367172,  0.09147787, ...,  0.14142136,
         0.10314212,  0.05800419],
       [-0.01367172,  1.        ,  0.0125066 , ...,  0.03625262,
         0.0141013 ,  0.0528678 ],
       [ 0.09147787,  0.0125066 ,  1.        , ...,  0.03234231,
         0.03774089,  0.15033949],
       ...,
       [ 0.14142136,  0.03625262,  0.03234231, ...,  1.        ,
         0.10939874,  0.0341793 ],
       [ 0.10314212,  0.0141013 ,  0.03774089, ...,  0.10939874,
         1.        ,  0.08974013],
       [ 0.05800419,  0.0528678 ,  0.15033949, ...,  0.0341793 ,
         0.08974013,  1.        ]])

In [14]:
similarity_items_pearson=compute_pearson_correlation_coefficient(X, False, statistic_to_use="median")

In [15]:
similarity_items_pearson

array([[ 1.        , -0.02009841,  0.04217509, ...,  0.02842744,
         0.0526932 ,  0.03357856],
       [-0.02009841,  1.        , -0.01603365, ..., -0.02378553,
         0.00502976,  0.02908714],
       [ 0.04217509, -0.01603365,  1.        , ...,  0.05456939,
         0.04203638, -0.01824104],
       ...,
       [ 0.02842744, -0.02378553,  0.05456939, ...,  1.        ,
         0.08803759,  0.01628506],
       [ 0.0526932 ,  0.00502976,  0.04203638, ...,  0.08803759,
         1.        ,  0.03081192],
       [ 0.03357856,  0.02908714, -0.01824104, ...,  0.01628506,
         0.03081192,  1.        ]])

In [16]:
weighted_similarity_items_pearson = similarity_weighting(similarity_items_pearson, W)

In [17]:
weighted_similarity_users_pearson = similarity_weighting(similarity_users_pearson, W)

In [18]:
significance_similarity = similarity_weighting(similarity_items, W, method="significance", threshold=7)

In [19]:
sigmoid_similarity = similarity_weighting(similarity_items, W, method="sigmoid")

In [20]:
similartiry_items_SiGra = compute_SigRA(X, W, user=False)

In [21]:
similartiry_items_SiGra

array([[0.73105858, 0.72079709, 0.73050547, ..., 0.68383528, 0.72625998,
        0.78522803],
       [0.72079709, 0.73105858, 0.68952065, ..., 0.67338861, 0.74773731,
        0.75743673],
       [0.73050547, 0.68952065, 0.73105858, ..., 0.71954167, 0.76983944,
        0.69953118],
       ...,
       [0.68383528, 0.67338861, 0.71954167, ..., 0.73105858, 0.77151787,
        0.71935246],
       [0.72625998, 0.74773731, 0.76983944, ..., 0.77151787, 0.73105858,
        0.75868866],
       [0.78522803, 0.75743673, 0.69953118, ..., 0.71935246, 0.75868866,
        0.73105858]])

In [22]:
def weighted_average_predict(X, W, similarity, k=10, with_std = False, verbose=True, min_similarity_neighbor=0):
    '''
    Predict the missing values by a weighted average of the ratings of the k nearest neighbors with a weight
    corresponding to their similarity. Take into account the mean value of the user (respectively item). Uses
    only item based similarity matrix or user based similarity matrix.

    For a missing entry (i, j), the neighbors are the rows having a rating for j and a similarity to i above
    min_similarity_neighbor; the k most similar ones are used:

        prediction = mean_i + sum(sim * (r_neighbor_j - mean_neighbor)) / sum(sim)

    With with_std the deviations are divided by the neighbor's std and the sum is multiplied by the std of i.
    If no neighbor is available the prediction is the mean of row i.


    Parameters
    ----------
    X : np.array(N_USERS, N_MOVIES)
        The matrix with ratings (Nan where no rating is available).

    W : np.array(N_USERS, N_MOVIES):
        The mask containing 1 for rated items and 0 for unrated items.

    similarity : np.array(N_USERS, N_USERS) or np.array(N_MOVIES, N_MOVIES)
                 The matrix with similarity between users or movies. It is treated as item based when its
                 size differs from the number of rows of W.

    k: int, default 10
       Number of nearest neighbors

    with_std: bool, default False
              If set to true, take into account the std to compute a Z-score when computing the weights.
              A row with at most one rating, or a neighbor whose ratings are all identical, uses a std of 1.

    verbose: bool, default True:
             If set to True, print update messages

    min_similarity_neighbor: int, default 0:
            Minimum value needed for the similarity to be considered as a neighbor. For PCC, should be 0, for
            similarities ranging between 0 and 1 should be close to 0.5.

    Return
    ----------
    predictions: np.array(N_USERS, N_MOVIES)
                 The known ratings together with the predictions for the missing values, clipped to [1, 5].

    confidence: np.array(N_USERS, N_MOVIES)
                 The confidence based on the similarity of neighbors used to compute it (if neighbors have high
                 similarity, will give high confidence): 1 for known ratings, the sum of the similarities of
                 the neighbors used for predicted entries, 0 when no neighbor was available.
    '''
    MIN_POSSIBLE_RATING = 1
    MAX_POSSIBLE_RATING = 5
    default_rating = (MAX_POSSIBLE_RATING+MIN_POSSIBLE_RATING)/2 # Used as mean of a row without any rating

    was_transposed=False
    printing_interval=200

    if similarity.shape[0] != W.shape[0]: # We were using the items and not the users for the similarity
        was_transposed=True
        W=W.T
        X=X.T
        printing_interval=30

    predictions = X.astype(float) # astype returns a copy, the caller's matrix is never modified
    confidence = np.ones_like(predictions)

    # Row statistics are computed once up front; the original recomputed the neighbors' means
    # (and stds) for every missing entry.
    number_rated = np.sum(W, axis=1)
    number_observed = np.sum(~np.isnan(X), axis=1)
    row_means = np.full(X.shape[0], default_rating, dtype=float)
    np.divide(np.nansum(X, axis=1), number_observed, out=row_means, where=number_observed > 0)

    if with_std:
        squared_deviations = np.nansum((X-row_means.reshape(-1,1))**2, axis=1)
        row_stds = np.ones(X.shape[0], dtype=float) # std of 1 when there are not enough ratings to estimate it
        enough_ratings = number_rated > 1
        row_stds[enough_ratings] = np.sqrt(squared_deviations[enough_ratings]/(number_rated[enough_ratings]-1))
        # A neighbor whose ratings are all identical has a std of 0: use 1 so that its (null) deviation
        # contributes 0 instead of 0/0 = Nan.
        neighbor_stds = np.where(row_stds > 0, row_stds, 1.0)

    for i in range(X.shape[0]):
        user_mean = row_means[i]
        row_similarity = similarity[i, :]
        similar_enough = row_similarity > min_similarity_neighbor

        for j in range(X.shape[1]):
            if W[i, j]:
                continue # Known rating, nothing to predict

            possible_neighbors = np.where(np.logical_and(W[:, j], similar_enough))[0]
            sorted_possible_neighbors = possible_neighbors[np.flip(np.argsort(row_similarity[possible_neighbors]))]
            nearest_neighbors = sorted_possible_neighbors[:k]

            if nearest_neighbors.shape[0] == 0:
                predictions[i, j] = user_mean
                confidence[i, j] = 0
                continue

            neighbors_similarity = row_similarity[nearest_neighbors]
            total_similarity = np.sum(neighbors_similarity)
            deviations = X[nearest_neighbors, j]-row_means[nearest_neighbors]

            if with_std:
                z_scores = deviations/neighbor_stds[nearest_neighbors]
                predictions[i, j] = user_mean + row_stds[i] * np.sum(np.multiply(neighbors_similarity, z_scores))/total_similarity
            else:
                predictions[i, j] = user_mean + np.sum(np.multiply(neighbors_similarity, deviations))/total_similarity

            confidence[i, j] = total_similarity

        if verbose and (i==X.shape[0]-1 or (not i%printing_interval and i!=0)):
            similarity_type = "user" if not was_transposed else "item"
            print(f"Done with {similarity_type} {i}/{X.shape[0]}")

    predictions = np.clip(predictions, MIN_POSSIBLE_RATING, MAX_POSSIBLE_RATING) # Might exceed it, so we clip to correct range

    if was_transposed:
        predictions = predictions.T
        confidence = confidence.T

    return predictions, confidence

In [23]:
def min_max_normalization(X):
    '''
    Apply min-max normalization to the given matrix, not taking into account the Nan values.


    Parameters
    ----------
    X : np.array(N_USERS, N_MOVIES)
        The matrix with ratings.

    Return
    ----------
    X_norm: np.array(N_USERS, N_MOVIES)
        The matrix with ratings normalized in range [0, 1]. Nan entries stay Nan. If all the
        observed values are equal, they are all mapped to 0 (instead of dividing by zero).
    '''
    min_val = np.nanmin(X)
    max_val = np.nanmax(X)
    value_range = max_val-min_val

    if value_range == 0:
        # Constant matrix: there is no spread to rescale, only shift the values to 0.
        return X-min_val

    return (X-min_val)/value_range

In [84]:
def _csr_reinforce(target, previous, other_similarity, rated, ratings, alpha, neutral_similarity):
    """
    Run one CSR pass over every pair of rows of `ratings`, updating `target` in place (symmetrically).

    target : similarity matrix between the rows, receives the new values
    previous : snapshot of that matrix taken before the current iteration
    other_similarity : similarity matrix between the columns of `ratings` (read only)
    rated : list with, for each row, the indices of its rated columns
    ratings : normalized rating matrix whose rows are the entities being compared
    """
    n_rows = len(rated)
    for first in range(n_rows):
        rated_first = rated[first]
        if rated_first.shape[0]==0: #Should not happen
            continue # Skip only this row; a `break` here would drop all the remaining rows too
        ratings_first = ratings[first, rated_first][:, np.newaxis]

        for second in range(first+1, n_rows):
            rated_second = rated[second]
            if rated_second.shape[0]==0: #Should not happen
                continue

            # Similarities of all (rated by first, rated by second) pairs, shape (len(rated_first), len(rated_second))
            pair_similarity = other_similarity[np.ix_(rated_first, rated_second)]
            # Pairs whose similarity equals the neutral value are neither positive nor negative and are ignored
            informative = pair_similarity != neutral_similarity

            # Ratings are in [0, 1]: identical ratings weigh +1, opposite ratings weigh -1
            w = (1-2*np.abs(ratings_first-ratings[second, rated_second][np.newaxis, :]))[informative]
            total_w = np.sum(np.abs(w))

            if total_w != 0:
                update = np.sum(w*pair_similarity[informative])/total_w
                new_sim = (1-alpha)*previous[first, second] + alpha * update
                target[first, second] = new_sim
                target[second, first] = new_sim


def compute_CSR(users_similarity, items_similarity, X, W, alpha=0.5 , epsilon=0.1, max_iter=10, min_similarity_positive=0, verbose=True):
    """
    Implement the Comprehensive Similarity Reinforcement algorithm, which refine the user similarity using items similarity
    and vice versa. See paper at https://dl.acm.org/doi/pdf/10.1145/3062179 for more informations.

    Parameters
    ----------
    users_similarity : np.array(N_USERS, N_USERS)
                 The matrix with similarity between users. In paper, they use a weighted PCC.

    items_similarity: np.array(N_MOVIES, N_MOVIES)
                 The matrix with similarity between items. In paper, they use a weighted PCC.

    X : np.array(N_USERS, N_MOVIES)
        The matrix with ratings.

    W : np.array(N_USERS, N_MOVIES):
        The mask containing 1 for rated items and 0 for unrated items.

    alpha: float in range [0,1], default 0.5
        Update parameter. If small, the reinforced matrix will change only slightly at each iteration.

    epsilon: float, default 0.1
        Threshold used to stop the iterations when the Frobenius norm of the difference between two
        iterations is below it for both user and item matrix.

    max_iter: int, default 10
        Maximum number of iterations.

    min_similarity_positive: int, default 0:
        Neutral similarity value: pairs whose similarity is exactly equal to it are ignored in the
        updates. For PCC, should be 0, for similarities ranging between 0 and 1 should be close to 0.5.

    verbose: bool, default True:
        If set to True, print update messages


    Return
    ----------
    users_reinforced_similarity: np.array(N_USERS, N_USERS)
                 The reinforced user similarity matrix

    items_reinforced_similarity: np.array(N_MOVIES, N_MOVIES)
                 The reinforced item similarity matrix
    """
    assert alpha >= 0 and alpha <= 1
    users_reinforced_similarity = users_similarity.copy()
    items_reinforced_similarity = items_similarity.copy()

    X = min_max_normalization(X)

    # The rated indices never change between iterations: compute them once.
    rated_items_per_user = [np.where(W[user, :])[0] for user in range(X.shape[0])]
    rated_users_per_item = [np.where(W[:, item])[0] for item in range(X.shape[1])]

    for iter_cur in range(max_iter):
        last_users_reinforced_similarity = users_reinforced_similarity.copy()
        last_items_reinforced_similarity = items_reinforced_similarity.copy()

        #Update users, using the item similarities of the previous iteration
        _csr_reinforce(users_reinforced_similarity, last_users_reinforced_similarity,
                       last_items_reinforced_similarity, rated_items_per_user, X,
                       alpha, min_similarity_positive)

        if verbose:
            print(f"Done with users of iteration {iter_cur+1}/{max_iter}")

        #Update items
        # NOTE(review): this pass reads the user similarities refreshed just above, whereas the user pass
        # reads the item similarities of the previous iteration. Kept as in the original implementation;
        # confirm against the paper whether the previous user similarities should be used instead.
        _csr_reinforce(items_reinforced_similarity, last_items_reinforced_similarity,
                       users_reinforced_similarity, rated_users_per_item, X.T,
                       alpha, min_similarity_positive)

        if verbose:
            print(f"Done with iteration {iter_cur+1}/{max_iter}")

        #Check for stopping condition
        dU = np.linalg.norm(users_reinforced_similarity-last_users_reinforced_similarity, ord="fro")
        dI = np.linalg.norm(items_reinforced_similarity-last_items_reinforced_similarity, ord="fro")

        if dU < epsilon and dI < epsilon:
            if verbose:
                print(f"Early stopping due to convergence (difference between two runs smaller than epsilon = {epsilon})")
            break

    return users_reinforced_similarity, items_reinforced_similarity

In [85]:
# Try CSR on a small corner of the data only (first 100 users x first 300 items),
# starting from the weighted Pearson similarities computed above.
n_userrrr = 100
n_itemsss = 300
users_reinforced_similarity, items_reinforced_similarity = compute_CSR(weighted_similarity_users_pearson[:n_userrrr, :n_userrrr], weighted_similarity_items_pearson[:n_itemsss, :n_itemsss], X[:n_userrrr, :n_itemsss], W[:n_userrrr, :n_itemsss], max_iter=5)

Done with users of iteration 1/5
Done with iteration 1/5
Done with users of iteration 2/5
Done with iteration 2/5
Done with users of iteration 3/5
Done with iteration 3/5
Done with users of iteration 4/5
Done with iteration 4/5
Early stopping due to convergence (difference between two runs smaller than epsilon = 0.1)


In [83]:
users_reinforced_similarity

array([[1.        , 0.00839795, 0.0099411 , ..., 0.0322515 , 0.00354424,
        0.01389177],
       [0.00839795, 1.        , 0.00898864, ..., 0.00932259, 0.01243429,
        0.00497444],
       [0.0099411 , 0.00898864, 1.        , ..., 0.00894849, 0.00684133,
        0.00844839],
       ...,
       [0.0322515 , 0.00932259, 0.00894849, ..., 1.        , 0.0123078 ,
        0.00723837],
       [0.00354424, 0.01243429, 0.00684133, ..., 0.0123078 , 1.        ,
        0.00563348],
       [0.01389177, 0.00497444, 0.00844839, ..., 0.00723837, 0.00563348,
        1.        ]])

(array([  0,   0,   0, ..., 299, 299, 299]),
 array([  1,   4,   5, ..., 284, 288, 293]))

In [360]:
# Predict on a small corner of the data only (first 50 users x first 30 items): once from the
# user-user Pearson similarity and once from the item-item one (both unweighted).
n_userrrr = 50
n_itemsss = 30
users_pred, users_confidence = weighted_average_predict(X[:n_userrrr, :n_itemsss], W[:n_userrrr, :n_itemsss], similarity_users_pearson[:n_userrrr, :n_userrrr])
items_pred, items_confidence = weighted_average_predict(X[:n_userrrr, :n_itemsss], W[:n_userrrr, :n_itemsss], similarity_items_pearson[:n_itemsss, :n_itemsss])

Done with user 49/50
Done with item 29/30


  user_mean = np.nanmean(X[i, :])


In [348]:
def predict_using_users_and_items(users_pred, items_pred, users_confidence, items_confidence, user_weight=0.5):
    """
    Compute the final prediction using both user and items predictions: each entry is the average of the
    two predictions weighted by user_weight * users_confidence and (1 - user_weight) * items_confidence.

    The confidence arrays passed by the caller are not modified.

    Parameters
    ----------
    users_pred : np.array(N_USERS, N_MOVIES)
                 The matrix of predictions based on the neighbors of the users

    items_pred: np.array(N_USERS, N_MOVIES)
                 The matrix of predictions based on the neighbors of the items

    users_confidence : np.array(N_USERS, N_MOVIES)
                 The matrix of confidence based on the neighbors of the users

    items_confidence: np.array(N_USERS, N_MOVIES)
                 The matrix of confidence based on the neighbors of the items

    user_weight: float in range [0,1], default 0.5
        Used to add manually more weight to user or items prediction


    Return
    ----------
    final_predictions: np.array(N_USERS, N_MOVIES)
                 The Final prediction
    """
    assert user_weight>=0 and user_weight<=1

    users_share = user_weight*np.asarray(users_confidence, dtype=float)
    items_share = (1-user_weight)*np.asarray(items_confidence, dtype=float)
    total_share = users_share + items_share

    # Entries without any usable confidence (should not happen). To avoid a division by 0 they are
    # handled as if both confidences were 1: both predictions are then the plain mean of the user
    # respectively item, mixed according to user_weight.
    no_predictions = total_share == 0
    users_share = np.where(no_predictions, user_weight, users_share)
    items_share = np.where(no_predictions, 1-user_weight, items_share)
    total_share = np.where(no_predictions, 1.0, total_share)

    weight_users = users_share/total_share
    weight_items = items_share/total_share

    final_predictions = users_pred*weight_users + items_pred*weight_items
    return final_predictions

In [363]:
predict_using_users_and_items(users_pred, items_pred, users_confidence, items_confidence)

array([[4.92857143, 4.48859473, 4.83333333, ..., 4.11538462, 5.        ,
        5.        ],
       [4.72389621, 2.81285549, 4.30962288, ..., 4.36084074, 3.026917  ,
        4.58977164],
       [2.01847706, 1.02956358, 1.62142332, ..., 2.55555556, 2.09398842,
        2.40257751],
       ...,
       [3.51505848, 2.18800346, 3.01577568, ..., 2.63415318, 3.07987385,
        3.03146004],
       [3.58266908, 2.22780132, 3.51063618, ..., 2.40858109, 3.06530814,
        3.75771876],
       [3.15462872, 2.53945976, 3.32887286, ..., 1.37194277, 2.93427735,
        4.        ]])

In [176]:
def generate_submission(pred, submission_name="submission", compression_type ="zip", sample_path='../data/sampleSubmission.csv'):
    '''
    Write a submission file containing the predictions for the entries listed in the sample submission.

    Parameters
    ----------
    pred : np.array(N_USERS, N_MOVIES)
        The matrix of predictions; the entry for id ``r<i>_c<j>`` is read at pred[i-1][j-1].

    submission_name : str, default "submission"
        Path of the file to write.

    compression_type : str or None, default "zip"
        Forwarded to pandas ``to_csv`` as ``compression``.

    sample_path : str, default '../data/sampleSubmission.csv'
        Sample submission listing, in its ``Id`` column, the entries to predict.

    Raises
    ----------
    ValueError
        If an id of the sample submission does not match ``r<i>_c<j>``.
    '''
    sample = pd.read_csv(sample_path) # load sample submission

    # Parse all the ids in one vectorized pass instead of a regex call per row.
    ids = sample["Id"].str.extract(r'r(\d+)_c(\d+)')
    if ids.isna().any().any():
        raise ValueError("The sample submission contains ids that are not of the form r<i>_c<j>")

    # Ids are 1-based, the prediction matrix is 0-based.
    rows = ids[0].astype(int).to_numpy() - 1
    cols = ids[1].astype(int).to_numpy() - 1
    sample["Prediction"] = np.asarray(pred, dtype=float)[rows, cols]

    sample.to_csv(submission_name, compression=compression_type, float_format='%.3f', index=False)

In [570]:
def submit_on_kaggle(name="submission.zip", message=None):
    '''
    Submit a solution on kaggle.

    The kaggle command line tool is called with an argument list and without going through a shell,
    so file names and messages containing spaces or shell metacharacters are passed as they are.

    Parameters
    ----------
    name: str (optional)
        name of the file to submit
    message: str (optional)
        Message to use with the submission. Makes easier to 
        understand what each submission is about

    Raises
    ----------
    FileNotFoundError
        If the ``kaggle`` executable cannot be found.
    '''
    import subprocess # local import: only this function needs it

    command = ["kaggle", "competitions", "submit", "-c", "cil-collaborative-filtering-2022", "-f", str(name)]

    if message is not None:
        # One list element = one argument: the whole message reaches `-m`, whatever it contains.
        command += ["-m", str(message)]

    # check=False: as before, a failed submission does not raise.
    subprocess.run(command, check=False)

In [568]:
# NOTE(review): `pred` is not assigned in any cell shown in this notebook — presumably it is left over
# from kernel state; define it explicitly before this cell so that Restart & Run All works.
generate_submission(pred, submission_name="sigmoid_cosine_similarity_items_10nn")

In [571]:
submit_on_kaggle(name="sigmoid_cosine_similarity_items_10nn", message="Similarity based using items, cosine similarity with sigmoid weighting and 10 nearest neighbors")

In [559]:
pred

array([[3.48457718, 3.46575115, 3.29454848, ..., 2.79584825, 2.99129092,
        3.33164412],
       [3.34920169, 2.47014907, 2.84239086, ..., 5.        , 3.        ,
        3.        ],
       [2.27405046, 3.02060159, 2.87631228, ..., 2.10074353, 2.37203414,
        2.56367779],
       ...,
       [2.91337448, 3.36292369, 3.19491513, ..., 2.76052163, 2.91109604,
        3.4841525 ],
       [2.80468284, 3.44195667, 3.54328259, ..., 2.66757643, 2.83965279,
        3.04405327],
       [2.87667227, 3.44069647, 3.30077702, ..., 2.98914642, 3.08887743,
        3.        ]])