In [1]:
# Import the necessary dependencies

# Operating System
import os
import copy 
import json

# Numpy, Pandas and Scipy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix, save_npz, load_npz

# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Model Evaluation
from evaluation import evaluate

In [2]:
def read_users_history(features=None) -> pd.DataFrame:
    """Imports the listening history for each user.
    
    Returns:
        data (pd.DataFrame): DataFrame with the user for each user. 
                             The rows are tuples of (user, song_id, rating).
                             
    """
    path = os.path.join('data', 'train_reviews.csv')
    data = pd.read_csv(path, sep=',')
    data["review_date"] = data.review_date.apply(lambda x: pd.to_datetime(x))
    data = data.sort_values(by=["review_date"])
    if features != None:
        return data[features]
    else:
        return data

data = read_users_history(['user_id', 'product_id', 'overall'])
data.head()

Unnamed: 0,user_id,product_id,overall
195,A1M2T0J45TTE64,B00004S9I0,5
5362,A11I1I9QLMAM1A,B0000D80FM,4
146,AGFW667QNHDOY,B00000IURU,5
193,A1MR1VMK999I6O,B00004S9I0,5
1024,A96JD9312DHWC,B00005JD40,5


In [3]:
data.shape

(268169, 3)

In [4]:
def read_test_users() -> pd.DataFrame:
    """Imports the list of users for which we need to predict.
    
    Returns:
        users_to_pred (pd.DataFrame): DataFrame with the users for which we will recommend songs. 
    """

    path = os.path.join('data', 'test_users.csv')
    users_to_pred_ = pd.read_csv(path) #names = ['users to recommend products']
    users_to_pred_ = users_to_pred_.rename(columns={"user_id": "users to recommend products"})
    return users_to_pred_


users_to_pred = read_test_users()
users_to_pred.head()

Unnamed: 0,users to recommend products
0,A0029274J35Q1MYNKUWO
1,A0103849GBVWICKXD4T6
2,A01685981QK9IX1Q16YZY
3,A02904661A62AP64S46MT
4,A036147939NFPC389VLK


In [5]:
print(f"We have {len(users_to_pred)} users in need for better products!")

We have 9225 users in need for better products!


In [6]:
def get_indices_from_users_to_pred(users_to_pred: pd.DataFrame, data: pd.DataFrame):
    """Get the indices of users_to_pred for which we have data and for which we don't.
    
    Args:
        users_to_pred (pd.DataFrame): DataFrame containing the list of users we are going to recommend items.
        data (pd.DataFrame): Original of listening history for the users.
        
    Returns:
        index_users_in_data (Int64Index): Index that filters the users_to_pred to get the user_id's with training data.
        index_users_not_in_data (Int64Index): Index that filters the users_to_pred to get the user_id's without training data.
        
    """
    index_users_in_data = users_to_pred[users_to_pred.isin(data.user_id.values).values].index
    index_users_not_in_data = users_to_pred[~users_to_pred.isin(data.user_id.values).values].index
    
    return index_users_in_data, index_users_not_in_data

index_users_in_data, index_users_not_in_data = get_indices_from_users_to_pred(users_to_pred, data)

In [7]:
# For further inspection, we advise you to look at the objects themselves.
print(f"The index for users which we have training data has length of {len(index_users_in_data)}.")
print(f"The index for users which we don't have training data has length of {len(index_users_not_in_data)}.")

The index for users which we have training data has length of 8343.
The index for users which we don't have training data has length of 882.


In [8]:
def get_users_to_pred_by_index(users_to_pred, index_users_in_data):
    """DataFrame with user_id's in test set for for which we have training data.

    Args: 
        users_to_pred (pd.DataFrame): DataFrame containing the list of users we are going to recommend items.
        index_users_in_data (Int64Index): Index that filters the users_to_pred to get the user_id's with training data.
    Returns:
        users_in_data (pd.DataFrame): Dataframe containing the list of user_id's with training data.
    
    """
    return users_to_pred.iloc[index_users_in_data].reset_index(drop=True)

# Get the test users with training data
test_users_in_data = get_users_to_pred_by_index(users_to_pred, index_users_in_data)
test_users_in_data.head()

Unnamed: 0,users to recommend products
0,A0029274J35Q1MYNKUWO
1,A0103849GBVWICKXD4T6
2,A01685981QK9IX1Q16YZY
3,A02904661A62AP64S46MT
4,A036147939NFPC389VLK


In [9]:
# Get the test users without training data
test_users_not_in_data = get_users_to_pred_by_index(users_to_pred, index_users_not_in_data)
print(test_users_not_in_data.shape)
test_users_not_in_data.head()

(882, 1)


Unnamed: 0,users to recommend products
0,A04904273OLXHXFW1AQK2
1,A100US3LDAJU51
2,A10885J2DS1NQ6
3,A108M7R9UBH5LT
4,A10W7NGRNY5GTR


In [10]:
# Let's store a Series with the unique user id's that we have in the original data.
def get_unique_users(data):
    """Get unique users in training data.
    
    Args:
        data (pd.DataFrame):  listening history for the users.
        
    Returns:
        unique_users (pd.DataFrame): DataFrame of one column with unique users in training data.
    
    """
    return pd.DataFrame(np.unique(data.iloc[:, 0].values), columns=["users to recommend products"])


unique_users_training_data = get_unique_users(data)
unique_users_training_data.head()

Unnamed: 0,users to recommend products
0,A00046902LP5YSDV0VVNF
1,A0029274J35Q1MYNKUWO
2,A004507634IQ8TNQ3YTUI
3,A005011233SVRED9Q0VY0
4,A00676013IBZZQ3UYWK47


In [11]:
def get_indices_in_ratings_for_test_users_in_data(data: pd.DataFrame, test_users_in_data: pd.DataFrame):
    """Returns the index of the ratings matrix for the test users for which we have training data.
    
    Args:
        data (pd.DataFrame): DataFrame with the user for each user. 
                             The rows are tuples of (user, song_id, rating).
                             
        test_users_in_data (pd.DataFrame): DataFrame containing the list of test users for which we have training data.
        
    Returns:
        indices_ratings_tests_users_in_data (np.array): Indices of users in test set with training data for ratings matrix.
    
    """
    unique_users = get_unique_users(data).iloc[:, 0]
    indices_ratings_tests_users_in_data = unique_users[unique_users.isin(test_users_in_data.iloc[:, 0])].index.to_numpy()                                                                                       
    return indices_ratings_tests_users_in_data

indices_ratings_tests_users_in_data = get_indices_in_ratings_for_test_users_in_data(data, test_users_in_data)

In [12]:
len(indices_ratings_tests_users_in_data)

8343

In [13]:
print(f"As expected, the length of the indices should be {len(indices_ratings_tests_users_in_data)}, matching the number of users in test set with training data.")

As expected, the length of the indices should be 8343, matching the number of users in test set with training data.


In [14]:
def make_ratings(data: pd.DataFrame, shape: tuple = None) -> csr_matrix:
    """Creates the ratings matrix of listening history with optional shape
    
    2nd version of the make_ratings which handles the previous TODO.
    Creates the ratings matrix from the listening history imported using the read_users_history() method.
    
    Args:
        data (pd.DataFrame):  Listening history for the users.
        shape (tuple): The overall (n_users, n_items) shape desired for the matrix. 
                       If None, define the shape with the (n_users, n_items) from data argument.
        
    Returns:
        ratings (csr_matrix): Ratings matrix with shape (n_users, n_items).
    
    """
    # You have probably seen this before
    users, user_pos = np.unique(data.iloc[:, 0].values, return_inverse=True)
    items, item_pos = np.unique(data.iloc[:, 1].values, return_inverse=True)
    values = data.iloc[:, 2].fillna(0).values
    
    if shape == None:
        shape = (len(users), len(items))

    R_ = coo_matrix((values, (user_pos, item_pos)), shape=shape).tocsr()
    return R_, items


R, items = make_ratings(data)
R, items

(<34713x18264 sparse matrix of type '<class 'numpy.int64'>'
 	with 268169 stored elements in Compressed Sparse Row format>,
 array(['1881509818', '2094869245', '7245456259', ..., 'B00J77V5LK',
        'B00JF69NI0', 'B00JG07IKU'], dtype=object))

In [15]:
# Percentage of listening history used for validation.
test_size = 0.2

def make_train_val_split(data: pd.DataFrame, test_size : float = 0.2, shape: tuple = None):
    """Split the data into train and validation and returns the ratings matrixes accordingly.
    
    Args:
        data (pd.DataFrame): Listening history for the users.
        test_size (float): Percentage of listening history used for validation.
        shape (tuple): The overall (n_users, n_items) shape desired for the matrix. 
                       If None, define the shape with the (n_users, n_items) from 'data' argument.
    
    Returns:
        ratings_train (csr_matrix): Ratings matrix for train.
        ratings_val (csr_matrix): Ratings matrix for validation.
    
    """
    train_data, val_data = train_test_split(data, test_size=test_size, random_state=8)
    R_train, items = make_ratings(train_data, shape=shape)
    R_val, items = make_ratings(val_data, shape=shape)
    return R_train, R_val

ratings_train, ratings_val = make_train_val_split(data, test_size=test_size, shape=R.shape)

In [16]:
# After the train/validation split, let's compare the number of ratings available in each matrix.
print(f"After the split we have {ratings_train.nnz:,} ratings in the train set and {ratings_val.nnz:,} ratings in the validation set.")

After the split we have 214,535 ratings in the train set and 53,634 ratings in the validation set.


## 1) Non-Personalized

In [17]:
def get_most_rated(ratings: csr_matrix, n: int) -> np.matrix:
    """Returns the n most rated items in a ratings matrix.
    
    Args:
        ratings (csr_matrix): A sparse ratings matrix
        n (int): The number of top-n items we should retrieve.
        
    Returns:
        most_rated (np.matrix): An array of the most rated items.
    
    """
    def is_rating(R_: csr_matrix) -> csr_matrix:
        """Returns a sparse matrix of booleans 
        
        Args:
            R_ (csr_matrix): A sparse ratings matrix
            
        Returns:
            is_rating (csr_matrix): A sparse matrix of booleans.
        """
        return R_ > 0
    
    def count_ratings(R_: csr_matrix):
        """Returns an array with the count of ratings
        
        The attribute ".A1" of a numpy matrix returns self as a flattened ndarray.
        
        Args:
            R_ (csr_matrix): A sparse matrix of booleans
        
        Returns:
            count_ratings (np.darray): Count of ratings by item
        """
        return R_.sum(axis=0).A1
    
    ratings_ = is_rating(ratings)
    ratings_ = count_ratings(ratings_)
    return np.negative(ratings_).argsort()[:n]


non_pers_most_rated = get_most_rated(ratings_train, 50)
non_pers_most_rated

array([ 7098,  7947,  4065,  4783,  9732,   248,  2454,  9247, 10054,
        7099,  8181, 13077,  5630, 15069, 13094, 10955,  1177,  8221,
        5853,  5818, 14988,  2873,  1370,  6220,  8355,  2133,  8872,
       12978, 12372, 11708,  8620,  2235,  7395,   500, 10052,  2276,
        9064,  6668,  3385,  6776,  9475,  6716,  8163,  8799,  9838,
        4037,  4421,  6281,  5516,  3081])

In [18]:
def convert_non_pers_recommendations_to_df(non_pers_recs: np.array, users_to_pred: pd.DataFrame) -> pd.DataFrame:
    """
    Converts the non-personalized most rated to an DataFrame with the users and the recommendations.
    We will basically repeat the non_pers_recs array for the number of users in need.
    
    Args:
        non_pers_recs (np.array): Array of indices for the best non-personalized items to recommend.
        users_to_pred (pd.DataFrame): DataFrame containing the users which need recommendations.
        
    Returns:
        non_pers_most_rated_matrix (np.array): Two dimensional array of (n_users, top_n_items)
    
    """
    non_pers_df = pd.DataFrame(np.zeros((len(users_to_pred), 1), dtype=non_pers_recs.dtype) + non_pers_recs)
    non_pers_df = pd.concat([users_to_pred, non_pers_df], axis=1)
    non_pers_df = non_pers_df.set_index("users to recommend products")
    
    return non_pers_df


non_pers_most_rated_df = convert_non_pers_recommendations_to_df(non_pers_most_rated, unique_users_training_data)
non_pers_most_rated_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
users to recommend products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00046902LP5YSDV0VVNF,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A0029274J35Q1MYNKUWO,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A004507634IQ8TNQ3YTUI,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A005011233SVRED9Q0VY0,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A00676013IBZZQ3UYWK47,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081


In [19]:
def create_dict_preds(preds_df: pd.DataFrame) -> dict:
    """Convert the predictions DataFrame (index:users -> columns: items) to a dictionary of key (user->list of items).
    
    Args: 
        preds_df (pd.DataFrame): DataFrame containing the users and the ordered predictions.
        
    Returns:
        preds_dict (dict): Dict of (user_id: list of items) used for evaluating the performance.
    
    """
    return {preds_df.index[i]: preds_df.values[i].tolist() for i in range(len(preds_df))}


non_pers_dict = create_dict_preds(non_pers_most_rated_df)

In [20]:
def get_y_true(R_val_, users_to_pred, n=100):
    """Get the ground truth (best recommendations) of the users in the validation set.
    
    Args:
        R_val_ (csr_matrix): Validation set ratings matrix.
        n (int): Number of top-n items.
        
    Returns:
        y_true_df (pd.DataFrame): DataFrame which returns the y_true items.
        
    """
    top_from_R_val = pd.DataFrame(np.negative(R_val_).toarray().argsort()[:, :n])
    y_true_df = pd.concat([users_to_pred, top_from_R_val], axis=1)
    y_true_df = y_true_df.set_index("users to recommend products")
    return y_true_df


y_true_df = get_y_true(ratings_val, unique_users_training_data, n=100)
y_true_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
users to recommend products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00046902LP5YSDV0VVNF,6759,12166,12167,12168,12169,12170,12171,12172,12173,12174,...,12267,12266,12265,12264,12263,12262,12252,12261,12259,12258
A0029274J35Q1MYNKUWO,8978,13602,12177,12176,12175,12174,12173,12178,0,12171,...,12253,12267,12266,12265,12264,12263,12262,12252,12261,12259
A004507634IQ8TNQ3YTUI,2195,0,12179,12178,12177,12176,12175,12180,12174,12172,...,12268,12267,12266,12265,12264,12263,12253,12262,12260,12259
A005011233SVRED9Q0VY0,11985,12167,12168,12169,12170,12171,12172,12173,12174,12175,...,12267,12266,12265,12264,12263,12237,12262,12260,12259,12258
A00676013IBZZQ3UYWK47,9988,12168,12169,12170,12171,12172,12173,12174,12175,12176,...,12268,12267,12266,12265,12264,12238,12263,12261,12260,12259


In [21]:
y_true_dict = create_dict_preds(y_true_df)

In [22]:
evaluate(y_true_dict, non_pers_dict)

2.9236236857971864e-06

## Predict Non Personalized

In [23]:
test_users_not_in_data.head()

Unnamed: 0,users to recommend products
0,A04904273OLXHXFW1AQK2
1,A100US3LDAJU51
2,A10885J2DS1NQ6
3,A108M7R9UBH5LT
4,A10W7NGRNY5GTR


In [24]:
# Join both dataframes with user_id's
all_test_users = pd.concat([test_users_in_data, test_users_not_in_data]).reset_index(drop=True)

In [25]:
non_pers_test_most_rated_df = convert_non_pers_recommendations_to_df(non_pers_most_rated, all_test_users)
print(non_pers_test_most_rated_df.shape)
non_pers_test_most_rated_df.head()

(9225, 50)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
users to recommend products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0029274J35Q1MYNKUWO,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A0103849GBVWICKXD4T6,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A01685981QK9IX1Q16YZY,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A02904661A62AP64S46MT,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081
A036147939NFPC389VLK,7098,7947,4065,4783,9732,248,2454,9247,10054,7099,...,9475,6716,8163,8799,9838,4037,4421,6281,5516,3081


In [26]:
def row_to_list(row, items):
    row_ = [items[i] for i in row]
    return row_

In [27]:
def product_to_list(df):
    df_ = copy.copy(df)
    df_.index.names = ["user_id"]
    df_["purchases"] = df_.apply(row_to_list, args=(items,), axis=1)
    return df_[["purchases"]]

In [28]:
df_test_hk5 = product_to_list(non_pers_test_most_rated_df)
df_test_hk5['purchases'] = df_test_hk5.apply(lambda x: json.dumps(x.purchases), axis=1)
df_test_hk5.head()

Unnamed: 0_level_0,purchases
user_id,Unnamed: 1_level_1
A0029274J35Q1MYNKUWO,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A0103849GBVWICKXD4T6,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A01685981QK9IX1Q16YZY,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A02904661A62AP64S46MT,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A036147939NFPC389VLK,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."


In [29]:
def save_predictions(predictions: pd.DataFrame, output_path: str):
    """Save predictions to csv.
    
    Saves the predictions into a csv file with the format we need.
    We keep the index since it contains the user ids.
    
    Args:
        predictions (pd.DataFrame): DataFrame with user_id as index and ordered recommendations in the columns.
        output_path (str): Filepath for the predictions file.
    
    """
    predictions.to_csv(output_path)
    print(f"Saved to csv in '{output_path}'.")
    
    
save_predictions(df_test_hk5, os.path.join("data", "test_non_personalized_recommendations.csv"))

Saved to csv in 'data/test_non_personalized_recommendations.csv'.


In [30]:
new_users_recommendations = df_test_hk5.loc[test_users_not_in_data["users to recommend products"]]
new_users_recommendations.head()

Unnamed: 0_level_0,purchases
user_id,Unnamed: 1_level_1
A04904273OLXHXFW1AQK2,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A100US3LDAJU51,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A10885J2DS1NQ6,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A108M7R9UBH5LT,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A10W7NGRNY5GTR,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."


### Personalized

In [31]:
def make_user_similarities(R_):
    """Creates the user similarities matrix.
    
    Args:
        R_ (csr_matrix): Ratings matrix.
        
    Returns:
        user_similarities (csr_matrix): Matrix with user similarities.
    
    """
    return cosine_similarity(R_, dense_output=False)


user_similarities = make_user_similarities(ratings_train)
user_similarities

<34713x34713 sparse matrix of type '<class 'numpy.float64'>'
	with 8767358 stored elements in Compressed Sparse Row format>

In [32]:
def make_user_predictions_collab_filt(S: csr_matrix, R_: csr_matrix):
    """Predict using collaborative filtering.
    
    Args:
        S (csr_matrix): Similarities matrix (tipically using the cosine_similarity).
        R_ (csr_matrix): Ratings matrix.
        
    Returns:
        preds (csr_matrix): Predictions matrix.
    
    """
    weighted_sum = np.dot(S, R_)
    
    # We use the absolute value to support negative similarities.
    # In this particular example there are none.
    sum_of_weights = np.abs(S).sum(axis=1)
    
    preds = weighted_sum / sum_of_weights
    
    # Exclude previously rated items.
    preds[R_.nonzero()] = 0
    
    return csr_matrix(preds)
 

collab_filt_user_preds = make_user_predictions_collab_filt(user_similarities, ratings_train)
collab_filt_user_preds

  return np.true_divide(self.todense(), other)


<34713x18264 sparse matrix of type '<class 'numpy.float64'>'
	with 53746436 stored elements in Compressed Sparse Row format>

In [33]:
def get_most_rated_from_user_preds(user_preds_: csr_matrix, n: int) -> np.matrix:
    """Returns the n most rated items from the user predictions.
    
    Args:
        ratings (csr_matrix): A sparse ratings matrix
        n (int): The number of top-n items we should retrieve.
        
    Returns:
        most_rated (np.matrix): An array of the most rated items.
    
    """
    pred_ = np.negative(user_preds_).toarray()
    return pred_.argsort()[:, :n]


collab_filt_most_rated = get_most_rated_from_user_preds(collab_filt_user_preds, 50)
print(collab_filt_most_rated.shape)
collab_filt_most_rated

(34713, 50)


array([[ 2454,  5853,  4065, ...,  6304,  7024,  9168],
       [ 5853,  2454,  4065, ...,  7098,  9247,   877],
       [11708,  9064,  1370, ..., 12107, 13968,  5876],
       ...,
       [    0, 12167, 12168, ..., 12133, 12132, 12131],
       [    0, 12167, 12168, ..., 12133, 12132, 12131],
       [    0, 12167, 12168, ..., 12133, 12132, 12131]])

In [34]:
def convert_pers_recommendations_to_df(pers_recs: np.array, users_to_pred: pd.DataFrame) -> pd.DataFrame:
    """Converts the personalized most rated to an DataFrame with the users and the recommendations.
    
    Args:
        pers_recs (np.array): Array of indices for the best personalized items to recommend.
        users_to_pred (pd.DataFrame): DataFrame containing the users which need recommendations.
        
    Returns:
        non_pers_most_rated_matrix (np.array): Two dimensional array of (n_users, top_n_items)
    
    """
    pers_df = pd.concat([users_to_pred, pd.DataFrame(pers_recs)], axis=1)
    pers_df = pers_df.set_index("users to recommend products")
    
    return pers_df


collab_filt_most_rated_df = convert_pers_recommendations_to_df(collab_filt_most_rated, unique_users_training_data)
collab_filt_most_rated_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
users to recommend products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00046902LP5YSDV0VVNF,2454,5853,4065,7164,8355,14988,468,4251,1146,2133,...,4632,7316,7554,6787,11763,6839,7098,6304,7024,9168
A0029274J35Q1MYNKUWO,5853,2454,4065,5630,14988,7164,8355,468,2133,3580,...,248,3765,5647,9297,5214,16153,4251,7098,9247,877
A004507634IQ8TNQ3YTUI,11708,9064,1370,7395,4037,8059,2235,2276,5467,1177,...,8335,9547,3571,2744,10372,14766,3707,12107,13968,5876
A005011233SVRED9Q0VY0,6550,517,11589,6253,8290,2066,6215,566,519,1725,...,8569,6226,13842,15120,1513,13182,6221,9202,8872,9497
A00676013IBZZQ3UYWK47,4065,5853,6970,16006,5630,8355,4783,2454,8179,13800,...,6511,11235,9970,14754,1505,131,9732,3708,6226,8799


In [35]:
collab_filt_dict = create_dict_preds(collab_filt_most_rated_df)

In [36]:
evaluate(y_true_dict, collab_filt_dict)

0.002653244829916344

In [37]:
# Filter the collaborative filtering most rated DataFrame  using the test_users_in_data mask.
collab_filt_most_rated_in_data_df = collab_filt_most_rated_df.loc[test_users_in_data.iloc[:, 0].to_list()]
print(collab_filt_most_rated_in_data_df.shape)
collab_filt_most_rated_in_data_df.head()

(8343, 50)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
users to recommend products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0029274J35Q1MYNKUWO,5853,2454,4065,5630,14988,7164,8355,468,2133,3580,...,248,3765,5647,9297,5214,16153,4251,7098,9247,877
A0103849GBVWICKXD4T6,4196,7947,7098,4996,6505,15218,4783,4065,13077,2031,...,9755,2584,12063,11978,6080,15069,10657,6059,5347,15912
A01685981QK9IX1Q16YZY,5853,10265,4783,4065,12199,8355,6668,8006,6193,4267,...,8283,12371,1380,12621,8099,2133,5283,2421,2249,9829
A02904661A62AP64S46MT,7098,7099,8221,248,15069,12978,10054,8872,4421,14207,...,12252,13007,8185,13646,11518,2521,12372,6270,2973,11359
A036147939NFPC389VLK,15334,15332,5609,14525,15704,16770,8620,4783,12372,8926,...,12978,13009,8812,6281,5516,6550,12466,6837,6716,6221


In [38]:
print(new_users_recommendations.shape)
new_users_recommendations

(882, 1)


Unnamed: 0_level_0,purchases
user_id,Unnamed: 1_level_1
A04904273OLXHXFW1AQK2,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A100US3LDAJU51,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A10885J2DS1NQ6,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A108M7R9UBH5LT,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
A10W7NGRNY5GTR,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
...,...
AZI11KBKDTTLG,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
AZJ63NOPIZHJA,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
AZS6SE0KV73FW,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
AZSLT1XWZ4ELG,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."


In [39]:
collab_filt_most_rated_test_df = collab_filt_most_rated_in_data_df
print(collab_filt_most_rated_test_df.shape)

(8343, 50)


In [40]:
collab_filt_most_rated_test_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
users to recommend products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0029274J35Q1MYNKUWO,5853,2454,4065,5630,14988,7164,8355,468,2133,3580,...,248,3765,5647,9297,5214,16153,4251,7098,9247,877
A0103849GBVWICKXD4T6,4196,7947,7098,4996,6505,15218,4783,4065,13077,2031,...,9755,2584,12063,11978,6080,15069,10657,6059,5347,15912
A01685981QK9IX1Q16YZY,5853,10265,4783,4065,12199,8355,6668,8006,6193,4267,...,8283,12371,1380,12621,8099,2133,5283,2421,2249,9829
A02904661A62AP64S46MT,7098,7099,8221,248,15069,12978,10054,8872,4421,14207,...,12252,13007,8185,13646,11518,2521,12372,6270,2973,11359
A036147939NFPC389VLK,15334,15332,5609,14525,15704,16770,8620,4783,12372,8926,...,12978,13009,8812,6281,5516,6550,12466,6837,6716,6221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZWXG6KBXXC2N,0,12167,12168,12169,12170,12171,12172,12173,12174,12175,...,12140,12139,12145,12138,12136,12135,12134,12133,12132,12131
AZXLZEOK0D72T,0,12167,12168,12169,12170,12171,12172,12173,12174,12175,...,12140,12139,12145,12138,12136,12135,12134,12133,12132,12131
AZY7YVOUIBM8Q,0,12167,12168,12169,12170,12171,12172,12173,12174,12175,...,12140,12139,12145,12138,12136,12135,12134,12133,12132,12131
AZYD9TCTGRX5H,0,12167,12168,12169,12170,12171,12172,12173,12174,12175,...,12140,12139,12145,12138,12136,12135,12134,12133,12132,12131


In [41]:
df_test_personalized_hk5 = product_to_list(collab_filt_most_rated_test_df)
df_test_personalized_hk5['purchases'] = df_test_personalized_hk5.apply(lambda x: json.dumps(x.purchases), axis=1)
df_test_personalized_hk5_ = pd.concat([df_test_personalized_hk5, new_users_recommendations])

In [42]:
df_test_personalized_hk5_

Unnamed: 0_level_0,purchases
user_id,Unnamed: 1_level_1
A0029274J35Q1MYNKUWO,"[""B00192M24A"", ""B000GCATVW"", ""B000S5HWQC"", ""B0..."
A0103849GBVWICKXD4T6,"[""B000U3X21Q"", ""B001T7H01U"", ""B001HAYPDA"", ""B0..."
A01685981QK9IX1Q16YZY,"[""B00192M24A"", ""B0039B9GWU"", ""B0010KL4I6"", ""B0..."
A02904661A62AP64S46MT,"[""B001HAYPDA"", ""B001HB77BQ"", ""B001ZTP1PY"", ""B0..."
A036147939NFPC389VLK,"[""B007BNHFLQ"", ""B007BKL6G4"", ""B001727XAK"", ""B0..."
...,...
AZI11KBKDTTLG,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
AZJ63NOPIZHJA,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
AZS6SE0KV73FW,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."
AZSLT1XWZ4ELG,"[""B001HAYPDA"", ""B001T7H01U"", ""B000S5HWQC"", ""B0..."


In [43]:
save_predictions(df_test_personalized_hk5_, os.path.join("data", "collab_filt_recommendations.csv")) #0.363

Saved to csv in 'data/collab_filt_recommendations.csv'.


In [49]:
np.unique(df_test_personalized_hk5_.index == df_test_hk5.index)

array([ True])

In [47]:
df_test_personalized_hk5_.shape

(9225, 1)