In [1]:
# Import the necessary dependencies

# Operating System
import os

# Numpy, Pandas and Scipy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix, save_npz, load_npz

# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
def read_users_history(features=None) -> pd.DataFrame:
    """Load the training reviews, ordered by review date.

    Reads ``data/train_reviews.csv`` (comma separated) and sorts the rows by
    the ``review_date`` column in ascending order.

    Args:
        features: Optional list-like of column names to keep. When ``None``
            (the default) every column of the file is returned.

    Returns:
        data (pd.DataFrame): The sorted reviews, restricted to ``features``
                             when it is given.

    """
    path = os.path.join('data', 'train_reviews.csv')
    data = pd.read_csv(path, sep=',').sort_values(by=["review_date"])
    # Identity check: `features != None` is evaluated element-wise for numpy
    # arrays / pandas Index objects and makes the `if` raise a ValueError.
    if features is not None:
        return data[features]
    return data

# Keep only the user, product and rating ("overall") columns, then preview the first rows.
data = read_users_history(['user_id', 'product_id', 'overall'])
data.head()

Unnamed: 0,user_id,product_id,overall
195,A1M2T0J45TTE64,B00004S9I0,5
5362,A11I1I9QLMAM1A,B0000D80FM,4
146,AGFW667QNHDOY,B00000IURU,5
193,A1MR1VMK999I6O,B00004S9I0,5
1024,A96JD9312DHWC,B00005JD40,5


In [3]:
# (rows, columns) of the selected training data.
data.shape

(268169, 3)

In [4]:
def read_test_users() -> pd.DataFrame:
    """Load the users we have to produce recommendations for.

    Reads ``data/test_users.csv`` with the default ``pd.read_csv`` settings.

    Returns:
        users_to_pred (pd.DataFrame): Content of the test users file.
    """
    test_users_file = os.path.join('data', 'test_users.csv')
    return pd.read_csv(test_users_file)


# Load the test users and preview the first rows.
users_to_pred = read_test_users()
users_to_pred.head()

Unnamed: 0,user_id
0,A0029274J35Q1MYNKUWO
1,A0103849GBVWICKXD4T6
2,A01685981QK9IX1Q16YZY
3,A02904661A62AP64S46MT
4,A036147939NFPC389VLK


In [5]:
# Report how many test users we have to produce recommendations for.
print(f"We have {len(users_to_pred)} users in need for better products!")

We have 9225 users in need for better products!


In [6]:
def get_indices_from_users_to_pred(users_to_pred: pd.DataFrame, data: pd.DataFrame):
    """Split the index of users_to_pred by whether each user appears in the training data.

    Args:
        users_to_pred (pd.DataFrame): Users we are going to recommend items to. The user ids are
                                      read from the ``user_id`` column when it exists, otherwise
                                      from the first column.
        data (pd.DataFrame): Training data; must contain a ``user_id`` column.

    Returns:
        index_users_in_data (pd.Index): Index labels of users_to_pred whose user id is in ``data``.
        index_users_not_in_data (pd.Index): Index labels of users_to_pred whose user id is not in ``data``.

    """
    if "user_id" in users_to_pred.columns:
        candidate_ids = users_to_pred["user_id"]
    else:
        candidate_ids = users_to_pred.iloc[:, 0]

    # One-dimensional row mask, computed once. Testing the whole frame with
    # DataFrame.isin gives a 2-D mask that only filters rows correctly when
    # users_to_pred has exactly one column.
    has_history = candidate_ids.isin(data["user_id"].values).values

    index_users_in_data = users_to_pred[has_history].index
    index_users_not_in_data = users_to_pred[~has_history].index

    return index_users_in_data, index_users_not_in_data

# Split the index of the test users by whether the user appears in the training data.
index_users_in_data, index_users_not_in_data = get_indices_from_users_to_pred(users_to_pred, data)

In [7]:
# Only the sizes of the two index objects are printed here; inspect the objects themselves for more detail.
print(f"The index for users which we have training data has length of {len(index_users_in_data)}.")
print(f"The index for users which we don't have training data has length of {len(index_users_not_in_data)}.")

The index for users which we have training data has length of 8343.
The index for users which we don't have training data has length of 882.


In [8]:
def get_users_to_pred_by_index(users_to_pred, index_users_in_data):
    """Select rows of users_to_pred by index label and renumber them from 0.

    Args:
        users_to_pred (pd.DataFrame): DataFrame containing the list of users we are going to recommend items.
        index_users_in_data (pd.Index): Index labels of the rows of users_to_pred to keep, e.g. one of
                                        the indexes returned by get_indices_from_users_to_pred.
    Returns:
        users_in_data (pd.DataFrame): The selected rows of users_to_pred with a fresh 0..n-1 index.

    """
    # The incoming index holds labels, so select with .loc. Positional .iloc
    # only gives the same rows when users_to_pred has a default RangeIndex.
    return users_to_pred.loc[index_users_in_data].reset_index(drop=True)

# Test users that also appear in the training data (index renumbered from 0), then preview them.
test_users_in_data = get_users_to_pred_by_index(users_to_pred, index_users_in_data)
test_users_in_data.head()

Unnamed: 0,user_id
0,A0029274J35Q1MYNKUWO
1,A0103849GBVWICKXD4T6
2,A01685981QK9IX1Q16YZY
3,A02904661A62AP64S46MT
4,A036147939NFPC389VLK


In [9]:
# Test users that do not appear in the training data; print the shape and preview the first rows.
test_users_not_in_data = get_users_to_pred_by_index(users_to_pred, index_users_not_in_data)
print(test_users_not_in_data.shape)
test_users_not_in_data.head()

(882, 1)


Unnamed: 0,user_id
0,A04904273OLXHXFW1AQK2
1,A100US3LDAJU51
2,A10885J2DS1NQ6
3,A108M7R9UBH5LT
4,A10W7NGRNY5GTR


### Create the Ratings matrix

In [10]:
def make_ratings(data: pd.DataFrame) -> csr_matrix:
    """Build the sparse user-by-item ratings matrix.

    The first three columns of ``data`` are read by position as
    (user, item, rating). Rows and columns of the result follow the sorted
    order of the unique user and item labels returned by ``np.unique``.
    Missing ratings are replaced by 0 before the matrix is built.

    Args:
        data (pd.DataFrame): One row per (user, item, rating) interaction.

    Returns:
        ratings (csr_matrix): ratings matrix with shape (n_users, n_items)

    TODO:
        * Add the shape as an optional parameter.

    """
    user_labels = data.iloc[:, 0].values
    item_labels = data.iloc[:, 1].values
    rating_values = data.iloc[:, 2].fillna(0).values

    # The inverse arrays map every interaction to the position of its label
    # in the sorted unique labels, i.e. to its row / column in the matrix.
    unique_users, row_idx = np.unique(user_labels, return_inverse=True)
    unique_items, col_idx = np.unique(item_labels, return_inverse=True)

    n_users = len(unique_users)
    n_items = len(unique_items)

    ratings_coo = coo_matrix((rating_values, (row_idx, col_idx)), shape=(n_users, n_items))
    return ratings_coo.tocsr()


# Build the sparse user-item ratings matrix from the training data; the bare `R` displays its repr.
R = make_ratings(data)
R

<34713x18264 sparse matrix of type '<class 'numpy.int64'>'
	with 268169 stored elements in Compressed Sparse Row format>

In [11]:
# Sanity check: shape, dtype, stored-element count and type should match the repr shown by the previous cell.
print(f"The shape is {R.shape}")
print(f"The dtypes of the elements are {R.dtype}")
print(f"The number of stored elements is {R.nnz}")
print(f"The type of the matrix is {type(R)}")

The shape is (34713, 18264)
The dtypes of the elements are int64
The number of stored elements is 268169
The type of the matrix is <class 'scipy.sparse.csr.csr_matrix'>


In [12]:
# Let's store a one-column DataFrame with the unique user ids that we have in the original data.
def get_unique_users(data):
    """Collect the distinct users found in the first column of ``data``.

    Args:
        data (pd.DataFrame): Training data whose first column holds the user ids.

    Returns:
        unique_users (pd.DataFrame): Single-column DataFrame (column name
                                     "users to recommend products") with the
                                     sorted unique user ids.

    """
    user_column = data.iloc[:, 0].values
    sorted_unique_ids = np.unique(user_column)
    return pd.DataFrame(data=sorted_unique_ids, columns=["users to recommend products"])


# Unique user ids present in the training data, in the sorted order produced by np.unique; preview the first rows.
unique_users_training_data = get_unique_users(data)
unique_users_training_data.head()

Unnamed: 0,users to recommend products
0,A00046902LP5YSDV0VVNF
1,A0029274J35Q1MYNKUWO
2,A004507634IQ8TNQ3YTUI
3,A005011233SVRED9Q0VY0
4,A00676013IBZZQ3UYWK47


In [13]:
def get_indices_in_ratings_for_test_users_in_data(data: pd.DataFrame, test_users_in_data: pd.DataFrame):
    """Locate the test users inside the sorted list of unique training users.

    The positions refer to the output of get_unique_users(data), which orders
    users with np.unique exactly like make_ratings does for the rows of the
    ratings matrix.

    Args:
        data (pd.DataFrame): Training data; the first column holds the user ids.

        test_users_in_data (pd.DataFrame): Test users for which we have training data;
                                           the first column holds the user ids.

    Returns:
        indices_ratings_tests_users_in_data (np.array): Positions of those test users
                                                        among the unique training users.

    """
    training_users = get_unique_users(data).iloc[:, 0]
    wanted_users = test_users_in_data.iloc[:, 0]
    is_test_user = training_users.isin(wanted_users)
    return training_users[is_test_user].index.to_numpy()

# Positions, within the sorted unique training users, of the test users that have training data;
# the bare len(...) displays how many were found.
indices_ratings_tests_users_in_data = get_indices_in_ratings_for_test_users_in_data(data, test_users_in_data)
len(indices_ratings_tests_users_in_data)

8343