# Hackathon 5

In [7]:
!pip install ml-metrics==0.1.4

Collecting ml-metrics==0.1.4
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4


In [8]:
# Import the necessary dependencies

# Operating System
import os

# Numpy, Pandas and Scipy
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix, save_npz, load_npz

# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Model Evaluation
from evaluation import evaluate

## Understanding the data

Our **training data** is:

* `train_reviews.csv`: reviews that users wrote about sports products. This dataset has the reviews both in text format and as a numerical rating (in the `overall` column). The reviews can be ordered in time using the `review_date` column.

* `train_products_metadata.csv`: metadata about the products that were reviewed. In particular, you’ll find data about the product brand, categories and price.

For **test data** only, we have the following list of users to generate a recommendation for:

* `test_users.csv`: the list of users for which you’ll have to provide recommendations.

In [13]:
def read_users_history(features=None) -> pd.DataFrame:
    """Load the training reviews, ordered from oldest to newest.

    Reads ``data/train_reviews.csv`` and sorts the rows by ``review_date``
    in ascending order. The original row labels are kept (the index is not
    reset after sorting).

    Args:
        features (list-like of str, optional): Column labels to keep, e.g.
            ``['product_id', 'user_id', 'overall']``. A list, NumPy array or
            pandas Index of labels is accepted. When ``None`` (the default)
            every column of the file is returned.

    Returns:
        data (pd.DataFrame): The reviews sorted by ``review_date``,
            restricted to ``features`` when they are given.
    """
    path = os.path.join('data', 'train_reviews.csv')
    # Chain the sort instead of using ``inplace=True``; the result is the same
    # but no intermediate frame is mutated.
    data = pd.read_csv(path, sep=',').sort_values(by='review_date', ascending=True)
    # Identity check on purpose: ``features != None`` is evaluated element-wise
    # for arrays / Index objects, which makes the ``if`` raise
    # "truth value ... is ambiguous".
    if features is None:
        return data
    return data[features]
# Load the reviews, keeping only the product, user and rating (`overall`) columns.
data = read_users_history(features=['product_id', 'user_id', 'overall'])
data.head()

   Unnamed: 0  product_id                                        review_text  \
0           0  1881509818  This came in on time and I am veru happy with ...   
1           1  1881509818  I had a factory Glock tool that I was using fo...   

  review_date         user_id                            summary  overall  
0  2014-01-26    AIXZKN4ACSKI                     Woks very good        5  
1  2012-02-02  A1L5P841VIO02V  Works as well as the factory tool        5  
#############################
      Unnamed: 0  product_id  \
195          229  B00004S9I0   
5362        5821  B0000D80FM   

                                            review_text review_date  \
195   I have a cheap gas stove and I always wanted o...  2002-03-07   
5362  The manufacturer says to use their ink in thei...  2002-09-03   

             user_id                                            summary  \
195   A1M2T0J45TTE64        Great stove top giddle/Or use for broiling!   
5362  A11I1I9QLMAM1A  Quality and reliab

Unnamed: 0,product_id,user_id,overall
195,B00004S9I0,A1M2T0J45TTE64,5
5362,B0000D80FM,A11I1I9QLMAM1A,4
146,B00000IURU,AGFW667QNHDOY,5
193,B00004S9I0,A1MR1VMK999I6O,5
1024,B00005JD40,A96JD9312DHWC,5


In [10]:
def read_test_users() -> pd.DataFrame:
    """Load the users for which recommendations have to be produced.

    Reads ``data/test_users.csv`` with the default ``pd.read_csv`` settings.

    Returns:
        users_to_pred (pd.DataFrame): Contents of the file, i.e. the list of
            users for which we will recommend products.
    """
    return pd.read_csv(os.path.join('data', 'test_users.csv'))
# Load the test users and preview the first rows.
users_to_pred = read_test_users()
users_to_pred.head()

Unnamed: 0,user_id
0,A0029274J35Q1MYNKUWO
1,A0103849GBVWICKXD4T6
2,A01685981QK9IX1Q16YZY
3,A02904661A62AP64S46MT
4,A036147939NFPC389VLK


In [15]:
def get_indices_from_users_to_pred(users_to_pred: pd.DataFrame, data: pd.DataFrame):
    """Split the users to predict by whether they appear in the training data.

    Args:
        users_to_pred (pd.DataFrame): Users we are going to recommend items to.
            Membership is tested on its ``user_id`` column. A frame without
            that column is still accepted: a row then counts as known when
            any of its values is a known user id.
        data (pd.DataFrame): Training reviews; must have a ``user_id`` column.

    Returns:
        index_users_in_data (pd.Index): Row labels of ``users_to_pred`` whose
            user has training data.
        index_users_not_in_data (pd.Index): Row labels of ``users_to_pred``
            whose user has no training data.
    """
    known_users = data['user_id'].values
    # Build a 1-D, one-flag-per-row mask. Calling ``isin`` on the whole frame
    # yields one flag per *cell*; with any extra column (e.g. a CSV index
    # column) the negated mask is True in every row and the "not in data"
    # index comes out wrong.
    if 'user_id' in users_to_pred.columns:
        has_history = users_to_pred['user_id'].isin(known_users).values
    else:
        has_history = users_to_pred.isin(known_users).any(axis=1).values
    index_users_in_data = users_to_pred[has_history].index
    index_users_not_in_data = users_to_pred[~has_history].index
    return index_users_in_data, index_users_not_in_data
# Partition the row labels of the test users by whether the user appears in the training reviews.
index_users_in_data, index_users_not_in_data = get_indices_from_users_to_pred(
    users_to_pred=users_to_pred,
    data=data,
)

In [16]:
# Report how many test users do / do not appear in the training reviews.
# For further inspection, we advise you to look at the objects themselves.
print(f"The index for users which we have training data has length of {len(index_users_in_data)}.")
print(f"The index for users which we don't have training data has length of {len(index_users_not_in_data)}.")

The index for users which we have training data has length of 8343.
The index for users which we don't have training data has length of 882.
