In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyspark
import os
import random



from surprise import BaselineOnly
from surprise import SVD, NMF
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import GridSearchCV, RandomizedSearchCV

# Data Preparation

In [2]:
sushi = pd.read_csv('sushi3.idata', sep='\t', names=['item_id', 'name', 'style', 'major_group', 'minor_group', 
                                                'oiliness', 'consumption_frequency', 'normalized_price',
                                                'selling_frequency'])
users = pd.read_csv('sushi3.udata', sep='\t', names=['user_id', 'gender', 'age', 'survey_time', 'prefecture_15', 
                                                    'region_15','east/west_15', 'prefecture_now', 'region_now',
                                                    'east/west_now', 'living_similarity'])
set_b_sushi = sushi.name.unique()
set_b_score = pd.read_csv('sushi3b.5000.10.score', sep=' ', names=set_b_sushi)

In [294]:
df_sparse.shape

(5000, 100)

In [138]:
#Create dataset
df_sparse = pd.concat([users['user_id'], set_b_score], axis=1)
df_sparse = df_sparse.replace(-1, np.nan).set_index('user_id')
# Redefine the rating scale to be from 1-5
for col in df_sparse.columns:
    df_sparse[col] = df_sparse[col].map({0.0:1.0, 1.0:2.0, 2.0:3.0, 3.0:4.0, 4.0:5.0})
print(df_sparse.head())

         ebi  anago  maguro  ika  uni  tako  ikura  tamago  toro  amaebi  \
user_id                                                                    
6371     NaN    1.0     NaN  5.0  3.0   NaN    NaN     NaN   NaN     NaN   
10007    NaN    NaN     NaN  NaN  NaN   NaN    1.0     NaN   2.0     NaN   
1777     NaN    4.0     5.0  NaN  NaN   NaN    4.0     NaN   NaN     NaN   
3613     5.0    NaN     NaN  4.0  5.0   2.0    NaN     NaN   5.0     4.0   
8081     NaN    NaN     NaN  NaN  2.0   NaN    NaN     NaN   NaN     5.0   

         ...   hoya  battera  kyabia  karasumi  uni_kurage  karei  hiramasa  \
user_id  ...                                                                  
6371     ...    NaN      NaN     NaN       NaN         NaN    NaN       NaN   
10007    ...    NaN      NaN     NaN       NaN         NaN    NaN       NaN   
1777     ...    NaN      NaN     NaN       NaN         NaN    NaN       NaN   
3613     ...    NaN      NaN     NaN       NaN         NaN    NaN       

In [139]:
#Calculate sparsity
sparsity = df_sparse.isnull().sum().sum()*100/(df_sparse.shape[0] * df_sparse.shape[1])
print("The dataframe is {}% empty".format(sparsity))

The dataframe is 90.0% empty


In [179]:
# Convert the sparse dataframe from wide to long, dropping NaNs in the process
df_sparse_original = df_sparse.reset_index()
temp_df = pd.melt(df_sparse_original, id_vars=['user_id'], var_name='sushi', value_name='rating')
temp_df.columns = ['uid', 'sushi', 'rating'] 
temp_df = temp_df.dropna()
temp_df.rating = temp_df.rating.astype(int)
print(temp_df.head())

     uid sushi  rating
3   3613   ebi       5
5   1462   ebi       5
8   6861   ebi       5
10  9077   ebi       1
15  5281   ebi       4


# Model Development

In [275]:
def best_model_hyperparamters(method, param_grid, data):
    '''
    method: Chosen approach to the problem e.g. NMF or SVD
    param_grid: The gridsearchcv parameter grid associated with the chosen method
    data: The dataset used in this project; must have been a Surprise Dataset object 
    
    Returns: Best parameters for the method, associated best RMSE score
    '''
    random.seed(42)
    raw_ratings = data.raw_ratings
    random.shuffle(raw_ratings)
    # training data = 70% of the data, test data = 30% of the data
    threshold = int(.7 * len(raw_ratings))
    training_data = raw_ratings[:threshold]
    test_data = raw_ratings[threshold:]
    data.raw_ratings = training_data  # data is now the training_data
    
    algo_cv = GridSearchCV(method, param_grid, cv=3)
    algo_cv.fit(data)
    best_parameters = algo_cv.best_params['rmse']
    best_score = algo_cv.best_score['rmse']
    
    return best_parameters, best_score

## Non-negative Matrix Factorization 

In [284]:
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(temp_df[['uid', 'sushi', 'rating']], reader)

param_grid_NMF = {'n_factors': [5, 10, 15, 20, 25], 'n_epochs':[40, 50, 60],
                 'reg_pu':list(np.arange(0.07,0.09,0.01)), 'reg_pu':list(np.arange(0.07,0.09,0.01)),
                 'reg_bu':list(np.arange(0.03, 0.05,0.01)), 'reg_bi':list(np.arange(0.03,0.05,0.01))}
best_model_hyperparamters(NMF, param_grid_NMF, data)

({'n_factors': 25,
  'n_epochs': 40,
  'reg_pu': 0.08,
  'reg_bu': 0.05,
  'reg_bi': 0.04},
 1.27662401932413)

In [285]:
algo_nmf = NMF(n_factors=25, n_epochs=40, reg_pu=0.08, reg_bu=0.05, reg_bi=0.04)

## Singular value decomposition

In [277]:
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(temp_df[['uid', 'sushi', 'rating']], reader)

param_grid_svd = {'n_factors': [90, 100, 110], 'n_epochs':[20,30,40]}

best_model_hyperparamters(SVD, param_grid_svd, data)

({'n_factors': 100, 'n_epochs': 20}, 1.177744070559631)

In [280]:
algo_svd = SVD(n_factors=100, n_epochs=20)

# Model Evaluation

In [248]:
def model_rmse(algo_cv, data):
    '''
    Returns the RMSE score of a cross-validated model on the test set
    
    algo_cv: A cross-validated model with parameters entered (from GridSearchCV or RandomizedSearchCV) 
             e.g. NMF(n_factors=50)
    data: The dataset used in this project; must have been a Surprise Dataset object 
    '''
    import random
    # Randomly shuffle ratings
    random.seed(42)
    raw_ratings = data.raw_ratings
    random.shuffle(raw_ratings)
    # training data = 70% of the data, test data = 30% of the data
    threshold = int(.7 * len(raw_ratings))
    training_data = raw_ratings[:threshold]
    test_data = raw_ratings[threshold:]
    data.raw_ratings = training_data  # data is now the training_data
    # Training set
    trainset = data.build_full_trainset()
    algo_cv.fit(trainset)

    # Compute RMSE on test set
    testset = data.construct_testset(test_data)  # construct testset
    predictions = algo_cv.test(testset)
    model_rmse = accuracy.rmse(predictions)
    return(model_rmse)

In [286]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(temp_df[['uid', 'sushi', 'rating']], reader)

svd_rmse = model_rmse(algo_svd, data)
nmf_rmse = model_rmse(algo_nmf, data)

RMSE: 1.1595
RMSE: 1.2899


In [287]:
print("The RMSE of the SVD model is : {}".format(svd_rmse))
print("The RMSE of the NMF model is : {}".format(nmf_rmse))

The RMSE of the SVD model is : 1.159504599012967
The RMSE of the NMF model is : 1.2899191155402001


# Making predictions for individuals

The goal here is to create functions which can return a list of the top 10 recommended sushi for a given individual.

### For a given user - Non-novel

In [131]:
def get_top_n_user(algo, uid, n=10):
    '''
    Returns the top n recommendations for a user, INCLUDING sushi they are familiar with.
    
    n(int): The number of recommendations to show for each user. Default is 10. 
    
    Returns:
    A dataframe with sushi names as well as values for sushi ratings, either predicted by the model or previously known
    from user data. The top 10 entries (as defined by highest ratings) are shown.
    '''
    
    # Creating lists of predicted sushi ratings (based on the model) and sushi names
    sushi_ratings = [algo.predict(uid=6185, iid=item)[3] for item in df_sparse.columns]
    sushi_list = [item for item in df_sparse.columns]
    _ = pd.DataFrame({'Name': sushi_list, 'Ratings':sushi_ratings})
    _ = _.sort_values(by=['Ratings'], ascending=False).head(n=10)
    
    return(_.reset_index(drop=True))

### For a given user - Novel recommendations only

In [216]:
def get_top_n_user_novel(algo, uid, n=10):
    '''
    Returns the top n novel recommendations for a user i.e. NOT INCLUDING sushi they are familiar with.
    
    n(int): The number of recommendations to show for each user. Default is 10. 
    
    Returns:
    A dataframe with sushi names as well as values for sushi ratings, either predicted by the model or previously known
    from user data. The top 10 entries (as defined by highest ratings) are shown.
    '''
    
    # First, find sushi the user has already rated
    user_rated_sushi = [item for item in temp_df[temp_df.uid==uid]['sushi']]
    # List of sushi that the user has NOT rated
    unrated_sushi = set(user_rated_sushi) ^ set(df_sparse.columns) # Returns the non-overlapping elements between two sets
    
    # Creating lists of predicted sushi ratings (based on the model) and sushi names
    sushi_ratings = [algo.predict(uid=uid, iid=item)[3] for item in unrated_sushi] #[3] is the estimated rating 
    sushi_list = [item for item in unrated_sushi]
    _ = pd.DataFrame({'Name': sushi_list, 'Ratings':sushi_ratings})
    _ = _.sort_values(by=['Ratings'], ascending=False).head(n=10)
    
    return(_.reset_index(drop=True))

In [283]:
get_top_n_user(algo_svd, 6371)

Unnamed: 0,Name,Ratings
0,shiso_maki,3.293347
1,toro,3.170982
2,botanebi,3.157127
3,chu_toro,3.153187
4,tarabagani,3.108594
5,kanpachi,3.079632
6,ebi,3.048903
7,uni,3.001063
8,hotategai,2.998628
9,negi_toro_maki,2.980982


In [262]:
get_top_n_user_novel(algo_svd, 3613)

Unnamed: 0,Name,Ratings
0,maguro,4.499095
1,tarabagani,4.493927
2,kani,4.464559
3,hirame,4.428349
4,chu_toro,4.370267
5,negi_toro_maki,4.309677
6,ikura,4.307424
7,samon,4.238509
8,kurumaebi,4.220072
9,toro_samon,4.177591


In [None]:
algo = NMF()
# retrain on the whole set training data
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute RMSE on test set
testset = data.construct_testset(test_data)  # construct testset
predictions = algo.test(testset)
print(accuracy.rmse(predictions))