In [1]:
#Importing the data
import pandas as pd
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

#Importing the data
data = pd.read_csv('/home/bbruno/all_here/python course/vinnie/data/cleaned_data/upsampled_df_smote.csv')
data.head()

Unnamed: 0,userId,wine,rate
0,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso,3
1,df904a46-c461-4233-9d44-6ac11a8bbddc,Astrale,3
2,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Bricco Riva Bianca,3
3,bc79b0e3-064d-4240-86de-e86499f577e8,3/11 Syrah and blend #4,3
4,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso Classico Superiore Pojega,4


In [2]:
class Knn (KNNBasic):
    def __init__(self, sim_options={}, bsl_options={}):
        KNNBasic.__init__(self, sim_options=sim_options, bsl_options=bsl_options)
    
    def create_reader(self, data):
        reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    def fit (self):
        self.trainset, testset = train_test_split(self.data, test_size=0.2)
        # predictions = KNNBasic.fit(self, self.trainset).test(testset)
        KNNBasic.fit(self, self.trainset)
        train_predictions = KNNBasic.test(self, self.trainset.build_testset())
        
        # test_predictions = KNNBasic.test(self, testset)
        predictions = KNNBasic.test(self, testset)
        
        # calculate RMSE
        train_rmse = accuracy.rmse(train_predictions)
        # test_rmse = accuracy.rmse(test_predictions)
        test_rmse = accuracy.rmse(predictions)
        print(f"RMSE on Training Set: {train_rmse}")
        print(f"RMSE on Test Set: {test_rmse}")

        self.sim = self.compute_similarities()
        self.bu, self.bi = self.compute_baselines()
        return predictions
    
    def get_user_name(self, uid):
        """Return the name of a user from their id.
        Args:
            uid(int): The raw id of the user.
        Returns:
            The name of the user.
        """
        return self.trainset.to_raw_uid(uid)
    
    def get_item_name(self, iid):
        """Return the name of an item from their id.
        Args:
            iid(int): The raw id of the item.
        Returns:
            The name of the item.
        """
        return self.trainset.to_raw_iid(iid)
    
    def get_neighbors_uid(self, user_id, k=10):
        neighbor_ids = self.get_neighbors(user_id, k=10)
        neighbor_names = [self.get_user_name(uid) for uid in neighbor_ids]
        return neighbor_names
    def get_neighbors_iid(self, item_id, k=10):
        neighbor_ids = self.get_neighbors(item_id, k=10)
        neighbor_names = [self.get_item_name(iid) for iid in neighbor_ids]
        return neighbor_names
        
    def get_top_n_for_user(self, predictions,user_id, n=10):
        """Return the top-N recommendation for a user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        """

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                top_n[uid].append((iid, est))

        # Then sort the predictions for the user and retrieve the k highest ones.
        user_ratings = top_n[user_id]
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[user_id] = user_ratings[:n]

        return top_n[user_id]
    
    def get_top_n_users_for_item(self, predictions, item_id, n=10):
        """Return the top-N users for a specific item from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            item_id: The id of the item for which to get the top-N users.
            n(int): The number of users to output for the item. Default is 10.

        Returns:
        A list of tuples:
            [(raw user id, rating estimation), ...] of size n.
        """

        # First map the predictions to each item.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            if iid == item_id:
                top_n[iid].append((uid, est))

        # Then sort the predictions for the item and retrieve the k highest ones.
        item_ratings = top_n[item_id]
        item_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[item_id] = item_ratings[:n]

        return top_n[item_id]

    def estimated(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")
        
        #Compute similarities between u and v, where v describes all other
        #users that have also rated item i.
        neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
        # Sort these neighbors by similarity
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)

        print("The 5 nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:5]:
            print(f"user {v} with sim {sim_uv:1.15f}")

        # ... Aaaaand return the baseline estimate anyway ;)
        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        return print(f"And the baseline estimate is: {bsl}")
    
    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the id of the user
        returns:
          the number of items rated by the user
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item.
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    def inspect_predictions(self, predictions):
        print(f"uid means the user id and iid means the wine id\n")
        print(f"rui means the actual rating and est means the estimated rating\n")
        print(f"err means the error between the actual and the estimated rating\n")
        print(f"Iu means the number of items rated by given user\n")
        print(f"Ui means the number of users that have rated given item\n")
        # Create a dataframe with the predictions
        df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df_pred['Iu'] = df_pred.uid.apply(self.get_Iu)
        df_pred['Ui'] = df_pred.iid.apply(self.get_Ui)
        df_pred['err'] = abs(df_pred.est - df_pred.rui)
        return df_pred
    
    def get_accuracy(self, predictions, k=10, threshold=3.5):
        #Compute RMSE
        rmse = accuracy.rmse(predictions, verbose=True)
        #Compute MAE
        mae = accuracy.mae(predictions, verbose=True)
        # Compute MSE
        mse = accuracy.mse(predictions, verbose=True)

        #Compute precision and recall
        precisions, recalls = self.precision_recall_at_k(predictions, k=k, threshold=threshold)

        #Precision and recall can then be averaged over all users
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        #Count correct predictions
        correct = 0
        for uid, iid, true_r, est, _ in predictions:
            if round(est) == round(true_r):
                correct += 1

        #Compute accuracy
        accuracy_percentage = correct / len(predictions)
        accuracy_percentage = accuracy_percentage * 100
        print(f"the acc is {accuracy_percentage:.2f}")

        #Return a dictionary with the metrics
        return {'RMSE': rmse, 'MAE': mae, 'MSE': mse, 'Precision': precision, 'Recall': recall, 'Accuracy': accuracy_percentage}
        
    @staticmethod 
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user"""
        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():
            # Sort user ratings by estimated value
            user_ratings.sort(key=lambda x: x[0], reverse=True)

            # Number of relevant items
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in user_ratings[:k]
            )

            # Precision@K: Proportion of recommended items that are relevant
            # When n_rec_k is 0, Precision is undefined. We here set it to 0.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: Proportion of relevant items that are recommended
            # When n_rel is 0, Recall is undefined. We here set it to 0.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
        return precisions, recalls

In [3]:
knn = Knn(
    sim_options = {'name': 'pearson_baseline','user_based': False}, 
    bsl_options={'method': 'sgd', 'learning_rate': 0.05, 'n_epochs':60, 'reg_u': 12 , 'reg_i': 5}
    )

In [4]:
knn.create_reader(data)

In [5]:
predictions = knn.fit()

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1755
RMSE: 1.3297
RMSE on Training Set: 0.17551635732016757
RMSE on Test Set: 1.329730958310954
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [6]:
#recommends items for a given user from predictions

# user_id = '7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7'
# top_n = knn.get_top_n_for_user(predictions, user_id)
# print(user_id, [iid for (iid, _) in top_n])
######### better print
user_id = '7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7'
top_n = knn.get_top_n_for_user(predictions, user_id)
print(f"Top items for the user '{user_id}' are:")
for i, (iid, _) in enumerate(top_n, 1):
    print(f"{i}. Item ID: {iid}")

Top items for the user '7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7' are:
1. Item ID: Portillo Malbec
2. Item ID: Quintessence Montagny 1er Cru
3. Item ID: Ammonite
4. Item ID: Pinot gris
5. Item ID: Beyerskloof Western Cape
6. Item ID: Lodi California
7. Item ID: Mercurey 'Clos les Bussières' 


In [7]:
# recommneds users for a given item from predictions

# item_id = 'Chardonnay'
# top_n = knn.get_top_n_users_for_item(predictions, item_id)
# print(item_id, [uid for (uid, _) in top_n])
########## better print
item_id = 'Chardonnay'
top_n = knn.get_top_n_users_for_item(predictions, item_id)
print(f"Top users for the item '{item_id}' are:")
for i, (uid, _) in enumerate(top_n, 1):
    print(f"{i}. User ID: {uid}")

Top users for the item 'Chardonnay' are:
1. User ID: 918cd9b7-8057-4106-8f7f-9e9e3a423fb1
2. User ID: 36fec3ea-e183-491d-b65a-6c7ac90fafa5
3. User ID: 518f9753-12b1-4987-b651-38944fc33d07


In [8]:
#get the KNN for a user from the model
knn.get_neighbors_uid(user_id=5)

['a0ef6b50-093e-4ae1-8e3c-58a2a17d2bb8',
 'f8c400a6-ded1-40b8-95d2-a263d05c30ce',
 '2587fe2b-2481-4150-ae14-99ed61c218ac',
 '83128cf9-f07f-4de1-97cf-cbde8d38bc3c',
 'e336887b-883b-4d9a-8cd3-060c25856bfb',
 'fb9faae5-b479-4a1a-af12-6ed14e5f9d3b',
 '09f6ca6e-905f-4afd-bab2-d928fe046f18',
 '0c0fb748-ee39-4687-b1a4-65044a48a02f',
 '2a62eecd-b0cd-4395-9f8c-7c912a208be2',
 'ecc462e1-5041-43ca-94c3-c2bfc9b3215e']

In [9]:
#get the KNN for an item from the model
knn.get_neighbors_iid(item_id=20)

['Big Easy Red',
 'Amarone',
 'Morellino di Scansano 2021',
 'Rubicon',
 'Casillero del diabolo merlot Chile ',
 'Vigna Piaggia Brunello di Montalcino',
 'Shiraz',
 'Neuburger 2019',
 '19 crimes red wine',
 'Daphne']

In [10]:
knn.estimated(16, 10)
print("If the baseline is {} then the value is a default value".format(knn.trainset.global_mean))

The 5 nearest neighbors of user 16 are:
user 16 with sim 1.000000000000000
user 16 with sim 1.000000000000000
user 11 with sim 0.000000000000000
user 20 with sim 0.000000000000000
user 51 with sim 0.000000000000000
And the baseline estimate is: 3.798436160561236
If the baseline is 2.91994750656168 then the value is a default value


In [11]:
df_pred = knn.inspect_predictions(predictions)
best_pred = df_pred.sort_values(by='err')[:10]
worst_pred = df_pred.sort_values(by='err')[-10:]
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Gramigna Imperial,3.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",100,0,0.080052
1,367d109c-cc13-43ec-b153-a9547a12bd4c,Nativi Gewürtztraminer,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,5,0.0
2,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Ribolla Cormons,4.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",8,0,1.080052
3,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Mazon Trattmannhof Blauburgunder - Pinot Nero,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",52,0,2.080052
4,91cf68cc-9436-43e1-871d-33beef4d2337,Domaine des Cassiers Pouilly-Fumé,3.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",13,0,0.080052
5,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Etna Bianco,3.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",100,0,0.080052
6,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Ruche',3.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",100,0,0.080052
7,91cf68cc-9436-43e1-871d-33beef4d2337,Alsace Josmeyer Gewurztraminer Biodynamic,4.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",13,0,1.080052
8,defee34b-7775-4889-b703-92b730d85e70,Zamò Bianco,5.0,2.919948,"{'was_impossible': True, 'reason': 'Not enough...",7,2,2.080052
9,70c67292-57ce-4012-a939-5e6ddcb499f8,Müller Thurgau,1.0,2.919948,"{'was_impossible': True, 'reason': 'Not enough...",3,5,1.919948


In [12]:
knn.get_accuracy(predictions)

RMSE: 1.3297
MAE:  1.0962
MSE: 1.7682
Precision: 0.03
Recall: 0.02
the acc is 27.23


{'RMSE': 1.329730958310954,
 'MAE': 1.096201238298431,
 'MSE': 1.7681844214905682,
 'Precision': 0.029069767441860465,
 'Recall': 0.02131782945736434,
 'Accuracy': 27.225130890052355}

* best predictions


In [13]:
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
1,367d109c-cc13-43ec-b153-a9547a12bd4c,Nativi Gewürtztraminer,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,5,0.0
44,20e0b694-84a5-4f11-9191-598f924c2947,Morellino di Scansano 2021,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,8,0.0
62,13016d41-00bd-411c-83f5-2b95691696b7,Cabernet Franc,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",5,4,0.0
85,f9b653da-6c1b-4390-87c6-f74e42bf0a03,Vinho Verde Branco,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",1,1,0.0
109,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Brunello di Montalcino Riserva Vigna Paganelli,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",8,1,0.0
110,4ed273f7-1816-4f36-88c7-789125f011c7,Muscat de Beaumes-de-Venise,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",8,3,0.0
13,014e4ed1-6f8b-4b25-917d-c167a2acca17,Nespolino Rubicone Sangiovese - Merlot,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,4,0.0
150,4313b57d-e3f7-495e-bcfd-ff5c1bf15fcc,Nespolino Rubicone Sangiovese - Merlot,1.0,1.0,"{'actual_k': 3, 'was_impossible': False}",8,4,0.0
114,20e0b694-84a5-4f11-9191-598f924c2947,Morellino di Scansano 2021,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,8,0.0
137,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Langhe Nebbiolo,3.0,3.000432,"{'actual_k': 17, 'was_impossible': False}",100,1,0.000432


* worst predictions

In [14]:
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
57,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Ribolla Gialla,5.0,2.919948,"{'was_impossible': True, 'reason': 'Not enough...",52,2,2.080052
148,bc79b0e3-064d-4240-86de-e86499f577e8,3／17 Temata 2 Cape Crest,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",29,0,2.080052
149,bc79b0e3-064d-4240-86de-e86499f577e8,Boscarelli Vino Nobile de Montepulciano DOCG,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",29,0,2.080052
69,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Ammonite,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",23,0,2.080052
165,753594e4-9af7-44dd-a5ad-09a776352d7b,Dogtown Vineyard Zinfandel,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",0,0,2.080052
102,defee34b-7775-4889-b703-92b730d85e70,I Saltari Valpolicella Superiore,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",7,0,2.080052
100,4c0b31a0-1287-4efc-b24c-fe850084eafd,Chianti,5.0,2.919948,"{'was_impossible': True, 'reason': 'Not enough...",1,2,2.080052
97,2a62eecd-b0cd-4395-9f8c-7c912a208be2,Chateauneuf du pape,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",27,0,2.080052
115,dcd16582-14d2-4e00-a024-9b5886d29b29,Alma Mora Reserva Malbec,5.0,2.919948,"{'was_impossible': True, 'reason': 'User and/o...",6,0,2.080052
134,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Vieilles Vignes Rouge,5.0,2.919948,"{'was_impossible': True, 'reason': 'Not enough...",52,1,2.080052
