In [12]:
#Importing the data
import pandas as pd
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

#Importing the data
data = pd.read_csv('/home/bbruno/all_here/python course/vinnie/data/cleaned_data/upsampled_df_smote.csv')
data.head()

Unnamed: 0,userId,wine,rate
0,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso,3
1,df904a46-c461-4233-9d44-6ac11a8bbddc,Astrale,3
2,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Bricco Riva Bianca,3
3,bc79b0e3-064d-4240-86de-e86499f577e8,3/11 Syrah and blend #4,3
4,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso Classico Superiore Pojega,4


In [26]:
class Knn(KNNBasic):
    def __init__(self, k, sim_options={}, bsl_options={}):
        self.k = k
        super().__init__(sim_options=sim_options, bsl_options=bsl_options)
    
    def create_reader(self, data):
        reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    def fit (self):
        self.trainset, testset = train_test_split(self.data, test_size=0.2)
        # predictions = KNNBasic.fit(self, self.trainset).test(testset)
        KNNBasic.fit(self, self.trainset)
        train_predictions = KNNBasic.test(self, self.trainset.build_testset())
        
        # test_predictions = KNNBasic.test(self, testset)
        predictions = KNNBasic.test(self, testset)
        
        # calculate RMSE
        train_rmse = accuracy.rmse(train_predictions)
        # test_rmse = accuracy.rmse(test_predictions)
        test_rmse = accuracy.rmse(predictions)
        print(f"RMSE on Training Set: {train_rmse}")
        print(f"RMSE on Test Set: {test_rmse}")

        self.sim = self.compute_similarities()
        self.bu, self.bi = self.compute_baselines()
        return predictions
    
    def get_user_name(self, uid):
        """Return the name of a user from their id.
        Args:
            uid(int): The raw id of the user.
        Returns:
            The name of the user.
        """
        return self.trainset.to_raw_uid(uid)
    
    def get_item_name(self, iid):
        """Return the name of an item from their id.
        Args:
            iid(int): The raw id of the item.
        Returns:
            The name of the item.
        """
        return self.trainset.to_raw_iid(iid)
    
    def get_neighbors_uid(self, user_id, k=10):
        neighbor_ids = self.get_neighbors(user_id, k=10)
        neighbor_names = [self.get_user_name(uid) for uid in neighbor_ids]
        return neighbor_names
    def get_neighbors_iid(self, item_id, k=10):
        neighbor_ids = self.get_neighbors(item_id, k=10)
        neighbor_names = [self.get_item_name(iid) for iid in neighbor_ids]
        return neighbor_names
        
    def get_top_n_for_user(self, predictions,user_id, n=10):
        """Return the top-N recommendation for a user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        """

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                top_n[uid].append((iid, est))

        # Then sort the predictions for the user and retrieve the k highest ones.
        user_ratings = top_n[user_id]
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[user_id] = user_ratings[:n]

        return top_n[user_id]
    
    def get_top_n_users_for_item(self, predictions, item_id, n=10):
        """Return the top-N users for a specific item from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            item_id: The id of the item for which to get the top-N users.
            n(int): The number of users to output for the item. Default is 10.

        Returns:
        A list of tuples:
            [(raw user id, rating estimation), ...] of size n.
        """

        # First map the predictions to each item.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            if iid == item_id:
                top_n[iid].append((uid, est))

        # Then sort the predictions for the item and retrieve the k highest ones.
        item_ratings = top_n[item_id]
        item_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[item_id] = item_ratings[:n]

        return top_n[item_id]

    def estimated(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")
        
        #Compute similarities between u and v, where v describes all other
        #users that have also rated item i.
        neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
        # Sort these neighbors by similarity
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)

        print("The 5 nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:5]:
            print(f"user {v} with sim {sim_uv:1.15f}")

        # ... Aaaaand return the baseline estimate anyway ;)
        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        return print(f"And the baseline estimate is: {bsl}")
    
    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the id of the user
        returns:
          the number of items rated by the user
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item.
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    def inspect_predictions(self, predictions):
        print(f"uid means the user id and iid means the wine id\n")
        print(f"rui means the actual rating and est means the estimated rating\n")
        print(f"err means the error between the actual and the estimated rating\n")
        print(f"Iu means the number of items rated by given user\n")
        print(f"Ui means the number of users that have rated given item\n")
        # Create a dataframe with the predictions
        df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df_pred['Iu'] = df_pred.uid.apply(self.get_Iu)
        df_pred['Ui'] = df_pred.iid.apply(self.get_Ui)
        df_pred['err'] = abs(df_pred.est - df_pred.rui)
        return df_pred
    
    def get_accuracy(self, predictions, k=10, threshold=3.5):
        #Compute RMSE
        rmse = accuracy.rmse(predictions, verbose=True)
        #Compute MAE
        mae = accuracy.mae(predictions, verbose=True)
        # Compute MSE
        mse = accuracy.mse(predictions, verbose=True)

        #Compute precision and recall
        precisions, recalls = self.precision_recall_at_k(predictions, k=k, threshold=threshold)

        #Precision and recall can then be averaged over all users
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        #Count correct predictions
        correct = 0
        for uid, iid, true_r, est, _ in predictions:
            if round(est) == round(true_r):
                correct += 1

        #Compute accuracy
        accuracy_percentage = correct / len(predictions)
        accuracy_percentage = accuracy_percentage * 100
        print(f"the acc is {accuracy_percentage:.2f}")

        #Return a dictionary with the metrics
        return {'RMSE': rmse, 'MAE': mae, 'MSE': mse, 'Precision': precision, 'Recall': recall, 'Accuracy': accuracy_percentage}
        
    @staticmethod 
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user"""
        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():
            # Sort user ratings by estimated value
            user_ratings.sort(key=lambda x: x[0], reverse=True)

            # Number of relevant items
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in user_ratings[:k]
            )

            # Precision@K: Proportion of recommended items that are relevant
            # When n_rec_k is 0, Precision is undefined. We here set it to 0.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: Proportion of relevant items that are recommended
            # When n_rel is 0, Recall is undefined. We here set it to 0.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
        return precisions, recalls
        

In [14]:
knn = Knn(k=5,
    sim_options = {'name': 'pearson_baseline','user_based': False}, 
    bsl_options={'method': 'sgd', 'learning_rate': 0.05, 'n_epochs':60, 'reg_u': 12 , 'reg_i': 5}
    )

In [15]:
knn.create_reader(data)

In [16]:
predictions = knn.fit()

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.1269
RMSE: 1.3507
RMSE on Training Set: 0.12688369458321658
RMSE on Test Set: 1.3507334984456472
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [17]:
#recommends items for a given user from predictions

# user_id = '7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7'
# top_n = knn.get_top_n_for_user(predictions, user_id)
# print(user_id, [iid for (iid, _) in top_n])
######### better print
user_id = '7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7'
top_n = knn.get_top_n_for_user(predictions, user_id)
print(f"Top items for the user '{user_id}' are:")
for i, (iid, _) in enumerate(top_n, 1):
    print(f"{i}. Item ID: {iid}")

Top items for the user '7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7' are:
1. Item ID: Fleurie Grand Pre
2. Item ID: Marlborough
3. Item ID: Valle de Casablanca Sauvignon
4. Item ID: Pinot blanc


In [18]:
# recommneds users for a given item from predictions

# item_id = 'Chardonnay'
# top_n = knn.get_top_n_users_for_item(predictions, item_id)
# print(item_id, [uid for (uid, _) in top_n])
########## better print
item_id = 'Chardonnay'
top_n = knn.get_top_n_users_for_item(predictions, item_id)
print(f"Top users for the item '{item_id}' are:")
for i, (uid, _) in enumerate(top_n, 1):
    print(f"{i}. User ID: {uid}")

Top users for the item 'Chardonnay' are:
1. User ID: 9a3cc655-c88e-4c1b-95e2-add7579ff4c2
2. User ID: 36fec3ea-e183-491d-b65a-6c7ac90fafa5


In [19]:
#get the KNN for a user from the model
knn.get_neighbors_uid(user_id=5)

['5ccd8030-047b-432c-a630-d784ab415756',
 'bc8f3005-c2c6-4277-9fd7-340248f4e7ec',
 'a8cc530e-e3fe-4a50-a043-a376ea599bf2',
 '71d38b96-326c-4d01-afc4-b12c947a5c6b',
 'ccdd1b9a-2056-4d76-b922-f427b7932ba7',
 'f8c400a6-ded1-40b8-95d2-a263d05c30ce',
 'bc79b0e3-064d-4240-86de-e86499f577e8',
 '2a62eecd-b0cd-4395-9f8c-7c912a208be2',
 '4ed273f7-1816-4f36-88c7-789125f011c7',
 'cabb87a0-6498-438a-beb7-676f72bddd09']

In [20]:
#get the KNN for an item from the model
knn.get_neighbors_iid(item_id=20)

['Soave classico',
 'Terrano',
 'Nespolino Rubicone Sangiovese - Merlot',
 'Weissburgunder Trocken',
 'Valle Central Merlot',
 'AIX Rose Provence 2019 France Blended',
 'Amarone della Valpolicella ',
 '3/11 Syrah and blend #10',
 'Langhe Nebbiolo',
 'The long coast cabernet sauvignon']

In [21]:
knn.estimated(16, 10)
print("If the baseline is {} then the value is a default value".format(knn.trainset.global_mean))

The 5 nearest neighbors of user 16 are:
user 6 with sim 0.000000000000000
And the baseline estimate is: 1.8736415729156795
If the baseline is 2.9304461942257216 then the value is a default value


In [22]:
df_pred = knn.inspect_predictions(predictions)
best_pred = df_pred.sort_values(by='err')[:10]
worst_pred = df_pred.sort_values(by='err')[-10:]
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,4ed273f7-1816-4f36-88c7-789125f011c7,Müller Thurgau,1.0,2.930446,"{'was_impossible': True, 'reason': 'Not enough...",8,4,1.930446
1,c60f0ff1-ed30-4df6-a612-b9ae7ca0814b,Pinot nero,1.0,2.930446,"{'was_impossible': True, 'reason': 'Not enough...",7,1,1.930446
2,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Bortolomiol Vintage Extra Dry Millesimato,3.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",51,0,0.069554
3,86ddb2c4-016f-47ed-8cf9-aa0a3564b7f8,Puntalice,5.0,2.930446,"{'was_impossible': True, 'reason': 'Not enough...",1,1,2.069554
4,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Terrebianche col fondo,4.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",87,0,1.069554
5,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Sior Berto,4.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",51,0,1.069554
6,5ccd8030-047b-432c-a630-d784ab415756,Sauvignon Blanc,4.0,3.002837,"{'actual_k': 10, 'was_impossible': False}",69,4,0.997163
7,eadb1fb9-7260-4618-aa5a-8f2ebdca2077,Cirò,1.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",5,0,1.930446
8,5ccd8030-047b-432c-a630-d784ab415756,Ingelheimer Frühburgunder,4.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",69,0,1.069554
9,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Brunello di Montalcino Vigna Manapetra,3.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",87,0,0.069554


In [23]:
knn.get_accuracy(predictions)

RMSE: 1.3507
MAE:  1.0731
MSE: 1.8245
Precision: 0.07
Recall: 0.06
the acc is 30.89


{'RMSE': 1.3507334984456472,
 'MAE': 1.0731221074529793,
 'MSE': 1.824480983823217,
 'Precision': 0.06547619047619048,
 'Recall': 0.057539682539682536,
 'Accuracy': 30.89005235602094}

* best predictions


In [24]:
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
47,4313b57d-e3f7-495e-bcfd-ff5c1bf15fcc,Nespolino Rubicone Sangiovese - Merlot,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",5,5,0.0
131,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Brunello di Montalcino Riserva Vigna Paganelli,4.0,4.0,"{'actual_k': 3, 'was_impossible': False}",10,1,0.0
33,20e0b694-84a5-4f11-9191-598f924c2947,Morellino di Scansano 2021,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,9,0.0
35,5ccd8030-047b-432c-a630-d784ab415756,Randersacker Spätburgunder Rosé,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",69,1,0.0
97,4eb7031c-da00-48f4-bc7f-0a1f1eda7cab,Bollicina,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",8,4,0.0
163,2ae69fac-d491-412f-8164-ccaf59a230c2,Ronco del Balbo Merlot,5.0,5.0,"{'actual_k': 1, 'was_impossible': False}",3,2,0.0
13,2ae69fac-d491-412f-8164-ccaf59a230c2,Ronco del Balbo Merlot,5.0,5.0,"{'actual_k': 1, 'was_impossible': False}",3,2,0.0
107,f9b653da-6c1b-4390-87c6-f74e42bf0a03,Vinho Verde Branco,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",1,1,0.0
14,776b5af7-bb8a-484a-bddb-dadf5dab4621,Muscat de Beaumes-de-Venise,5.0,5.0,"{'actual_k': 1, 'was_impossible': False}",4,3,0.0
136,20e0b694-84a5-4f11-9191-598f924c2947,Morellino di Scansano 2021,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",2,9,0.0


* worst predictions

In [25]:
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
110,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Chardonnay,5.0,2.930446,"{'was_impossible': True, 'reason': 'Not enough...",51,5,2.069554
109,35bcb30c-a1b8-49c0-85e3-aaa84e30f50d,Rubicon,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",0,1,2.069554
59,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Cabernet Sauvignon,5.0,2.930446,"{'was_impossible': True, 'reason': 'Not enough...",51,2,2.069554
105,00324bc3-0b35-4051-bc6d-9c258a4252c2,Ettore germano riesling,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",0,0,2.069554
70,bc79b0e3-064d-4240-86de-e86499f577e8,La Rocca Soave Classico,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",36,0,2.069554
78,018ad2f2-d320-46c2-8eff-56af0c2ac2b8,Cinque Terre Sciacchetrà Riserva,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",2,0,2.069554
79,ff89d8c1-48a3-49b5-af95-62d0ce9ecf55,Lyric chardonnay 2016,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",0,0,2.069554
140,bec96c0b-5b0b-4992-8756-e0d3b704afb5,Aspromonte,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",5,0,2.069554
40,bc79b0e3-064d-4240-86de-e86499f577e8,3／17 Temata 2 Cape Crest,5.0,2.930446,"{'was_impossible': True, 'reason': 'User and/o...",36,0,2.069554
77,4ed273f7-1816-4f36-88c7-789125f011c7,Moscato d'Asti,1.0,5.0,"{'actual_k': 1, 'was_impossible': False}",8,3,4.0
