In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

# Load the down-sampled ratings table (columns shown below: userId, wine, rate)
# and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this file exists.
data = pd.read_csv(
    filepath_or_buffer='/home/bbruno/all_here/python course/vinnie/data/cleaned_data/downsampled_df_random.csv'
)
data.head()

Unnamed: 0,userId,wine,rate
0,976ec198-048f-405c-b6e6-b17ee1db1139,Nebbiolo d alba superiore,4
1,4eb7031c-da00-48f4-bc7f-0a1f1eda7cab,Malvasia legno,3
2,13016d41-00bd-411c-83f5-2b95691696b7,Bianco,3
3,13016d41-00bd-411c-83f5-2b95691696b7,Cabernet Franc,3
4,0a23a07a-8556-4ef6-85ee-d996f8ed619e,Talò Primitivo - Merlot,4


In [2]:
class Knn(KNNBasic):
    """KNNBasic with helpers for loading data, a hold-out fit and evaluation.

    Intended call order: ``create_reader(df)`` -> ``fit()`` -> pass the
    returned predictions to ``inspect_predictions`` / ``get_accuracy``.
    """

    def __init__(self, sim_options=None, bsl_options=None):
        """Create the algorithm.

        args:
          sim_options: similarity options forwarded to KNNBasic (default: {})
          bsl_options: baseline options forwarded to KNNBasic (default: {})
        """
        # Use None as the default instead of `{}`: a `{}` default is a single
        # dict object shared by every instance, and the base class keeps a
        # reference to whatever dict it receives.
        sim_options = {} if sim_options is None else sim_options
        bsl_options = {} if bsl_options is None else bsl_options
        KNNBasic.__init__(self, sim_options=sim_options, bsl_options=bsl_options)

    def create_reader(self, data, rating_scale=(1, 5)):
        """Build the surprise Dataset from a ratings DataFrame and store it in self.data.

        args:
          data: DataFrame with (at least) the columns 'userId', 'wine', 'rate'
          rating_scale: (min, max) of the ratings; defaults to (1, 5)
        """
        reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    def fit(self, *, test_size=0.2, random_state=None):
        """Split self.data, train on the train part and predict the test part.

        NOTE: this signature deliberately differs from the library's
        ``fit(trainset)``, so an instance cannot be handed to
        ``surprise.model_selection.cross_validate`` (which calls
        ``algo.fit(trainset)``). The new arguments are keyword-only so that
        such a call still fails loudly instead of mis-binding the trainset.

        args:
          test_size: fraction of ratings held out for testing (default 0.2)
          random_state: seed forwarded to train_test_split; pass an int to
            make the split (and therefore every metric) reproducible
        returns:
          the list of predictions for the held-out test set
        """
        self.trainset, testset = train_test_split(
            self.data, test_size=test_size, random_state=random_state
        )
        # KNNBasic.fit already computes and stores self.sim, so the similarity
        # matrix must not be computed a second time here.
        predictions = KNNBasic.fit(self, self.trainset).test(testset)
        # Baselines are only needed by `estimated`.
        self.bu, self.bi = self.compute_baselines()
        return predictions

    def estimated(self, u, i):
        """Print the 5 most similar raters of item i for user u and the baseline estimate.

        u and i are used directly as indices into self.sim / trainset.ir, i.e.
        they are inner ids, not the raw ids of the input DataFrame.
        NOTE(review): self.sim[u, v] is read as a user-user similarity, which
        presumably only makes sense with sim_options['user_based'] = True.

        args:
          u: inner user id
          i: inner item id
        returns:
          the baseline estimate global_mean + bu[u] + bi[i]
        raises:
          PredictionImpossible: if the user or the item is not in the trainset
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        # Similarity between u and every user v that rated item i, best first.
        neighbors = sorted(
            ((v, self.sim[u, v]) for (v, _) in self.trainset.ir[i]),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print("The 5 nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:5]:
            print(f"user {v} with sim {sim_uv:1.2f}")

        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        print(f"And the baseline estimate is: {bsl}")
        # Return the value itself (the original returned print(...), i.e. None).
        return bsl

    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the raw id of the user
        returns:
          the number of items rated by the user (0 if the user is not in the trainset)
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item (0 if the item is not in the trainset)
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    def inspect_predictions(self, predictions):
        """Print a column legend and return the predictions as a DataFrame.

        args:
          predictions: list of predictions as returned by fit()
        returns:
          DataFrame with columns uid, iid, rui, est, details plus
          Iu (items rated by the user), Ui (users that rated the item)
          and err (absolute error |est - rui|)
        """
        print("uid means the user id and iid means the wine id\n")
        print("rui means the actual rating and est means the estimated rating\n")
        print("err means the error between the actual and the estimated rating\n")
        print("Iu means the number of items rated by given user\n")
        print("Ui means the number of users that have rated given item\n")

        df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df_pred['Iu'] = df_pred.uid.apply(self.get_Iu)
        df_pred['Ui'] = df_pred.iid.apply(self.get_Ui)
        df_pred['err'] = (df_pred.est - df_pred.rui).abs()
        return df_pred

    def get_accuracy(self, predictions, k=10, threshold=3.5):
        """Print RMSE and mean precision/recall@k, return the exact-match accuracy.

        args:
          predictions: list of predictions as returned by fit()
          k: cut-off for precision/recall
          threshold: ratings/estimates >= threshold count as relevant/recommended
        returns:
          percentage (0-100) of predictions whose rounded estimate equals
          the rounded true rating
        """
        accuracy.rmse(predictions, verbose=True)

        precisions, recalls = self.precision_recall_at_k(predictions, k=k, threshold=threshold)

        # Per-user precision and recall are averaged over all users.
        precision = sum(precisions.values()) / len(precisions)
        recall = sum(recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        correct = sum(
            round(est) == round(true_r) for _, _, true_r, est, _ in predictions
        )
        return correct / len(predictions) * 100

    @staticmethod
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user

        args:
          predictions: iterable of (uid, iid, true_r, est, details) tuples
          k: number of top-estimated items considered per user
          threshold: ratings/estimates >= threshold count as relevant/recommended
        returns:
          (precisions, recalls): two dicts keyed by user id
        """
        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():
            # Sort user ratings by estimated value
            user_ratings.sort(key=lambda x: x[0], reverse=True)
            top_k = user_ratings[:k]

            # Number of relevant items
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in top_k)

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in top_k
            )

            # Precision@K: Proportion of recommended items that are relevant
            # When n_rec_k is 0, Precision is undefined. We here set it to 0.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: Proportion of relevant items that are recommended
            # When n_rel is 0, Recall is undefined. We here set it to 0.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls

In [3]:
# Instantiate the recommender: user-based neighbours with the
# 'pearson_baseline' similarity, baselines estimated with SGD.
knn = Knn(
    sim_options=dict(name='pearson_baseline', user_based=True),
    bsl_options=dict(method='sgd', learning_rate=0.00005, n_epochs=20, reg_u=12, reg_i=5),
)

In [4]:
# Wrap the userId / wine / rate columns of `data` in a surprise Dataset
# (stored on the object as knn.data) so that fit() can split and train on it.
knn.create_reader(data)

In [5]:
# Hold out 20 % of the ratings, train on the rest and predict the held-out part.
# The split is not seeded here, so every metric below can change between runs.
predictions = knn.fit()

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [6]:
# 140 and 10 are used by `estimated` directly as indices into the similarity
# matrix and trainset.ir, i.e. inner ids, not the UUID / wine-name raw ids.
knn.estimated(140, 10)
print(f"If the baseline is {knn.trainset.global_mean} then the value is a default value")

The 5 nearest neighbors of user 140 are:
user 9 with sim 0.00
And the baseline estimate is: 3.7252412478040933
If the baseline is 3.722689075630252 then the value is a default value


In [7]:
df_pred = knn.inspect_predictions(predictions)
# Sort by absolute error once and take both ends, instead of sorting the
# same frame twice with the same key.
pred_by_err = df_pred.sort_values(by='err')
best_pred = pred_by_err.head(10)    # ten smallest errors
worst_pred = pred_by_err.tail(10)   # ten largest errors
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,838e0d9d-27e9-4f89-9eab-ba2c2f412759,Pigato Riviera Ligure di Ponente,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",6,1,0.0
1,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Pistus,3.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",15,0,0.722689
2,df904a46-c461-4233-9d44-6ac11a8bbddc,Passerina Terre di Chieti IGP,4.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",8,0,0.277311
3,a0ef6b50-093e-4ae1-8e3c-58a2a17d2bb8,The Hassar Grill Anniversary Blend,3.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",11,0,0.722689
4,bc8f3005-c2c6-4277-9fd7-340248f4e7ec,Merlot,5.0,3.722689,"{'was_impossible': True, 'reason': 'Not enough...",8,2,1.277311
5,f7fb310d-87f1-441a-94ab-f526f66b9ec1,'Cuvée des Paladins' Erbaluce di Caluso,5.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",0,0,1.277311
6,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,Pinot grigio,2.0,4.0,"{'actual_k': 1, 'was_impossible': False}",17,1,2.0
7,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Imperial Brut,3.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",15,0,0.722689
8,e786582d-0deb-4e55-969e-065508ca1138,Pinot Noir,3.0,3.722689,"{'was_impossible': True, 'reason': 'Not enough...",2,2,0.722689
9,5ccd8030-047b-432c-a630-d784ab415756,Barbera del Monferrato,3.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",13,0,0.722689


In [8]:
# Prints RMSE and the user-averaged precision/recall at k=10 (threshold 3.5).
# The displayed return value is the percentage of test predictions whose
# rounded estimate equals the rounded true rating.
knn.get_accuracy(predictions)

RMSE: 0.9066
Precision: 0.63
Recall: 0.72


45.378151260504204

* best predictions


In [9]:
# The ten test-set predictions with the smallest absolute error (`err`).
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,838e0d9d-27e9-4f89-9eab-ba2c2f412759,Pigato Riviera Ligure di Ponente,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",6,1,0.0
46,f9b653da-6c1b-4390-87c6-f74e42bf0a03,Vinho Verde Branco,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",1,1,0.0
40,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Pinot Grigio,4.0,4.0,"{'actual_k': 2, 'was_impossible': False}",11,4,0.0
10,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Brunello di Montalcino Riserva Vigna Paganelli,4.0,4.0,"{'actual_k': 2, 'was_impossible': False}",11,2,0.0
39,13016d41-00bd-411c-83f5-2b95691696b7,Vitovska acciaio,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",17,4,0.0
106,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Sangiovese Superiore,4.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",17,0,0.277311
41,8e521c78-d748-475a-9514-a8ae8895eb23,Cabernet Sauvignon - Merlot,4.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",0,0,0.277311
43,23b6603a-928c-4e54-9c34-bdf3dcb45435,Bardolino,4.0,3.722689,"{'was_impossible': True, 'reason': 'Not enough...",1,1,0.277311
44,77b7936b-9797-4d14-81bd-a365d777f83d,Montepulciano d'Abruzzo,4.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",1,0,0.277311
45,3d88fe74-6881-42bd-bb97-d67f2db649d7,Reserve Merlot,4.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",2,0,0.277311


* worst predictions

In [10]:
# The ten test-set predictions with the largest absolute error (`err`), in ascending order.
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
88,9a3cc655-c88e-4c1b-95e2-add7579ff4c2,Terrano,2.0,3.722689,"{'was_impossible': True, 'reason': 'Not enough...",7,7,1.722689
105,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Mendoza Chardonnay,2.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",16,0,1.722689
80,441ac739-42f8-48d2-9526-fdedf2d4852f,Terrano,2.0,3.722689,"{'was_impossible': True, 'reason': 'Not enough...",2,7,1.722689
81,877019f9-3c77-491d-912e-58b2404aed47,Chianti,2.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",0,3,1.722689
12,a0ef6b50-093e-4ae1-8e3c-58a2a17d2bb8,Fairbridge,2.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",11,0,1.722689
18,bc79b0e3-064d-4240-86de-e86499f577e8,Les Grands Blancs,2.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",18,0,1.722689
107,3da6eec4-bd99-4370-be23-676baf750f19,Monterey Pinot Noir,2.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",2,0,1.722689
6,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,Pinot grigio,2.0,4.0,"{'actual_k': 1, 'was_impossible': False}",17,1,2.0
92,877019f9-3c77-491d-912e-58b2404aed47,Chianti,1.0,3.722689,"{'was_impossible': True, 'reason': 'User and/o...",0,3,2.722689
83,014e4ed1-6f8b-4b25-917d-c167a2acca17,New Wine,1.0,4.0,"{'actual_k': 1, 'was_impossible': False}",1,2,3.0
