In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

# Load the ratings table into a DataFrame; the columns used further down are
# 'userId', 'wine' and 'rate'. The file name suggests this is a SMOTE-upsampled
# version of the cleaned data (not verifiable from this notebook).
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced by a path relative to a configurable data dir.
data = pd.read_csv('/home/bbruno/all_here/python course/vinnie/data/cleaned_data/upsampled_df_smote.csv')
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,userId,wine,rate
0,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso,3
1,df904a46-c461-4233-9d44-6ac11a8bbddc,Astrale,3
2,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Bricco Riva Bianca,3
3,bc79b0e3-064d-4240-86de-e86499f577e8,3/11 Syrah and blend #4,3
4,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso Classico Superiore Pojega,4


In [2]:
class Knn(KNNBasic):
    """surprise ``KNNBasic`` extended with notebook-oriented helpers.

    Besides the regular ``KNNBasic`` behaviour, an instance can:

    * wrap a ratings DataFrame into a surprise ``Dataset`` (``create_reader``),
    * run a hold-out fit and return the test predictions (``fit``),
    * inspect neighbours / baselines and score predictions
      (``estimated``, ``inspect_predictions``, ``get_accuracy``).

    Earlier commented-out experiments (a cross-validation wrapper and a
    previous ``get_accuracy``) were removed; ``fit`` accepts an explicit
    trainset so the object still works with surprise's ``cross_validate``.
    """

    def __init__(self, sim_options=None, bsl_options=None, **kwargs):
        """Create the algorithm.

        args:
          sim_options: similarity options dict forwarded to ``KNNBasic``.
          bsl_options: baseline options dict forwarded to ``KNNBasic``.
          **kwargs: any other ``KNNBasic`` keyword (e.g. ``k``, ``min_k``,
            ``verbose``).
        """
        # ``None`` sentinels instead of ``{}`` defaults: the surprise base class
        # inserts default keys (e.g. 'user_based') into the options dict, which
        # would otherwise mutate a dict shared by every instance. Copies are
        # passed so the caller's dicts are left untouched as well.
        KNNBasic.__init__(
            self,
            sim_options=dict(sim_options) if sim_options is not None else {},
            bsl_options=dict(bsl_options) if bsl_options is not None else {},
            **kwargs,
        )

    def create_reader(self, data, rating_scale=(1, 5)):
        """Wrap a ratings DataFrame into a surprise ``Dataset`` kept on ``self.data``.

        args:
          data: DataFrame holding the columns 'userId', 'wine' and 'rate'.
          rating_scale: (min, max) of the rating scale; defaults to (1, 5).
        """
        reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    def fit(self, trainset=None, test_size=0.2, random_state=None):
        """Fit the algorithm.

        Two modes:

        * ``fit()`` (original notebook usage): split ``self.data`` into
          train/test, fit on the train part and return the predictions made
          on the held-out test part.
        * ``fit(trainset)``: standard surprise behaviour, fits on the given
          trainset and returns ``self``. This keeps the object compatible with
          ``surprise.model_selection.cross_validate``.

        args:
          trainset: optional surprise ``Trainset``.
          test_size: share of ratings held out when ``trainset`` is None.
          random_state: seed for the split; pass an int for reproducible runs.
        returns:
          list of predictions (hold-out mode) or ``self`` (trainset mode).
        raises:
          RuntimeError: hold-out mode requested before ``create_reader``.
        """
        if trainset is not None:
            return KNNBasic.fit(self, trainset)

        if getattr(self, 'data', None) is None:
            raise RuntimeError("No data loaded: call create_reader(data) before fit().")

        trainset, testset = train_test_split(
            self.data, test_size=test_size, random_state=random_state
        )
        # KNNBasic.fit stores the trainset on self.trainset and already computes
        # self.sim, so the similarity matrix is not recomputed afterwards.
        predictions = KNNBasic.fit(self, trainset).test(testset)
        # Expose the user/item baselines for `estimated`.
        self.bu, self.bi = self.compute_baselines()
        return predictions

    def estimated(self, u, i):
        """Print the most similar raters of item ``i`` for user ``u`` and the baseline.

        The ids are used directly as indices into ``self.sim`` and
        ``self.trainset.ir``, i.e. they are trainset *inner* ids. The neighbour
        lookup indexes a user-user similarity matrix, so it is only meaningful
        with ``sim_options['user_based'] = True``.

        args:
          u: inner user id.
          i: inner item id.
        returns:
          the baseline estimate ``global_mean + bu[u] + bi[i]``.
        raises:
          PredictionImpossible: the user or the item is not in the trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        # Similarity between u and every user v that rated item i, best first.
        neighbors = [(v, self.sim[u, v]) for (v, _) in self.trainset.ir[i]]
        neighbors.sort(key=lambda pair: pair[1], reverse=True)

        print("The 5 nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:5]:
            print(f"user {v} with sim {sim_uv:1.2f}")

        # compute_baselines() returns the cached values when they were already
        # computed, so this also works after fit(trainset).
        bu, bi = self.compute_baselines()
        bsl = self.trainset.global_mean + bu[u] + bi[i]
        print(f"And the baseline estimate is: {bsl}")
        return bsl

    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the raw id of the user
        returns:
          the number of items rated by the user (0 if the user is not in the trainset).
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item (0 if the item is not in the trainset).
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    def inspect_predictions(self, predictions):
        """Print a column legend and return the predictions as a DataFrame.

        args:
          predictions: list of surprise predictions (uid, iid, rui, est, details).
        returns:
          DataFrame with the prediction fields plus 'Iu' (items rated by the
          user), 'Ui' (users that rated the item) and 'err' (|est - rui|).
        """
        print("uid means the user id and iid means the wine id\n")
        print("rui means the actual rating and est means the estimated rating\n")
        print("err means the error between the actual and the estimated rating\n")
        print("Iu means the number of items rated by given user\n")
        print("Ui means the number of users that have rated given item\n")

        df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df_pred['Iu'] = df_pred.uid.apply(self.get_Iu)
        df_pred['Ui'] = df_pred.iid.apply(self.get_Ui)
        df_pred['err'] = (df_pred.est - df_pred.rui).abs()
        return df_pred

    def get_accuracy(self, predictions, k=10, threshold=3.5):
        """Print RMSE, mean precision@k and recall@k; return the hit percentage.

        args:
          predictions: list of surprise predictions.
          k: cut-off used for precision/recall.
          threshold: rating from which an item counts as relevant/recommended.
        returns:
          percentage (0-100) of predictions whose rounded estimate equals the
          rounded true rating. Rounding uses Python's built-in ``round``
          (ties go to the even integer).
        """
        accuracy.rmse(predictions, verbose=True)

        precisions, recalls = self.precision_recall_at_k(predictions, k=k, threshold=threshold)

        # Average the per-user metrics over all users.
        precision = sum(precisions.values()) / len(precisions)
        recall = sum(recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        correct = sum(
            1 for _, _, true_r, est, _ in predictions if round(est) == round(true_r)
        )
        accuracy_percentage = correct / len(predictions)
        return accuracy_percentage * 100

    @staticmethod
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user

        args:
          predictions: iterable of (uid, iid, true_r, est, details) tuples.
          k: number of top-estimated items considered per user.
          threshold: rating from which an item counts as relevant/recommended.
        returns:
          (precisions, recalls): two dicts keyed by user id.
        """
        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():
            # Sort user ratings by estimated value
            user_ratings.sort(key=lambda x: x[0], reverse=True)
            top_k = user_ratings[:k]

            # Number of relevant items
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in top_k)

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in top_k
            )

            # Precision@K: Proportion of recommended items that are relevant
            # When n_rec_k is 0, Precision is undefined. We here set it to 0.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: Proportion of relevant items that are recommended
            # When n_rel is 0, Recall is undefined. We here set it to 0.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls

In [3]:
# Instantiate the recommender: user-based KNN using the 'pearson_baseline'
# similarity, with baselines estimated by SGD using the hyper-parameters below.
knn = Knn(
    sim_options = {'name': 'pearson_baseline','user_based': True}, 
    bsl_options={'method': 'sgd', 'learning_rate': 0.00005, 'n_epochs':20, 'reg_u': 12 , 'reg_i': 5}
    )

In [4]:
# Wrap the 'userId', 'wine' and 'rate' columns of `data` into a surprise Dataset
# (rating scale 1-5) and keep it on the model as `knn.data`.
knn.create_reader(data)

In [5]:
# Split knn.data 80/20, fit on the train part and collect the predictions made
# on the held-out 20%. The split is not seeded, so results differ between runs.
predictions = knn.fit()

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [6]:
# Show the nearest neighbours and the baseline estimate for user 140 / item 10.
# These numbers are used directly as indices into the similarity matrix, so they
# are presumably trainset inner ids (not the raw userId / wine values).
# NOTE(review): `estimated` raises PredictionImpossible when the trainset does not
# know the user or item; since the split is random, these ids may not exist on a re-run.
knn.estimated(140, 10)
# Print the trainset's global mean rating as a reference point for the baseline above.
print("If the baseline is {} then the value is a default value".format(knn.trainset.global_mean))

The 5 nearest neighbors of user 140 are:
user 9 with sim 0.00
And the baseline estimate is: 2.9254099915778244
If the baseline is 2.925196850393701 then the value is a default value


In [7]:
# Build the per-prediction table (adds Iu, Ui and the absolute error 'err').
df_pred = knn.inspect_predictions(predictions)
# Sort by error once and slice both ends, instead of sorting the frame twice.
df_pred_by_err = df_pred.sort_values(by='err')
best_pred = df_pred_by_err[:10]    # 10 smallest absolute errors
worst_pred = df_pred_by_err[-10:]  # 10 largest absolute errors
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Chardonnay,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",53,5,2.074803
1,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Chianti classico,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",96,1,0.0
2,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Rosso d’Asia,3.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",96,0,0.074803
3,83128cf9-f07f-4de1-97cf-cbde8d38bc3c,Sacred Hill Pinot Noir,1.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",4,1,1.925197
4,5ccd8030-047b-432c-a630-d784ab415756,Frescaripa Bardolino Classico,3.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",60,0,0.074803
5,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Chardonnay Cardellino,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",53,1,2.074803
6,91cf68cc-9436-43e1-871d-33beef4d2337,Nuits-Saint-Georges 1er Cru Aux Bousselots,3.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",13,1,0.074803
7,df904a46-c461-4233-9d44-6ac11a8bbddc,Il Bianco di Ciccio (Tralcetto),3.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",7,0,0.074803
8,5ccd8030-047b-432c-a630-d784ab415756,Theo Minges Chardonnay,3.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",60,0,0.074803
9,71d38b96-326c-4d01-afc4-b12c947a5c6b,Müller Thurgau,1.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",4,5,1.925197


In [8]:
# Prints RMSE plus the mean precision@k / recall@k (defaults k=10, threshold=3.5).
# The value shown as cell output is the percentage of predictions whose rounded
# estimate equals the rounded true rating.
knn.get_accuracy(predictions)

RMSE: 1.3152
Precision: 0.09
Recall: 0.07


30.89005235602094

* Best predictions (the 10 test predictions with the smallest absolute error)


In [9]:
# Display the 10 predictions with the smallest absolute error ('err').
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
86,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Brachetto Piemonte,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",10,3,0.0
183,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Amarone,3.0,3.0,"{'actual_k': 3, 'was_impossible': False}",96,3,0.0
56,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Il Mattaglio Blanc de Noirs,5.0,5.0,"{'actual_k': 1, 'was_impossible': False}",96,1,0.0
63,5ccd8030-047b-432c-a630-d784ab415756,Randersacker Spätburgunder Rosé,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",60,1,0.0
81,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Amarone,3.0,3.0,"{'actual_k': 3, 'was_impossible': False}",96,3,0.0
178,5ccd8030-047b-432c-a630-d784ab415756,Dettelbach Silvaner,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",60,1,0.0
92,23b6603a-928c-4e54-9c34-bdf3dcb45435,Morellino di Scansano 2021,1.0,1.0,"{'actual_k': 3, 'was_impossible': False}",3,6,0.0
106,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso (Classico Superiore),4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",60,2,0.0
26,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Amarone,3.0,3.0,"{'actual_k': 3, 'was_impossible': False}",96,3,0.0
110,014e4ed1-6f8b-4b25-917d-c167a2acca17,Nespolino Rubicone Sangiovese - Merlot,1.0,1.0,"{'actual_k': 3, 'was_impossible': False}",1,5,0.0


* Worst predictions (the 10 test predictions with the largest absolute error)

In [10]:
# Display the 10 predictions with the largest absolute error ('err').
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
150,bc79b0e3-064d-4240-86de-e86499f577e8,Veuve Ambal Methode Traditionelle Blanc de Blancs,5.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",36,0,2.074803
61,c3371365-d499-4675-aa68-dbc7b11f0777,Picpoul de Pinet Higues de Beauvignac,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",2,2,2.074803
80,4e9cb62e-a165-484b-ad78-cb215523cdd5,Le sol,5.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",1,0,2.074803
82,defee34b-7775-4889-b703-92b730d85e70,I Saltari Valpolicella Superiore,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",8,1,2.074803
83,bc79b0e3-064d-4240-86de-e86499f577e8,La Rocca Soave Classico,5.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",36,0,2.074803
85,bc79b0e3-064d-4240-86de-e86499f577e8,3／4 大亂鬥 10,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",36,1,2.074803
120,70c67292-57ce-4012-a939-5e6ddcb499f8,Sauvignon Blanc,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",4,4,2.074803
132,b0fbc6bd-4b1e-48cd-92f3-f49b7a505941,Taurasi Vigna Cinque Querce,5.0,2.925197,"{'was_impossible': True, 'reason': 'Not enough...",4,1,2.074803
67,7e32fee4-f7c4-4600-8a34-db7801ac79d5,Brut (Carte Jaune) Champagne,5.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",2,0,2.074803
190,457b85f5-7b46-4f52-85b9-59acb549b9a2,Refosco,5.0,2.925197,"{'was_impossible': True, 'reason': 'User and/o...",4,0,2.074803
