In [1]:
#Importing the data
import pandas as pd
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict
# Load the cleaned wine-ratings table (columns used later: userId, wine, rate).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of df_under.csv.
RATINGS_CSV = '/home/bbruno/all_here/python course/vinnie/data/cleaned_data/df_under.csv'
data = pd.read_csv(RATINGS_CSV)
data.head()

Unnamed: 0,userId,wine,rate
0,91cf68cc-9436-43e1-871d-33beef4d2337,Vistamar Cabernet Sauvignon Reserva,1
1,877019f9-3c77-491d-912e-58b2404aed47,Chianti,1
2,eadb1fb9-7260-4618-aa5a-8f2ebdca2077,Bread & Butter Pinor Noir 2020,1
3,c88644be-2c9b-42f0-9c4b-7f8e81795f43,Terrano,1
4,bc79b0e3-064d-4240-86de-e86499f577e8,2/5 大亂鬥 2,1


In [2]:
class Knn(KNNBasic):
    """KNNBasic extended with data-loading, validation and inspection helpers.

    Typical use::

        knn = Knn(sim_options=..., bsl_options=...)
        knn.create_reader(df)              # builds knn.data
        knn.cross_validate(cv=3)           # out-of-sample RMSE
        predictions = knn.fit(knn.data.build_full_trainset())
        knn.inspect_predictions(predictions)
        knn.get_accuracy(predictions)
    """

    def __init__(self, sim_options=None, bsl_options=None, **kwargs):
        """Create the algorithm.

        args:
          sim_options: similarity configuration dict passed to KNNBasic
            (``None`` means an empty dict).
          bsl_options: baseline configuration dict passed to KNNBasic
            (``None`` means an empty dict).
          **kwargs: any further KNNBasic keyword arguments (e.g. ``k``,
            ``min_k``, ``verbose``), forwarded unchanged.
        """
        # Use fresh copies instead of shared ``{}`` defaults: Surprise's base
        # class may write default keys (such as 'user_based') into the dict it
        # is given, which would otherwise leak between instances and callers.
        sim_options = dict(sim_options) if sim_options is not None else {}
        bsl_options = dict(bsl_options) if bsl_options is not None else {}
        KNNBasic.__init__(
            self, sim_options=sim_options, bsl_options=bsl_options, **kwargs
        )

    def create_reader(self, data, rating_scale=(1, 5)):
        """Wrap a ratings DataFrame as a Surprise dataset stored in ``self.data``.

        args:
          data: DataFrame containing the columns 'userId', 'wine' and 'rate'.
          rating_scale: (min, max) of the rating values; defaults to (1, 5).
        """
        reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    def cross_validate(self, measures=None, cv=3, verbose=False):
        """Run k-fold cross-validation on ``self.data`` and print mean scores.

        args:
          measures: iterable of measure names; defaults to ['RMSE'].
          cv: number of folds (or a Surprise cross-validation iterator).
          verbose: forwarded to Surprise's cross_validate.
        returns:
          the results dict produced by Surprise's cross_validate.
        """
        measures = ['RMSE'] if measures is None else list(measures)
        # Inside a method body the bare name resolves to the module-level
        # function imported from surprise.model_selection, not to this method.
        results = cross_validate(
            self, self.data, measures=measures, cv=cv, verbose=verbose
        )
        for measure in measures:
            print(f'{measure}: {results["test_" + measure.lower()].mean()}')
        return results

    def fit(self, trainset):
        """Train on ``trainset`` and return predictions for that same trainset.

        The returned predictions are in-sample: every rating being predicted
        was also used for training, so metrics computed from them are
        optimistic. For an out-of-sample estimate use ``cross_validate`` or
        fit on the train part of
        ``surprise.model_selection.train_test_split(self.data, test_size=0.2)``
        and call ``self.test(testset)`` on the held-out part.

        Unlike the Surprise convention, this returns the predictions rather
        than ``self``; that is kept for backward compatibility.

        args:
          trainset: a Surprise Trainset.
        returns:
          list of Surprise Prediction objects for trainset.build_testset().
        """
        # KNNBasic.fit already stores self.trainset and builds self.sim, so the
        # similarity matrix is not recomputed here.
        KNNBasic.fit(self, trainset)
        # estimated() needs the baselines; Surprise is expected to reuse the
        # ones computed during the similarity step when they already exist.
        self.bu, self.bi = self.compute_baselines()
        return self.test(trainset.build_testset())

    def estimated(self, u, i):
        """Print the nearest neighbours of user ``u`` for item ``i`` and the baseline.

        args:
          u: inner user id (as used by the trainset).
          i: inner item id (as used by the trainset).
        returns:
          the baseline estimate global_mean + bu[u] + bi[i].
        raises:
          PredictionImpossible: if the user or the item is not in the trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        # Similarity between u and every user v that rated item i, most similar
        # first. NOTE(review): indexing self.sim by user ids assumes a
        # user-based similarity ('user_based': True) -- confirm before using
        # this with item-based options.
        neighbors = sorted(
            ((v, self.sim[u, v]) for (v, _) in self.trainset.ir[i]),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print("The 5 nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:5]:
            print(f"user {v} with sim {sim_uv:1.15f}")

        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        print(f"And the baseline estimate is: {bsl}")
        # Return the value itself (the original returned print()'s None).
        return bsl

    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the raw id of the user
        returns:
          the number of items rated by the user (0 if the user is unknown).
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item (0 if the item is unknown).
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    def inspect_predictions(self, predictions):
        """Print a column legend and return the predictions as a DataFrame.

        args:
          predictions: iterable of (uid, iid, rui, est, details) tuples.
        returns:
          DataFrame with columns uid, iid, rui, est, details plus
          Iu (items rated by the user), Ui (users who rated the item) and
          err (absolute difference between est and rui).
        """
        legend = (
            "uid means the user id and iid means the wine id\n",
            "rui means the actual rating and est means the estimated rating\n",
            "err means the error between the actual and the estimated rating\n",
            "Iu means the number of items rated by given user\n",
            "Ui means the number of users that have rated given item\n",
        )
        for line in legend:
            print(line)

        df_pred = pd.DataFrame(
            predictions, columns=['uid', 'iid', 'rui', 'est', 'details']
        )
        df_pred['Iu'] = df_pred['uid'].apply(self.get_Iu)
        df_pred['Ui'] = df_pred['iid'].apply(self.get_Ui)
        df_pred['err'] = (df_pred['est'] - df_pred['rui']).abs()
        return df_pred

    def get_accuracy(self, predictions, k=10, threshold=3.5):
        """Print RMSE, precision@k and recall@k, and return rounded-match accuracy.

        args:
          predictions: list of (uid, iid, true_r, est, details) tuples.
          k: cut-off for the per-user top-k used by precision/recall.
          threshold: rating at or above which an item counts as relevant
            (true rating) or recommended (estimate).
        returns:
          percentage (0-100) of predictions whose rounded estimate equals the
          rounded true rating.
        """
        accuracy.rmse(predictions, verbose=True)

        precisions, recalls = self.precision_recall_at_k(
            predictions, k=k, threshold=threshold
        )

        # Average the per-user values over all users.
        precision = sum(precisions.values()) / len(precisions)
        recall = sum(recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        correct = sum(
            1 for _, _, true_r, est, _ in predictions
            if round(est) == round(true_r)
        )
        return correct / len(predictions) * 100

    @staticmethod
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user

        args:
          predictions: iterable of (uid, iid, true_r, est, details) tuples.
          k: number of top-estimated items considered per user.
          threshold: rating at or above which an item is relevant/recommended.
        returns:
          (precisions, recalls): two dicts keyed by uid.
        """
        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = {}
        recalls = {}
        for uid, user_ratings in user_est_true.items():
            # Highest estimates first; the first k form the recommendation list.
            user_ratings.sort(key=lambda x: x[0], reverse=True)
            top_k = user_ratings[:k]

            # Number of relevant items (over all of the user's ratings)
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in top_k)

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in top_k
            )

            # Precision@K: proportion of recommended items that are relevant.
            # Undefined when nothing is recommended; set to 0 in that case.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: proportion of relevant items that are recommended.
            # Undefined when nothing is relevant; set to 0 in that case.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls
    
    # My earlier get_accuracy, which worked correctly, was extended to add precision and recall; the old version is kept below, commented out, for reference.
    # def get_accuracy(self, predictions):
    #     # # Compute RMSE
    #     # predictions = KNNBasic.test(self, testset)
    #     accuracy.rmse(predictions, verbose=True)
    #     # Count correct predictions
    #     correct = 0
    #     for uid, iid, true_r, est, _ in predictions:
    #         if round(est) == round(true_r):
    #             correct += 1

    #     # Compute accuracy
    #     accuracy_percentage = correct / len(predictions)
    #     return accuracy_percentage * 100

Here's a brief explanation of each step:

1. `knn.create_reader(data)`: This loads your data into the KNN model.
2. `knn.cross_validate(cv=3)`: This performs cross-validation on your data with 3 folds. It's good to do this before training your model to get an idea of how well it might perform.
3. `trainset = knn.data.build_full_trainset()`: This builds the full trainset from your data.
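4. `predictions = knn.fit(trainset)`: This trains your model on the full trainset and generates predictions for those same training ratings, so any metric computed from `predictions` is an in-sample (optimistic) figure rather than an estimate of performance on unseen data.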
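5. `knn.estimated(14, 355)`: This prints the (up to) five most similar users among those who rated item 355, together with the baseline estimate for user 14 on that item. Both arguments are Surprise inner ids (the integers assigned when the trainset is built), not the raw `userId`/`wine` values.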
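6. `df_pred = knn.inspect_predictions(predictions)`: This builds a DataFrame of the predictions and adds the columns `Iu` (number of items rated by the user), `Ui` (number of users who rated the item) and `err` (absolute error between actual and estimated rating).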
7. `df_pred.head(10)`: This displays the first 10 rows of your predictions.

For your next steps, you might want to:
- Check the accuracy of your model
- Try different values for `cv` to see how it affects your model's performance
- Experiment with different parameters for your KNN model to see if you can improve its performance.

In [3]:
# Similarity: user-based Pearson correlation centred on baseline estimates.
similarity_config = {'name': 'pearson_baseline', 'user_based': True}

# Baselines fitted with SGD.
# NOTE(review): per the Surprise baseline docs, 'reg_u' / 'reg_i' appear to be
# ALS options while SGD reads 'reg' -- these two keys are probably ignored
# with method='sgd'; confirm which optimiser/regularisation was intended.
baseline_config = {
    'method': 'sgd',
    'learning_rate': 0.00005,
    'n_epochs': 20,
    'reg_u': 12,
    'reg_i': 5,
}

knn = Knn(sim_options=similarity_config, bsl_options=baseline_config)

In [4]:
# Wrap the userId / wine / rate columns of `data` as a Surprise dataset; the result is stored on knn.data.
knn.create_reader(data)

In [5]:
# 3-fold cross-validation on knn.data: prints the mean RMSE over the folds and returns the full results dict.
knn.cross_validate(cv=3)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0937387437434463


{'test_rmse': array([1.18827139, 1.05287607, 1.04006877]),
 'fit_time': (0.005422115325927734,
  0.005880832672119141,
  0.006092548370361328),
 'test_time': (0.001219034194946289,
  0.0008671283721923828,
  0.0022821426391601562)}

In [6]:
# Train on every available rating (no hold-out split).
trainset = knn.data.build_full_trainset()
# Knn.fit returns predictions for the ratings it was just trained on, so `predictions` is in-sample.
predictions = knn.fit(trainset)
# Cross-validation is run separately in the earlier cell via knn.cross_validate(cv=3).

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [7]:
# predictions = knn.fit()

In [8]:
# Arguments are inner ids of the fitted trainset: user 14, item 355.
knn.estimated(14, 355)
# Print the global mean next to it for comparison with the baseline above.
print(f"If the baseline is {knn.trainset.global_mean} then the value is a default value")

The 5 nearest neighbors of user 14 are:
user 6 with sim 0.000000000000000
And the baseline estimate is: 3.6176787644320485
If the baseline is 3.6339144215530905 then the value is a default value


In [9]:
# Tabulate the predictions, then rank them once by absolute error.
df_pred = knn.inspect_predictions(predictions)
pred_by_err = df_pred.sort_values(by='err')
best_pred = pred_by_err.head(10)    # ten smallest errors
worst_pred = pred_by_err.tail(10)   # ten largest errors
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,91cf68cc-9436-43e1-871d-33beef4d2337,Vistamar Cabernet Sauvignon Reserva,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
1,91cf68cc-9436-43e1-871d-33beef4d2337,Kellermeister Barossa Vineyards,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
2,91cf68cc-9436-43e1-871d-33beef4d2337,Hermitage,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
3,91cf68cc-9436-43e1-871d-33beef4d2337,Legitimo Carinena,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
4,91cf68cc-9436-43e1-871d-33beef4d2337,St. Hallett Barossa,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
5,91cf68cc-9436-43e1-871d-33beef4d2337,Bubbly Pinot Grigio (Champagne),2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
6,91cf68cc-9436-43e1-871d-33beef4d2337,Sacred Hill Whitecliff Marlborough,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
7,91cf68cc-9436-43e1-871d-33beef4d2337,Sacred Hill Pinot Noir,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
8,91cf68cc-9436-43e1-871d-33beef4d2337,Bella Cabernet Sauvignon,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
9,91cf68cc-9436-43e1-871d-33beef4d2337,Pinot Grigio,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",19,6,0.0


In [10]:
# Prints RMSE and the mean precision/recall at k, then returns the percentage of
# predictions whose rounded estimate equals the rounded true rating.
# NOTE: `predictions` comes from knn.fit(trainset), i.e. the training ratings,
# so these figures are in-sample; compare with the cross-validated RMSE above.
knn.get_accuracy(predictions)

RMSE: 0.1870
Precision: 0.72
Recall: 0.70


96.98890649762282

* best predictions

In [11]:
# The ten predictions with the smallest absolute error (first rows of df_pred sorted by 'err').
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,91cf68cc-9436-43e1-871d-33beef4d2337,Vistamar Cabernet Sauvignon Reserva,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",19,1,0.0
419,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Brunello di Montalcino Riserva Vigna Paganelli,4.0,4.0,"{'actual_k': 2, 'was_impossible': False}",10,2,0.0
420,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Brunello di Montalcino Riserva Vigna Paganelli,4.0,4.0,"{'actual_k': 2, 'was_impossible': False}",10,2,0.0
421,c60f0ff1-ed30-4df6-a612-b9ae7ca0814b,Chablis Saint Martin,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",2,1,0.0
423,d48fd3bd-06e9-4368-97b9-aae545388325,Chardonnay,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",1,9,0.0
424,8a5ddd23-7ed8-4ac0-9c5d-12ff3b908053,Malbec,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",2,1,0.0
425,8a5ddd23-7ed8-4ac0-9c5d-12ff3b908053,Altitud Chardonnay,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",2,1,0.0
417,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Ale Pinot Nero Alto Adige,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",10,1,0.0
426,c15fdd5b-3f75-42c3-9dde-5735828c88e7,Oltrepo pavese riesling poderi riccagioia,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",1,1,0.0
428,e336887b-883b-4d9a-8cd3-060c25856bfb,Tenuta Sant'Helena Pinot Grigio,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",3,1,0.0


* worst predictions

In [12]:
# The ten predictions with the largest absolute error (last rows of df_pred sorted by 'err').
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
412,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Pinot Grigio,3.0,3.666667,"{'actual_k': 3, 'was_impossible': False}",10,6,0.666667
365,bc8f3005-c2c6-4277-9fd7-340248f4e7ec,Franciacorta,5.0,4.333333,"{'actual_k': 3, 'was_impossible': False}",7,3,0.666667
366,bc8f3005-c2c6-4277-9fd7-340248f4e7ec,Franciacorta,5.0,4.333333,"{'actual_k': 3, 'was_impossible': False}",7,3,0.666667
486,2ae69fac-d491-412f-8164-ccaf59a230c2,Ronco del Balbo Merlot,4.0,4.75,"{'actual_k': 4, 'was_impossible': False}",6,4,0.75
140,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Amarone,4.0,3.198915,"{'actual_k': 6, 'was_impossible': False}",92,6,0.801085
361,bc8f3005-c2c6-4277-9fd7-340248f4e7ec,Franciacorta,3.0,4.333333,"{'actual_k': 3, 'was_impossible': False}",7,3,1.333333
263,014e4ed1-6f8b-4b25-917d-c167a2acca17,New Wine,1.0,2.5,"{'actual_k': 2, 'was_impossible': False}",2,3,1.5
264,014e4ed1-6f8b-4b25-917d-c167a2acca17,New Wine,4.0,2.5,"{'actual_k': 2, 'was_impossible': False}",2,3,1.5
194,4ed273f7-1816-4f36-88c7-789125f011c7,Moscato d'Asti,1.0,3.0,"{'actual_k': 2, 'was_impossible': False}",4,3,2.0
196,4ed273f7-1816-4f36-88c7-789125f011c7,Moscato d'Asti,5.0,3.0,"{'actual_k': 2, 'was_impossible': False}",4,3,2.0
