In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict
# Load the ratings table from CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine, so the notebook will
# not run elsewhere until the path is pointed at a local copy of the file.
# The file name suggests the ratings were upsampled with SMOTE -- TODO confirm.
data = pd.read_csv('/home/bbruno/all_here/python course/vinnie/data/cleaned_data/upsampled_df_smote.csv')
# Preview the first rows; the 'userId', 'wine' and 'rate' columns are the ones
# consumed later by Knn.create_reader.
data.head()

Unnamed: 0,userId,wine,rate
0,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso,3
1,df904a46-c461-4233-9d44-6ac11a8bbddc,Astrale,3
2,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Bricco Riva Bianca,3
3,bc79b0e3-064d-4240-86de-e86499f577e8,3/11 Syrah and blend #4,3
4,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso Classico Superiore Pojega,4


In [2]:
class Knn(KNNBasic):
    """KNNBasic with helpers for loading data, cross-validating and
    inspecting predictions.

    Attributes set by the methods below:
      data:   Surprise dataset built by ``create_reader``.
      bu, bi: baselines returned by ``compute_baselines`` during ``fit``.
    """

    def __init__(self, sim_options=None, bsl_options=None, **kwargs):
        """Create the algorithm.

        args:
          sim_options: similarity options dict forwarded to KNNBasic
            (``None`` means an empty dict).
          bsl_options: baseline options dict forwarded to KNNBasic
            (``None`` means an empty dict).
          **kwargs: any other keyword argument accepted by
            ``KNNBasic.__init__`` (forwarded untouched).
        """
        # A literal ``{}`` default is created once and shared by every
        # instance; build a fresh dict per instance instead so that options
        # written into it can never leak from one model to another.
        sim_options = {} if sim_options is None else sim_options
        bsl_options = {} if bsl_options is None else bsl_options
        KNNBasic.__init__(self, sim_options=sim_options, bsl_options=bsl_options, **kwargs)

    def create_reader(self, data, rating_scale=(1, 5)):
        """Build the Surprise dataset from a ratings DataFrame.

        args:
          data: DataFrame with 'userId', 'wine' and 'rate' columns.
          rating_scale: (min, max) rating scale given to the Reader.
        The resulting dataset is stored on ``self.data``.
        """
        reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    # option 1
    ########################################
    # for cross validation we have two functions, cross_validate and fit
    def cross_validate(self, measures=('RMSE',), cv=3, verbose=False):
        """Cross-validate this model on ``self.data``.

        args:
          measures: names of the accuracy measures to compute.
          cv: forwarded to Surprise's ``cross_validate`` (number of folds).
          verbose: forwarded to Surprise's ``cross_validate``.
        returns:
          the results dict produced by Surprise's ``cross_validate``.
        Prints the mean of each requested measure over the folds.
        """
        measures = list(measures)
        # Inside a method body the bare name resolves to the module-level
        # ``cross_validate`` imported from surprise, not to this method.
        results = cross_validate(self, self.data, measures=measures, cv=cv, verbose=verbose)
        for measure in measures:
            print(f'{measure}: {results["test_" + measure.lower()].mean()}')
        return results

    def fit(self, trainset):
        """Train on ``trainset`` and return predictions for that same trainset.

        args:
          trainset: a Surprise trainset.
        returns:
          the predictions for ``trainset.build_testset()``, i.e. for the
          ratings the model was just trained on (in-sample predictions).
        """
        # KNNBasic.fit already builds ``self.sim``; recomputing it here, as
        # the previous version did, only doubled the training cost.
        predictions = KNNBasic.fit(self, trainset).test(trainset.build_testset())
        self.bu, self.bi = self.compute_baselines()
        return predictions
    ########################################
    # option 2
    ########################################
    # # fit function that works without cross validation
    # def fit (self):
    #     self.trainset, testset = train_test_split(self.data, test_size=0.2)
    #     predictions = KNNBasic.fit(self, self.trainset).test(testset)
    #     self.sim = self.compute_similarities()
    #     self.bu, self.bi = self.compute_baselines()
    #     return predictions
    ########################################

    def estimated(self, u, i, n_neighbors=5):
        """Print the nearest neighbors of a user for an item and return the
        baseline estimate.

        args:
          u: inner user id (an index into ``self.sim`` / ``self.bu``).
          i: inner item id (an index into ``self.trainset.ir`` / ``self.bi``).
          n_neighbors: how many of the most similar users to print.
        returns:
          the baseline estimate ``global_mean + bu[u] + bi[i]``.
        raises:
          PredictionImpossible: if the user or the item is not in the trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        # Similarity between u and every user v that has rated item i, most
        # similar first.
        # NOTE(review): indexing ``self.sim`` with user ids presumes a
        # user-based similarity matrix -- confirm before using with
        # ``user_based: False``.
        neighbors = sorted(
            ((v, self.sim[u, v]) for (v, _) in self.trainset.ir[i]),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print(f"The {n_neighbors} nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:n_neighbors]:
            print(f"user {v} with sim {sim_uv:1.15f}")

        # ... and report the baseline estimate anyway. It is now returned as
        # well; the previous ``return print(...)`` always returned None.
        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        print(f"And the baseline estimate is: {bsl}")
        return bsl

    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the raw id of the user
        returns:
          the number of items rated by the user (0 if the user is unknown)
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item (0 if the item is unknown)
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    def inspect_predictions(self, predictions):
        """Print a column legend and return the predictions as a DataFrame.

        args:
          predictions: iterable of 5-field prediction records
            (uid, iid, rui, est, details).
        returns:
          DataFrame with those five columns plus 'Iu', 'Ui' and 'err'
          (absolute difference between 'est' and 'rui').
        """
        legend = (
            "uid means the user id and iid means the wine id",
            "rui means the actual rating and est means the estimated rating",
            "err means the error between the actual and the estimated rating",
            "Iu means the number of items rated by given user",
            "Ui means the number of users that have rated given item",
        )
        for line in legend:
            print(line + "\n")

        df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df_pred['Iu'] = df_pred['uid'].apply(self.get_Iu)
        df_pred['Ui'] = df_pred['iid'].apply(self.get_Ui)
        df_pred['err'] = (df_pred['est'] - df_pred['rui']).abs()
        return df_pred

    def get_accuracy(self, predictions, k=10, threshold=3.5):
        """Print RMSE, mean precision@k and mean recall@k, and return the
        share of exactly-matching rounded predictions.

        args:
          predictions: iterable of (uid, iid, true_r, est, details) records.
          k: cut-off forwarded to ``precision_recall_at_k``.
          threshold: relevance threshold forwarded to ``precision_recall_at_k``.
        returns:
          percentage (0-100) of predictions where ``round(est)`` equals
          ``round(true_r)``.
        """
        # Compute RMSE (printed by Surprise because verbose=True)
        accuracy.rmse(predictions, verbose=True)

        # Per-user precision and recall, then averaged over all users
        precisions, recalls = self.precision_recall_at_k(predictions, k=k, threshold=threshold)
        precision = sum(precisions.values()) / len(precisions)
        recall = sum(recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        # Count predictions whose rounded estimate matches the rounded rating.
        # NOTE: Python's round() sends exact halves to the nearest even
        # integer (round(2.5) == 2, round(3.5) == 4).
        correct = sum(
            1 for _, _, true_r, est, _ in predictions
            if round(est) == round(true_r)
        )

        accuracy_percentage = correct / len(predictions)
        return accuracy_percentage * 100

    @staticmethod
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user

        args:
          predictions: iterable of (uid, iid, true_r, est, details) records.
          k: only the k highest-estimated items of each user are considered
            as recommendations.
          threshold: a rating (true or estimated) >= threshold counts as
            relevant / recommended.
        returns:
          (precisions, recalls): two dicts keyed by uid.
        """

        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():

            # Sort user ratings by estimated value
            user_ratings.sort(key=lambda x: x[0], reverse=True)

            # Number of relevant items
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in user_ratings[:k]
            )

            # Precision@K: Proportion of recommended items that are relevant
            # When n_rec_k is 0, Precision is undefined. We here set it to 0.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: Proportion of relevant items that are recommended
            # When n_rel is 0, Recall is undefined. We here set it to 0.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls
    
    # My earlier get_accuracy (which worked) was extended to add precision and recall; the old version is kept commented out below.
    # def get_accuracy(self, predictions):
    #     # # Compute RMSE
    #     # predictions = KNNBasic.test(self, testset)
    #     accuracy.rmse(predictions, verbose=True)
    #     # Count correct predictions
    #     correct = 0
    #     for uid, iid, true_r, est, _ in predictions:
    #         if round(est) == round(true_r):
    #             correct += 1

    #     # Compute accuracy
    #     accuracy_percentage = correct / len(predictions)
    #     return accuracy_percentage * 100

Here's a brief explanation of each step:

1. `knn.create_reader(data)`: This wraps the `userId`, `wine` and `rate` columns of your DataFrame in a Surprise `Dataset` and stores it on `knn.data`.
2. `knn.cross_validate(cv=3)`: This performs cross-validation on your data with 3 folds. It's good to do this before training your model to get an idea of how well it might perform.
3. `trainset = knn.data.build_full_trainset()`: This builds the full trainset from your data.
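4. `predictions = knn.fit(trainset)`: This trains your model on the full trainset and generates predictions for those same training ratings, so any metric computed from them is in-sample and will look better than the cross-validated score.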
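5. `knn.estimated(147, 355)`: This prints the (up to five) most similar users among those who rated item 355, followed by the baseline estimate for user 147 on that item. Both arguments are Surprise inner ids, not raw `userId`/`wine` values.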
6. `df_pred = knn.inspect_predictions(predictions)`: This inspects your predictions.
7. `df_pred.head(10)`: This displays the first 10 rows of your predictions.

For your next steps, you might want to:
- Check the accuracy of your model
- Try different values for `cv` to see how it affects your model's performance
- Experiment with different parameters for your KNN model to see if you can improve its performance.

In [3]:
# Build the model: pearson_baseline similarity with user_based=True, and
# baselines fitted with the 'sgd' method.
# NOTE(review): in the Surprise baseline docs 'reg_u' / 'reg_i' are listed as
# ALS options, while 'sgd' reads 'reg' -- confirm these two keys actually
# have an effect with method 'sgd'.
knn = Knn(
    sim_options={
        'name': 'pearson_baseline',
        'user_based': True,
    },
    bsl_options={
        'method': 'sgd',
        'learning_rate': 0.00005,
        'n_epochs': 20,
        'reg_u': 12,
        'reg_i': 5,
    },
)

In [13]:
# Wrap the ratings DataFrame in a Surprise dataset; it is stored on knn.data.
knn.create_reader(data)

In [5]:
# Cross-validate with cv=3: prints the mean RMSE over the folds and returns
# the per-fold results dict (displayed below as the cell output).
knn.cross_validate(cv=3)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.3369719792067294


{'test_rmse': array([1.37748969, 1.40640654, 1.2270197 ]),
 'fit_time': (0.02533745765686035, 0.013176918029785156, 0.011109352111816406),
 'test_time': (0.0042591094970703125,
  0.0023162364959716797,
  0.0014386177062988281)}

In [6]:
# Build a trainset from the whole dataset (no hold-out split).
trainset = knn.data.build_full_trainset()
# Knn.fit returns predictions for trainset.build_testset(), i.e. for the very
# ratings it was trained on, so everything derived from `predictions` below is
# an in-sample result rather than a generalisation estimate.
predictions = knn.fit(trainset)
# knn.cross_validate(knn.data)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [7]:
# predictions = knn.fit()

In [8]:
# Show the nearest neighbors and the baseline estimate for inner user id 147
# and inner item id 355, then print the global mean for comparison.
knn.estimated(147, 355)
print(f"If the baseline is {knn.trainset.global_mean} then the value is a default value")

The 5 nearest neighbors of user 147 are:
user 8 with sim 0.000000000000000
user 131 with sim 0.000000000000000
user 104 with sim 0.000000000000000
And the baseline estimate is: 2.9153776190875744
If the baseline is 2.9160545645330536 then the value is a default value


In [9]:
# Tabulate the predictions, then keep the ten rows with the smallest and the
# ten rows with the largest absolute error for the cells further down.
df_pred = knn.inspect_predictions(predictions)
best_pred = df_pred.sort_values(by='err').head(10)
worst_pred = df_pred.sort_values(by='err').tail(10)
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",84,3,0.0
1,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso Classico Superiore Pojega,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",84,1,0.0
2,5ccd8030-047b-432c-a630-d784ab415756,Côtes du Rhône Cuvée Pierre Perrin,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",84,1,0.0
3,5ccd8030-047b-432c-a630-d784ab415756,Soave classico,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",84,2,0.0
4,5ccd8030-047b-432c-a630-d784ab415756,Randersacker Sonnenstuhl Silvaner,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",84,2,0.0
5,5ccd8030-047b-432c-a630-d784ab415756,Frühburgunder,4.0,3.5,"{'actual_k': 2, 'was_impossible': False}",84,2,0.5
6,5ccd8030-047b-432c-a630-d784ab415756,Sauvignon Blanc,3.0,3.516469,"{'actual_k': 6, 'was_impossible': False}",84,6,0.516469
7,5ccd8030-047b-432c-a630-d784ab415756,Baden-Badener Spätburgunder,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",84,1,0.0
8,5ccd8030-047b-432c-a630-d784ab415756,Acinaia,2.0,2.0,"{'actual_k': 1, 'was_impossible': False}",84,1,0.0
9,5ccd8030-047b-432c-a630-d784ab415756,Blanc de Noir,4.0,3.5,"{'actual_k': 2, 'was_impossible': False}",84,2,0.5


In [10]:
# Prints RMSE and the mean precision/recall, and returns the percentage of
# predictions whose rounded estimate equals the rounded true rating.
# `predictions` were produced by Knn.fit on the training ratings themselves,
# so these figures are in-sample.
knn.get_accuracy(predictions)

RMSE: 0.1422
Precision: 0.76
Recall: 0.74


97.79643231899266

* best predictions

In [11]:
# The ten predictions with the smallest absolute error ('err').
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,5ccd8030-047b-432c-a630-d784ab415756,Valpolicella Ripasso,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",84,3,0.0
614,e336887b-883b-4d9a-8cd3-060c25856bfb,Chateau coufran,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",6,1,0.0
615,e336887b-883b-4d9a-8cd3-060c25856bfb,Tenuta Sant'Helena Pinot Grigio,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",6,1,0.0
616,e336887b-883b-4d9a-8cd3-060c25856bfb,Casillero del diabolo merlot Chile,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",6,2,0.0
617,e336887b-883b-4d9a-8cd3-060c25856bfb,Montepulciano 60,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",6,4,0.0
618,e336887b-883b-4d9a-8cd3-060c25856bfb,Zinfandel,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",6,4,0.0
619,e336887b-883b-4d9a-8cd3-060c25856bfb,Montebruna Barbera d'Asti,1.0,1.0,"{'actual_k': 1, 'was_impossible': False}",6,2,0.0
620,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Le Relais de Patache d'Aux,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",30,1,0.0
621,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Quintessence Montagny 1er Cru,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",30,2,0.0
622,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Mercurey 'Clos les Bussières',4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",30,1,0.0


* worst predictions

In [12]:
# The ten predictions with the largest absolute error ('err'), in ascending order.
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
266,76b71bb0-6cc9-4168-8a45-bbafc1e9a256,Pinot Grigio,4.0,3.498264,"{'actual_k': 4, 'was_impossible': False}",11,6,0.501736
6,5ccd8030-047b-432c-a630-d784ab415756,Sauvignon Blanc,3.0,3.516469,"{'actual_k': 6, 'was_impossible': False}",84,6,0.516469
418,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Nativi Gewürtztraminer,1.0,1.639255,"{'actual_k': 8, 'was_impossible': False}",64,8,0.639255
416,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Nativi Gewürtztraminer,1.0,1.639255,"{'actual_k': 8, 'was_impossible': False}",64,8,0.639255
100,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Barolo,2.0,2.666667,"{'actual_k': 3, 'was_impossible': False}",117,3,0.666667
538,bc8f3005-c2c6-4277-9fd7-340248f4e7ec,Franciacorta,4.0,4.666667,"{'actual_k': 3, 'was_impossible': False}",10,3,0.666667
593,2ae69fac-d491-412f-8164-ccaf59a230c2,Ronco del Balbo Merlot,4.0,4.75,"{'actual_k': 4, 'was_impossible': False}",6,5,0.75
405,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Nativi Gewürtztraminer,3.0,1.639255,"{'actual_k': 8, 'was_impossible': False}",64,8,1.360745
656,4ed273f7-1816-4f36-88c7-789125f011c7,Moscato d'Asti,5.0,3.000463,"{'actual_k': 3, 'was_impossible': False}",11,4,1.999537
654,4ed273f7-1816-4f36-88c7-789125f011c7,Moscato d'Asti,1.0,3.000463,"{'actual_k': 3, 'was_impossible': False}",11,4,2.000463
