In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, accuracy, PredictionImpossible
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

# Load the ratings table (the columns used later are userId, wine and rate).
# NOTE(review): DATA_PATH is an absolute, machine-specific location; change it
# here (one place) when running the notebook on another machine.
DATA_PATH = '/home/bbruno/all_here/python course/vinnie/data/cleaned_data/df_oversamling.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,userId,wine,rate
0,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,Merlot le vigne di zamo,1
1,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,le vigne di zamo refosco,1
2,bf82194f-6108-4524-9cef-69c0231ac74b,Nerello mascalese,1
3,e5b0e77a-ed45-4436-a771-6e01c5782973,Campofiorin Rosso Veronese,1
4,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,le vigne di zamo refosco,1


In [2]:
class Knn(KNNBasic):
    """``KNNBasic`` wrapper for the wine ratings data.

    Adds helpers to load a ratings DataFrame, hold out a random test split,
    fit on the remainder and inspect / score the resulting predictions.

    Typical use: ``create_reader(df)`` -> ``fit()`` -> ``inspect_predictions``
    and ``get_accuracy`` on the predictions returned by ``fit``.
    """

    def __init__(self, sim_options=None, bsl_options=None):
        """Create the algorithm.

        args:
          sim_options: similarity options forwarded to ``KNNBasic``
            (``None`` means an empty dict)
          bsl_options: baseline options forwarded to ``KNNBasic``
            (``None`` means an empty dict)
        """
        # Build a fresh dict per instance: a ``{}`` default in the signature is
        # a single object shared by every instance created without options.
        KNNBasic.__init__(
            self,
            sim_options={} if sim_options is None else sim_options,
            bsl_options={} if bsl_options is None else bsl_options,
        )

    def create_reader(self, data, rating_scale=(1, 5)):
        """Wrap a ratings DataFrame in a Surprise dataset stored on ``self.data``.

        args:
          data: DataFrame containing the columns 'userId', 'wine' and 'rate'
          rating_scale: (min, max) of the rating values, (1, 5) by default
        """
        reader = Reader(rating_scale=rating_scale)
        self.data = Dataset.load_from_df(data[['userId', 'wine', 'rate']], reader)

    def fit(self, test_size=0.2, random_state=None):
        """Split ``self.data``, train on the train part and predict the test part.

        NOTE: unlike ``KNNBasic.fit(trainset)``, this method takes no trainset
        and returns the test predictions rather than ``self``; helpers that call
        ``algo.fit(trainset)`` themselves (e.g. Surprise's ``cross_validate``)
        presumably cannot be used with this class - verify before relying on it.

        NOTE(review): if ``self.data`` contains repeated rows, a random split can
        put copies of the same rating in both parts and flatter the metrics - confirm.

        args:
          test_size: share (or number) of ratings held out for testing
          random_state: seed forwarded to ``train_test_split``; pass an int for a
            reproducible split
        returns:
          the list of predictions for the held-out test set
        """
        if not hasattr(self, 'data'):
            raise AttributeError("No dataset loaded: call create_reader(data) before fit().")

        self.trainset, testset = train_test_split(
            self.data, test_size=test_size, random_state=random_state
        )
        # KNNBasic.fit already stores the similarity matrix in self.sim, so it
        # is not computed a second time here.
        predictions = KNNBasic.fit(self, self.trainset).test(testset)
        # Baselines are needed by estimated(); make sure they are available.
        self.bu, self.bi = self.compute_baselines()
        return predictions

    def estimated(self, u, i):
        """Print the nearest neighbors of a user for an item and the baseline estimate.

        The similarity matrix is indexed with user ids, so this assumes the model
        was fitted with user-based similarities.

        args:
          u: inner id of the user
          i: inner id of the item
        returns:
          the baseline estimate ``global_mean + bu[u] + bi[i]``
        raises:
          PredictionImpossible: if the user or the item is not in the trainset
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        # Similarity between u and every user v that rated item i,
        # most similar first.
        neighbors = sorted(
            ((v, self.sim[u, v]) for (v, _) in self.trainset.ir[i]),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print("The 5 nearest neighbors of user", str(u), "are:")
        for v, sim_uv in neighbors[:5]:
            print(f"user {v} with sim {sim_uv:1.2f}")

        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        print(f"And the baseline estimate is: {bsl}")
        # Return the value itself (the original returned the result of print(),
        # i.e. always None) so callers can use it.
        return bsl

    def get_Iu(self, uid):
        """Return the number of items rated by given user
        args:
          uid: the raw id of the user
        returns:
          the number of items rated by the user (0 if the user is not in the trainset)
        """
        try:
            return len(self.trainset.ur[self.trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(self, iid):
        """Return the number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item (0 if the item is not in the trainset)
        """
        try:
            return len(self.trainset.ir[self.trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    def inspect_predictions(self, predictions):
        """Print a legend and return the predictions as a DataFrame.

        args:
          predictions: predictions as returned by ``fit``
        returns:
          DataFrame with the columns uid, iid, rui, est, details plus
          Iu (items rated by the user), Ui (users that rated the item) and
          err (absolute difference between est and rui)
        """
        legend = (
            "uid means the user id and iid means the wine id",
            "rui means the actual rating and est means the estimated rating",
            "err means the error between the actual and the estimated rating",
            "Iu means the number of items rated by given user",
            "Ui means the number of users that have rated given item",
        )
        for line in legend:
            # The extra newline keeps a blank line after every legend entry.
            print(line + "\n")

        df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df_pred['Iu'] = df_pred.uid.apply(self.get_Iu)
        df_pred['Ui'] = df_pred.iid.apply(self.get_Ui)
        df_pred['err'] = abs(df_pred.est - df_pred.rui)
        return df_pred

    def get_accuracy(self, predictions, k=10, threshold=3.5):
        """Print RMSE, mean precision@k and recall@k, and return the hit rate.

        args:
          predictions: predictions as returned by ``fit``
          k: number of top estimated items considered per user
          threshold: rating from which an item counts as relevant / recommended
        returns:
          percentage (0-100) of predictions whose rounded estimate equals the
          rounded true rating
        """
        # Compute RMSE (printed by surprise because verbose=True)
        accuracy.rmse(predictions, verbose=True)

        # Precision and recall are computed per user, then averaged over users
        precisions, recalls = self.precision_recall_at_k(predictions, k=k, threshold=threshold)
        precision = sum(precisions.values()) / len(precisions)
        recall = sum(recalls.values()) / len(recalls)
        print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

        # A prediction is "correct" when it rounds to the same rating as the truth
        correct = sum(
            1 for _, _, true_r, est, _ in predictions if round(est) == round(true_r)
        )
        return correct / len(predictions) * 100

    @staticmethod
    def precision_recall_at_k(predictions, k=10, threshold=3.5):
        """Return precision and recall at k metrics for each user
        args:
          predictions: iterable of (uid, iid, true_r, est, details) tuples
          k: number of top estimated items considered per user
          threshold: rating from which an item counts as relevant / recommended
        returns:
          (precisions, recalls): two dicts keyed by user id
        """

        # First map the predictions to each user.
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():

            # Sort user ratings by estimated value
            user_ratings.sort(key=lambda x: x[0], reverse=True)

            # Number of relevant items
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            # Number of recommended items in top k
            n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

            # Number of relevant and recommended items in top k
            n_rel_and_rec_k = sum(
                ((true_r >= threshold) and (est >= threshold))
                for (est, true_r) in user_ratings[:k]
            )

            # Precision@K: Proportion of recommended items that are relevant
            # When n_rec_k is 0, Precision is undefined. We here set it to 0.
            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            # Recall@K: Proportion of relevant items that are recommended
            # When n_rel is 0, Recall is undefined. We here set it to 0.
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls

In [3]:
# User-based KNN with the pearson_baseline similarity; the baselines it needs
# are estimated with SGD using the options below.
knn = Knn(
    sim_options={
        'name': 'pearson_baseline',
        'user_based': True,
    },
    bsl_options={
        'method': 'sgd',
        'learning_rate': 0.00005,
        'n_epochs': 20,
        'reg_u': 12,
        'reg_i': 5,
    },
)

In [4]:
# Wrap the ratings DataFrame (userId, wine, rate) in a Surprise dataset kept on knn.data.
knn.create_reader(data)

In [5]:
# Hold out 20% of the ratings, train on the rest and predict the held-out part.
# NOTE(review): no seed is given for the split, so the numbers below presumably
# change from run to run - confirm.
predictions = knn.fit()

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [6]:
# Inspect one (user, item) pair.
# NOTE(review): 140 and 10 are passed straight to trainset.knows_user / knows_item,
# so they look like Surprise inner ids rather than raw userId / wine values - confirm.
knn.estimated(140, 10)
# The baseline is global_mean + user bias + item bias; a value equal to the
# global mean means no bias was added.
print("If the baseline is {} then the value is a default value".format(knn.trainset.global_mean))

The 5 nearest neighbors of user 140 are:
user 0 with sim 0.00
user 0 with sim 0.00
user 0 with sim 0.00
And the baseline estimate is: 3.0150966155431904
If the baseline is 3.0150918635170605 then the value is a default value


In [7]:
# Build the per-prediction inspection table, then rank it by absolute error.
df_pred = knn.inspect_predictions(predictions)
# Sort once and slice both ends instead of sorting the same frame twice.
df_pred_by_err = df_pred.sort_values(by='err')
best_pred = df_pred_by_err.head(10)   # ten smallest errors
worst_pred = df_pred_by_err.tail(10)  # ten largest errors
df_pred.head(10)

uid means the user id and iid means the wine id

rui means the actual rating and est means the estimated rating

err means the error between the actual and the estimated rating

Iu means the number of items rated by given user

Ui means the number of users that have rated given item



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,7ed7fb0c-884d-46bf-a14a-205474a0949d,Bardolino,2.0,2.0,"{'actual_k': 4, 'was_impossible': False}",8,5,0.0
1,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Il Mattaglio Blanc de Noirs,5.0,5.0,"{'actual_k': 4, 'was_impossible': False}",214,4,0.0
2,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,Daphne,2.0,2.0,"{'actual_k': 3, 'was_impossible': False}",130,3,0.0
3,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Bitornino,5.0,5.0,"{'actual_k': 2, 'was_impossible': False}",214,2,0.0
4,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Rheinhessen Kabinett Riesling,1.0,1.0,"{'actual_k': 21, 'was_impossible': False}",80,21,0.0
5,9250d444-805f-4756-a03b-93a597ab320d,Domaine des Granges de Mirabel Coteaux-de-l'Ar...,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",1,0,1.984908
6,3b1f7bc8-6a23-47e5-97df-043bd1b97f12,White cava,2.0,2.0,"{'actual_k': 4, 'was_impossible': False}",4,4,0.0
7,5ccd8030-047b-432c-a630-d784ab415756,Axel Rosė - Gesellenstück,4.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",149,0,0.9849081
8,dc90e649-113d-4cec-983e-d03a7d4acff3,Cabernet Sauvignon,5.0,5.0,"{'actual_k': 6, 'was_impossible': False}",1,9,8.881784e-16
9,3b1f7bc8-6a23-47e5-97df-043bd1b97f12,White cava,2.0,2.0,"{'actual_k': 4, 'was_impossible': False}",4,4,0.0


In [8]:
# Prints RMSE and the mean precision / recall at k=10 (threshold 3.5); the cell value
# is the percentage of predictions whose rounded estimate equals the rounded true rating.
knn.get_accuracy(predictions)

RMSE: 0.5910
Precision: 0.33
Recall: 0.28


74.01574803149606

* best predictions


In [9]:
# The ten held-out predictions with the smallest absolute error.
best_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,7ed7fb0c-884d-46bf-a14a-205474a0949d,Bardolino,2.0,2.0,"{'actual_k': 4, 'was_impossible': False}",8,5,0.0
202,2a62eecd-b0cd-4395-9f8c-7c912a208be2,Grande reserve,5.0,5.0,"{'actual_k': 2, 'was_impossible': False}",67,2,0.0
203,ecc462e1-5041-43ca-94c3-c2bfc9b3215e,Ribolla Gialla,2.0,2.0,"{'actual_k': 2, 'was_impossible': False}",130,6,0.0
206,09f6ca6e-905f-4afd-bab2-d928fe046f18,Vitovska legno,4.0,4.0,"{'actual_k': 1, 'was_impossible': False}",6,1,0.0
207,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Pinot grigio,2.0,2.0,"{'actual_k': 11, 'was_impossible': False}",214,12,0.0
208,2a62eecd-b0cd-4395-9f8c-7c912a208be2,Mcguigan black label sauvignon blanc,5.0,5.0,"{'actual_k': 1, 'was_impossible': False}",67,1,0.0
211,f8c400a6-ded1-40b8-95d2-a263d05c30ce,Rosso d’Asia,3.0,3.0,"{'actual_k': 1, 'was_impossible': False}",214,1,0.0
212,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Mendoza Chardonnay,2.0,2.0,"{'actual_k': 5, 'was_impossible': False}",80,5,0.0
213,bc79b0e3-064d-4240-86de-e86499f577e8,3／4 大亂鬥 6,1.0,1.0,"{'actual_k': 15, 'was_impossible': False}",117,15,0.0
216,7f6dede6-b8b5-4bb6-a1ca-ae58c79c0ea7,Groot Constantia,2.0,2.0,"{'actual_k': 5, 'was_impossible': False}",80,5,0.0


* worst predictions

In [10]:
# The ten held-out predictions with the largest absolute error.
worst_pred

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
215,36fec3ea-e183-491d-b65a-6c7ac90fafa5,Inganno 572,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",88,0,1.984908
154,e2723dcb-0b44-477f-bf74-678640f5c06d,Terrano,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",0,41,1.984908
99,bc79b0e3-064d-4240-86de-e86499f577e8,4/15 新世界紅 2 RAka Pinotage,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",117,0,1.984908
372,2a62eecd-b0cd-4395-9f8c-7c912a208be2,Dark (Red Blend),5.0,3.015092,"{'was_impossible': True, 'reason': 'Not enough...",67,1,1.984908
332,1b02b02d-e152-4b2e-a276-56fcc4069250,Riesling,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",0,3,1.984908
5,9250d444-805f-4756-a03b-93a597ab320d,Domaine des Granges de Mirabel Coteaux-de-l'Ar...,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",1,0,1.984908
11,2a62eecd-b0cd-4395-9f8c-7c912a208be2,Dark (Red Blend),5.0,3.015092,"{'was_impossible': True, 'reason': 'Not enough...",67,1,1.984908
343,4313b57d-e3f7-495e-bcfd-ff5c1bf15fcc,P’tit Piaf Rouge,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",2,0,1.984908
330,2a62eecd-b0cd-4395-9f8c-7c912a208be2,Dark (Red Blend),5.0,3.015092,"{'was_impossible': True, 'reason': 'Not enough...",67,1,1.984908
246,2f3064a7-12af-43d8-90fd-6c4555830fcc,Nuovo Vino,5.0,3.015092,"{'was_impossible': True, 'reason': 'User and/o...",0,0,1.984908
