In [116]:
from collections import defaultdict

import pandas as pd

from surprise import Dataset
from surprise import NormalPredictor, KNNWithMeans, SVD
from surprise import accuracy
from surprise.model_selection import cross_validate

from surprise.model_selection import train_test_split


# From examples/top_n_recommendations.py. Slightly modified
def get_top_n(predictions, n=10):
    """Build the top-N recommendations for every user found in ``predictions``.

    Args:
        predictions: Iterable of 5-item prediction records, unpacked as
            ``(uid, iid, true_r, est, details)`` -- e.g. the list returned
            by the test method of an algorithm.
        n(int): Maximum number of items kept per user. Default is 10.

    Returns:
        A dict keyed by user (raw) id. Each value is itself a dict mapping
        raw item id -> rating estimation, holding at most ``n`` entries
        inserted in descending order of estimation.
    """

    # Group the (item, estimation) pairs under the user they belong to.
    per_user = defaultdict(list)
    for record in predictions:
        uid, iid, _true_r, est, _details = record
        per_user[uid].append((iid, est))

    # Rank every user's items by estimation, highest first, keep the first
    # n of them and turn the surviving pairs into {item id: estimation}.
    return {
        uid: dict(sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n])
        for uid, pairs in per_user.items()
    }


# See examples/precision_recall_at_k.py
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    An item counts as *relevant* when its true rating is at least
    ``threshold`` and as *recommended* when its estimated rating is at
    least ``threshold``.

    Args:
        predictions: Iterable of 5-item prediction records, unpacked as
            ``(uid, iid, true_r, est, details)``.
        k(int): How many of the highest-estimated items are inspected for
            each user. Default is 10.
        threshold(float): Rating cut-off for relevance/recommendation.
            Default is 3.5.

    Returns:
        A ``(precisions, recalls)`` pair of dicts, both keyed by user id.
    """

    # Collect (estimation, true rating) pairs per user.
    ratings_by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        ratings_by_user[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in ratings_by_user.items():
        # Highest estimations first; only the first k are "recommended".
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's ratings.
        relevant_total = sum((actual >= threshold) for (_, actual) in ranked)

        # Recommended items, and those also relevant, within the top k.
        recommended = sum((guess >= threshold) for (guess, _) in top_k)
        hits = sum(((actual >= threshold) and (guess >= threshold))
                   for (guess, actual) in top_k)

        # Precision@K is undefined when nothing is recommended -> use 0.
        if recommended != 0:
            precisions[uid] = hits / recommended
        else:
            precisions[uid] = 0

        # Recall@K is undefined when nothing is relevant -> use 0.
        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls


def estimate(name, predictor, dataset, trainset, testset, n_cv):
    """
    Fits, tests and cross-validates given predictor using provided sets

    Args:
        name: Label printed in the progress banner.
        predictor: Algorithm object exposing ``fit`` and ``test``.
        dataset: Full dataset handed to ``cross_validate``.
        trainset: Set the predictor is fitted on.
        testset: Set the fitted predictor is evaluated on.
        n_cv: Value passed as ``cv`` to ``cross_validate``.

    Returns:
        A ``(rmse, cv, predictions)`` tuple: the RMSE on ``testset``, the
        result of ``cross_validate`` and the predictions made on ``testset``.
    """
    import copy

    print("======\t\t\tEstimating", name, "\t\t\t======")

    # cross_validate() fits the algorithm object it is given on each fold.
    # Cross-validating the caller's object would therefore overwrite the
    # model fitted on `trainset`, so a copy taken before fitting is used and
    # `predictor` stays consistent with the returned `predictions`.
    cv_predictor = copy.deepcopy(predictor)

    predictor.fit(trainset)
    predictions = predictor.test(testset)
    rmse = accuracy.rmse(predictions)
    cv = cross_validate(cv_predictor, dataset, measures=['RMSE'], cv=n_cv, verbose=True)

    return rmse, cv, predictions


def best_rmse(dataset, trainset, testset):
    """
    Evaluates a fixed set of candidate predictors and picks the one with the
    lowest RMSE on the test set.

    Returns the winning predictor object together with the predictions it
    produced for ``testset``.
    """
    neighbours = 30
    folds = 5
    candidates = {
        # Random predictions drawn from a normal distribution
        "Normal": NormalPredictor(),
        # kNN with cosine similarity
        "kNN Cosine": KNNWithMeans(k=neighbours, sim_options={'name': 'cosine'}),
        # kNN with the library's default similarity options
        "kNN MSD": KNNWithMeans(k=neighbours),
        # kNN with the 'pearson_baseline' similarity
        "kNN Pearson": KNNWithMeans(k=neighbours, sim_options={'name': 'pearson_baseline'}),
        # Matrix factorisation
        "SVD": SVD(),
    }

    # Each entry is the (rmse, cv, predictions) tuple returned by estimate().
    outcomes = {
        label: estimate(label, algo, dataset, trainset, testset, folds)
        for label, algo in candidates.items()
    }

    # Lowest RMSE wins; on a tie the candidate declared first is kept.
    winner = min(candidates, key=lambda label: outcomes[label][0])
    print("And the winner is...", winner + "!")
    return candidates[winner], outcomes[winner][2]


def calculate_pr(predictions, k, threshold, roundto=3):
    """
    Calculates precision and recall values (rounding to roundto)

    The per-user precision@k and recall@k values are averaged over all
    users present in ``predictions``.

    Args:
        predictions: Prediction records forwarded to precision_recall_at_k.
        k: Number of top-ranked items considered per user.
        threshold: Rating cut-off forwarded to precision_recall_at_k.
        roundto: Number of decimal digits kept in the result. Default is 3.

    Returns:
        A ``(precision, recall)`` tuple of averaged, rounded values. When
        ``predictions`` contains no users both values are 0.0.
    """
    precision, recall = precision_recall_at_k(predictions, k=k, threshold=threshold)

    # Both dicts share the same user keys. With no users the average is
    # undefined; report 0.0 (the same convention precision_recall_at_k uses
    # for undefined values) instead of raising ZeroDivisionError.
    if not precision:
        return 0.0, 0.0

    precision_val = round(sum(precision.values()) / len(precision), roundto)
    recall_val = round(sum(recall.values()) / len(recall), roundto)

    return precision_val, recall_val


def top_n_for_user(user_id, predictions, n=5):
    """
    Looks up a single user's entry in the top-N recommendations built from
    ``predictions``.

    Raises KeyError when ``user_id`` does not occur in ``predictions``.
    """
    return get_top_n(predictions, n=n)[user_id]

def get_movie_names(ids, items_file='~/.surprise_data/ml-100k/ml-100k/u.item'):
    """
    Extracts movie info by id.

    Args:
        ids: Iterable of raw movie ids (anything ``int()`` accepts).
        items_file: Path of the '|'-separated items file. Defaults to the
            location the built-in ml-100k dataset was read from originally.

    Returns:
        A dict mapping every given id (unchanged) to the pandas Series
        holding that movie's full row of the items file.

    Raises:
        KeyError: If an id is not present in the first column of the file.
    """
    items = pd.read_csv(items_file, sep='|', header=None, encoding='ISO-8859-1')
    # The first column of u.item is the movie id, which starts at 1. Look
    # rows up by that id instead of by 0-based position, which returned the
    # following movie and overflowed on the last id. The column is kept so
    # the row layout seen by callers does not change.
    by_id = items.set_index(0, drop=False)
    return {i: by_id.loc[int(i)] for i in ids}


def main(user_id="2", acc=3):
    """
    Runs the whole pipeline: picks the best predictor by RMSE, reports
    precision/recall and stores the top recommendations for one user.

    Args:
        user_id: Raw user id (string) to produce recommendations for.
        acc: Number of decimal digits used for precision and recall.

    The recommendations are printed and written to ``output.txt``.
    """
    # Load dataset
    data = Dataset.load_builtin('ml-100k')
    # Prepare sets
    trainset, testset = train_test_split(data, test_size=.25)

    # Determine best predictor based on RMSE metric
    _predictor, predictions = best_rmse(data, trainset, testset)

    # Calculate and print precision and recall
    precision_k, recall_k = calculate_pr(predictions, 5, 3.52, roundto=acc)
    print("Calculated Precision@k:", precision_k, " | Recall@k:", recall_k)

    # Predict movies for the user
    predicted = top_n_for_user(user_id, predictions)

    # Resolve everything before touching the output file, so a failed lookup
    # does not leave an empty/truncated file behind.
    names = get_movie_names(predicted.keys())
    result = [
        # (movie id, (second and third column of the movie row), estimation)
        (int(k), tuple(names[k].iloc[1:3]), round(v, 3))
        for k, v in predicted.items()
    ]

    # Save prediction to the file
    with open("output.txt", "w", encoding="utf-8") as f:
        line = "User " + user_id + "\n"
        print(line)
        f.write(line)
        for i in result:
            line = "\t" + "\t".join(map(str, i))
            print(line)
            f.write(line + "\n")


In [81]:
# Run the pipeline with default arguments only when this module is the
# program entry point.
if __name__ == "__main__":
    main()

In [117]:
# Notebook cell: run the pipeline with the default arguments.
main()

RMSE: 1.5208
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5176  1.5113  1.5210  1.5128  1.5138  1.5153  0.0035  
Fit time          0.09    0.10    0.10    0.09    0.10    0.09    0.00    
Test time         0.11    0.11    0.11    0.22    0.11    0.13    0.04    
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9617
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9623