In [70]:
import os
import pickle
import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Notebook-wide configuration. The paths are relative, so they resolve against the
# kernel's working directory.
RAW_PATH = "../data/raw/"          # directory the u.item file is read from
INTERIM_PATH = "../data/interim/"  # directory svd_dataset.csv is read from
MODELS_PATH = "../models/"         # directory the pickled model is written to
SEED = 42                          # random_state for the train/test split and the SVD model

# Data preparation

In [64]:
# NOTE(review): data_cols is not referenced by any visible cell, and its names
# ('item_id', 'timestamp') do not match the columns the loaded frame is accessed by
# ('user_id', 'movie_id') — confirm it is unused before removing it.
data_cols = ['user_id', 'item_id', 'rating', 'timestamp']
# index_col=0 turns the CSV's first column into the frame's index instead of a data column.
df = pd.read_csv(os.path.join(INTERIM_PATH, "svd_dataset.csv"), index_col=0)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [65]:
# Only rating_scale matters when ratings come from a DataFrame; Reader's line_format is
# for parsing rating files (and 'user item rating' is its default anyway).
reader = Reader(rating_scale=(1, 5))

# load_from_df expects exactly three columns in the order user, item, rating.
# Selecting them by name makes that contract explicit and keeps the cell correct even
# if the interim CSV is reordered or gains extra columns.
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)

# Hold out 20% of the ratings for evaluation; seeded so the split is reproducible.
trainset, testset = train_test_split(data, test_size=0.20, random_state=SEED)

# Training model

In [66]:
# SVD model with default hyper-parameters; random_state is fixed so training is repeatable.
algo = SVD(random_state=SEED)
# Fit on the training split. As the last expression of the cell, the fitted model's repr
# is shown as the cell output.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f589ca1300>

# Evaluation

In [67]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    Args:
        predictions: iterable of 5-item records unpacked as
            (user id, item id, true rating, estimated rating, details).
            Only the user id and the two ratings are used.
        k: number of top-estimated items per user treated as the recommendation list.
        threshold: a rating ``>= threshold`` counts as "relevant" (true rating)
            or "recommended" (estimated rating).

    Returns:
        A ``(precisions, recalls)`` tuple of dicts keyed by user id. A metric whose
        denominator is zero (nothing recommended / nothing relevant) is reported as 0.
    """
    # Bucket (estimated, true) rating pairs by user.
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        ratings_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Best-estimated items first; the first k of them form the recommendation list.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's items, not just the top k.
        relevant_total = sum((actual >= threshold) for (_, actual) in ranked)

        # Items in the top k that are actually recommended (estimate clears the threshold).
        recommended_in_top = sum((estimate >= threshold) for (estimate, _) in top_k)

        # Items in the top k that are both relevant and recommended.
        hits_in_top = sum(
            ((actual >= threshold) and (estimate >= threshold))
            for (estimate, actual) in top_k
        )

        # Precision@k: share of recommended items that are relevant (0 when undefined).
        if recommended_in_top != 0:
            precisions[uid] = hits_in_top / recommended_in_top
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that were recommended (0 when undefined).
        if relevant_total != 0:
            recalls[uid] = hits_in_top / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

In [99]:
# Score the held-out ratings, then derive per-user ranking metrics from them.
# The relevance threshold here is 3, overriding the helper's default.
predictions = algo.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=3)

# Both accuracy helpers print their metric themselves; the return values are not needed.
accuracy.mse(predictions)
accuracy.rmse(predictions)

# Average the per-user values so every user weighs the same.
mean_precision = sum(precisions.values()) / len(precisions)
mean_recall = sum(recalls.values()) / len(recalls)
print(f"Precision at K: {round(mean_precision, 4)}")
print(f"Recall at K: {round(mean_recall, 4)}")


MSE: 0.8745
RMSE: 0.9352
Precision at K: 0.9059
Recall at K: 0.6365


# Make Recommendations

In [71]:
# Column names assigned positionally to the pipe-separated u.item file.
item_cols = [
    'movie_id',
    'movie_title',
    'release_date',
    'video_release_date',
    'imdb_url',
    # presumably one flag column per genre from here on — confirm against the dataset README
    'unknown',
    'action',
    'adventure',
    'animation',
    'childrens',
    'comedy',
    'crime',
    'documentary',
    'drama',
    'fantasy',
    'film-noir',
    'horror',
    'musical',
    'mystery',
    'romance',
    'sci-fi',
    'thriller',
    'war',
    'western',
]
items = pd.read_csv(
    os.path.join(RAW_PATH, "u.item"),
    sep="|",
    names=item_cols,
    encoding='latin-1',
)

In [90]:
def make_recommendation(user_id, top_k, model):
    """Return the titles of the `top_k` best-scored movies the user has not rated.

    Reads the notebook-level frames `items` (movie catalogue) and `df` (ratings).

    Args:
        user_id: id of the user, matched against ``df['user_id']``.
        top_k: number of titles to return.
        model: object whose ``predict(user_id, movie_id)`` result exposes the
            estimated rating as ``.est``.

    Returns:
        List of movie titles, ordered from highest to lowest estimated rating.
    """
    # Candidate pool: every catalogue movie minus the ones this user already rated.
    catalogue_ids = set(items['movie_id'].tolist())
    rated_ids = set(df[df['user_id'] == user_id]['movie_id'].tolist())
    candidates = catalogue_ids - rated_ids

    # Score every candidate, then rank best-first.
    scored = [
        (movie_id, model.predict(user_id, movie_id).est)
        for movie_id in candidates
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Translate the winning ids back to titles; .item() requires exactly one catalogue match.
    titles = []
    for movie_id, _ in scored[:top_k]:
        matching_rows = items[items['movie_id'] == movie_id]
        titles.append(matching_rows['movie_title'].item())

    return titles

In [97]:
# Draw the demo user from a generator seeded with the notebook's SEED so that
# Restart & Run All always shows recommendations for the same user. The unseeded
# global np.random.choice picked a different user on every run.
# Sampling from the full rating column (not unique ids) keeps the original behaviour:
# users with more ratings are more likely to be picked.
rng = np.random.default_rng(SEED)
user_id = rng.choice(df['user_id'].tolist())

print(f"Recommendations for user_id - {user_id}")
for idx, movie_name in enumerate(make_recommendation(user_id, 10, algo), 1):
    print(f"{idx}. {movie_name}")

Recommendations for user_id - 457
1. Lone Star (1996)
2. Godfather: Part II, The (1974)
3. Titanic (1997)
4. Boot, Das (1981)
5. Rear Window (1954)
6. Raise the Red Lantern (1991)
7. Lawrence of Arabia (1962)
8. North by Northwest (1959)
9. Close Shave, A (1995)
10. To Kill a Mockingbird (1962)


# Save model

In [69]:
# Create the output directory first: open(..., 'wb') raises FileNotFoundError when
# MODELS_PATH does not exist yet (e.g. on a fresh checkout).
os.makedirs(MODELS_PATH, exist_ok=True)

# Persist the fitted model. Pickle files must only be loaded from trusted sources.
with open(os.path.join(MODELS_PATH, 'SVD.pkl'), 'wb') as f:
    pickle.dump(algo, f)