I decided to use the KNN model for this data. One of the benefits of this model is that we don't need to train the model.
You may the implementation [here](../models/model.py).
Now let's check if it works.

In [12]:
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from models.model import Recommender

In [13]:
df = pd.read_csv("../data/interim/ratings.csv")

In [14]:
recommender = Recommender(5, df)

In [15]:
test_user = [20, "F", "writer", [("Back to the Future (1985)", 5)]]
recommender.suggest(*test_user)

[('Somewhere in Time (1980)',
  'http://us.imdb.com/M/title-exact?Somewhere%20in%20Time%20(1980)'),
 ('Crash (1996)', 'http://us.imdb.com/M/title-exact?Crash%20(1996)'),
 ('Room with a View, A (1986)',
  'http://us.imdb.com/M/title-exact?Room%20with%20a%20View,%20A%20(1986)'),
 ('Deceiver (1997)', 'http://us.imdb.com/M/title-exact?Liar+(1997)'),
 ('Alien 3 (1992)', 'http://us.imdb.com/M/title-exact?Alien%203%20(1992)')]

As we can see, it works.
Now let's fine tune the model to get the best results.
To measure models with different parameters, I use accuracy metric as it's used in the benchmark.

In [16]:
user_id = 150
user = df[df["UserId"] == user_id][["Age", "Gender", "Occupation"]].iloc[0].tolist()
user.append([])
for title, rating in df[df["UserId"] == user_id][["Title", "Rating"]].values:
    if rating > 3:
        user[-1].append((str(title), int(rating)))
test_movie = user[-1].pop()[0]
movies, _ = zip(*recommender.suggest(*user))
print(test_movie, movies)

Postino, Il (1994) ('Psycho (1960)', 'Miracle on 34th Street (1994)', 'Body Snatchers (1993)', 'Maltese Falcon, The (1941)', 'Sling Blade (1996)')


In [38]:
def around(x: float):
    return int(np.around(x))

def objective_function(k: int, leaf_size: int):
    global df
    _recommender = Recommender(around(k), df, leaf_size=around(leaf_size))
    _total = []
    while len(_total) < 100:
        _user_id = np.random.choice(df["UserId"].unique())
        _test_user = df[df["UserId"] == _user_id][["Age", "Gender", "Occupation"]].iloc[0].tolist()
        _test_user.append([])
        for _title, _rating in df[df["UserId"] == _user_id][["Title", "Rating"]].values:
            if _rating > 3:
                _test_user[-1].append((str(_title), int(_rating)))

        if not len(_test_user[-1]):
            continue
        _test_movie = _test_user[-1].pop()[0]

        _movies = list(map(lambda x: x[0], recommender.suggest(*_test_user)))
        _total.append(_test_movie in _movies)

    return sum(_total) / len(_total)

In [39]:
parameter_bounds = {"k": (5, 20), "leaf_size": (10, 100)}
bayesian_optimizer = BayesianOptimization(objective_function, parameter_bounds, random_state=42)

In [40]:
ayesian_optimizer.maximize(init_points=5, n_iter=10)

|   iter    |  target   |     k     | leaf_size |
-------------------------------------------------
| [0m1        [0m | [0m0.0      [0m | [0m10.62    [0m | [0m95.56    [0m |
| [95m2        [0m | [95m0.01     [0m | [95m15.98    [0m | [95m63.88    [0m |
| [0m3        [0m | [0m0.0      [0m | [0m7.34     [0m | [0m24.04    [0m |
| [0m4        [0m | [0m0.0      [0m | [0m5.871    [0m | [0m87.96    [0m |
| [0m5        [0m | [0m0.0      [0m | [0m14.02    [0m | [0m73.73    [0m |
| [0m6        [0m | [0m0.0      [0m | [0m16.83    [0m | [0m63.89    [0m |
| [0m7        [0m | [0m0.0      [0m | [0m17.43    [0m | [0m37.02    [0m |
| [0m8        [0m | [0m0.0      [0m | [0m6.52     [0m | [0m56.95    [0m |
| [0m9        [0m | [0m0.0      [0m | [0m5.085    [0m | [0m51.74    [0m |
| [0m10       [0m | [0m0.0      [0m | [0m15.01    [0m | [0m67.8     [0m |
| [0m11       [0m | [0m0.01     [0m | [0m5.487    [0m | [0m29.37    

So the best parameters are `k=16` and `leaf_size=64`