In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cityblock, cosine, euclidean, hamming,jaccard, rogerstanimoto, correlation
from tqdm import notebook

In [2]:
movies = pd.read_csv('movies.dat', sep='::',
                     header=None, names=['movie_id', 'movie_title', 'genres'],
                     engine='python')

In [3]:
ratings = pd.read_csv('ratings.dat', sep='::',
                      names=['user_id', 'movie_id', 'rating', 'rating_timestamp'],
                      engine='python'
                      ).sort_values("rating_timestamp")

In [4]:
movies_with_ratings = movies.join(ratings.set_index('movie_id'), on = 'movie_id')

In [5]:
movies_with_ratings.dropna(inplace = True)

In [6]:
movies_with_ratings.head()

Unnamed: 0,movie_id,movie_title,genres,user_id,rating,rating_timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,6035.0,4.0,956712849.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6032.0,4.0,956718127.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6022.0,5.0,956755763.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6021.0,3.0,956757147.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6016.0,4.0,956778750.0


In [7]:
movies_with_ratings.shape

(1000209, 6)

In [8]:
num_users = movies_with_ratings.user_id.unique().shape[0]
num_films = movies_with_ratings.movie_id.unique().shape[0]

In [9]:
num_films

3706

In [10]:
num_users

6040

In [11]:
movie_vector = {}
for movie, group in notebook.tqdm(movies_with_ratings.groupby('movie_title')):
    movie_vector[movie] = np.zeros(num_users)
    
    for i in range(len(group.user_id.values)):
        u = group.user_id.values[i]
        r = group.rating.values[i]
        movie_vector[movie][int(u - 1)] = r

HBox(children=(FloatProgress(value=0.0, max=3706.0), HTML(value='')))




In [12]:
movie_vector['Toy Story (1995)']

array([5., 0., 0., ..., 0., 0., 3.])

In [13]:
# Реализуем item-to-item коллаборативную фильтрацию
def get_nearest_films(film_title):
    my_fav_film = film_title

    titles = []
    distances = []

    for key in movie_vector.keys():
        if key == my_fav_film:
            continue

        titles.append(key)
        distances.append(correlation(movie_vector[my_fav_film], movie_vector[key]))

    best_indexes = np.argsort(distances)[:5]
    best_movies = [(titles[i], distances[i]) for i in best_indexes]
    
    return best_movies

In [14]:
get_nearest_films('Toy Story (1995)')

[('Toy Story 2 (1999)', 0.5126295237641594),
 ('Aladdin (1992)', 0.529247098927986),
 ('Lion King, The (1994)', 0.588868815603533),
 ('Groundhog Day (1993)', 0.5924534982676297),
 ("Bug's Life, A (1998)", 0.5973209833454718)]

In [15]:
# Функция для получения фильмов, которые пользователь посмотрел недавно,
# и эти фильмы пользователю понравились. Рассматриваются 20 последних просмотренных фильмов
# и из них выбираются 5 фильмов с наивысшей оценкой пользователя
def get_favourite_recent_user_movies(user_id):
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.user_id == current_user_id]
    recent_user_movies = user_movies.sort_values(by = "rating_timestamp").tail(20)
    favourite_recent_user_movies = recent_user_movies.sort_values(by = "rating").tail(5)
    return favourite_recent_user_movies

In [16]:
favourite_recent_for_user_6016 = get_favourite_recent_user_movies(6016)

In [17]:
favourite_recent_for_user_6016

Unnamed: 0,movie_id,movie_title,genres,user_id,rating,rating_timestamp
3566,3635,"Spy Who Loved Me, The (1977)",Action,6016.0,3.0,995664198.0
3694,3763,F/X (1986),Action|Crime|Thriller,6016.0,3.0,995664198.0
1072,1088,Dirty Dancing (1987),Musical|Romance,6016.0,3.0,994455257.0
3459,3528,"Prince of Tides, The (1991)",Drama|Romance,6016.0,3.0,994455301.0
2806,2875,Sommersby (1993),Drama|Mystery|Romance,6016.0,4.0,994455257.0


In [18]:
# В данной функции выбираются 5 последних просмотренных пользователей фильмов, которые пользователю понравились.
# Для каждого фильма находятся 5 ближайших фильмов по оценкам пользователей. Получившиеся 25 фильмов сортируются
# по расстоянию и выбираются 5 фильмов из 25 с наименьшим значением расстояния.
# В данном случае для построения гибридной системы используется стратегия смешивания.
def recommend_for_user(user_id):
    current_user_id = user_id
    favourite_recent_user_movies = get_favourite_recent_user_movies(current_user_id)
    
    recommended_films = []
    for movie in favourite_recent_user_movies.movie_title:
        nearest_films = get_nearest_films(movie)
        recommended_films = recommended_films + nearest_films

    titles = []
    distances = []
    
    user_movies = movies_with_ratings[movies_with_ratings.user_id == current_user_id]
    
    for recommended_films_item in recommended_films:
        titles.append(recommended_films_item[0])
        distances.append(recommended_films_item[1])
        
    best_indexes = np.argsort(distances)[:5]
    best_movies = [(titles[i], distances[i]) for i in best_indexes]
        
    return  best_movies

In [19]:
recommend_for_user_6016 = recommend_for_user(6016)

In [20]:
recommend_for_user_6016

[('Man with the Golden Gun, The (1974)', 0.3415665180135836),
 ('Live and Let Die (1973)', 0.4400176646537637),
 ('For Your Eyes Only (1981)', 0.4584431948281512),
 ('Dr. No (1962)', 0.48150853504834923),
 ("On Her Majesty's Secret Service (1969)", 0.4824363948655366)]