## Рекомендательные системы (матрица предпочтений и SVD)

Датасеты: https://grouplens.org/datasets/movielens/

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Load the MovieLens movie catalogue and the user ratings from CSV files
# located in the current working directory.
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')
ratings_df = pd.read_csv(filepath_or_buffer='ratings.csv')

In [2]:
# Print column dtypes and non-null counts for both tables,
# separated by a 40-character dashed rule.
ratings_df.info()
print('-'*40)
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [3]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Preview the first rows of the movie catalogue.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Number of distinct values per column of the ratings table.
ratings_df.nunique()

userId         610
movieId       9724
rating          10
timestamp    85043
dtype: int64

In [6]:
# Summary statistics of the ratings table; the min/max rows show the raw id ranges.
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [7]:
# Number of distinct values per column of the movie catalogue.
movies_df.nunique()

movieId    9742
title      9737
genres      951
dtype: int64

In [8]:
# Summary statistics of the catalogue's only numeric column, movieId.
movies_df.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


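Из обзоров видно, что, по крайней мере, movieId требует перекодировки: идентификаторы идут с большими пропусками (максимальное значение 193609 при всего 9724 уникальных значениях в "ratings"), а для построения матрицы рейтингов нужны сплошные индексы, начинающиеся с нуля. Также видно, что количество уникальных идентификаторов movieId в "movies" (9742) и "ratings" (9724) отличается, то есть не все фильмы из "movies" задействованы в "ratings". Оставим в "movies" только те позиции, которые имеются в "ratings".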

In [9]:
# Boolean mask over the catalogue: True for movies that have at least one rating.
idx_ = movies_df['movieId'].isin(ratings_df['movieId'].values)

# Out of curiosity, list the movies that never appear in ratings.
# The mask is inverted with `~` instead of comparing against False.
movies_df.loc[~idx_]

Unnamed: 0,movieId,title,genres
816,1076,"Innocents, The (1961)",Drama|Horror|Thriller
2211,2939,Niagara (1953),Drama|Thriller
2499,3338,For All Mankind (1989),Documentary
2587,3456,"Color of Paradise, The (Rang-e khoda) (1999)",Drama
3118,4194,I Know Where I'm Going! (1945),Drama|Romance|War
4037,5721,"Chosen, The (1981)",Drama
4506,6668,"Road Home, The (Wo de fu qin mu qin) (1999)",Drama|Romance
4598,6849,Scrooge (1970),Drama|Fantasy|Musical
4704,7020,Proof (1991),Comedy|Drama|Romance
5020,7792,"Parallax View, The (1974)",Thriller


In [10]:
# Keep only the rated movies and renumber the rows from 0.
# NOTE: reset_index() is called without drop=True, so the previous row labels
# are kept as an extra 'index' column (visible in the displayed frame).
movies_df = movies_df.loc[idx_].reset_index()
movies_df

Unnamed: 0,index,movieId,title,genres
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
9719,9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9720,9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9721,9739,193585,Flint (2017),Drama
9722,9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [11]:
# Encode movieId and userId as integer codes starting at 0.
# Both encoders are fitted on the ratings table; the movie encoder is then
# reused for the movie catalogue so that both tables share the same codes.
le_movie = LabelEncoder().fit(ratings_df['movieId'].values)
le_user = LabelEncoder().fit(ratings_df['userId'].values)

ratings_df.loc[:, 'movieId'] = le_movie.transform(ratings_df['movieId'].values)
ratings_df.loc[:, 'userId'] = le_user.transform(ratings_df['userId'].values)
movies_df.loc[:, 'movieId'] = le_movie.transform(movies_df['movieId'].values)

In [12]:
# Sanity check after encoding: userId and movieId should now start at 0.
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,325.127564,3101.735561,3.501557,1205946000.0
std,182.618491,2627.050983,1.042529,216261000.0
min,0.0,0.0,0.5,828124600.0
25%,176.0,900.0,3.0,1019124000.0
50%,324.0,2252.0,3.5,1186087000.0
75%,476.0,5095.25,4.0,1435994000.0
max,609.0,9723.0,5.0,1537799000.0


In [13]:
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds


# Build the sparse rating matrix: rows are encoded users, columns are encoded
# movies, stored values are the ratings. The shape is inferred from the
# largest row / column index.
R = coo_matrix(
    (
        ratings_df['rating'].values,
        (ratings_df['userId'].values, ratings_df['movieId'].values),
    )
)
R

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in COOrdinate format>

In [14]:
# Apply a truncated singular value decomposition to the sparse matrix R,
# keeping k=6 singular values / vectors.
u, s, vt = svds(R, k=6)

In [15]:
# Left singular vectors: one row per user, one column per retained component.
u.shape

(610, 6)

In [16]:
# One singular value per retained component.
s.shape

(6,)

In [17]:
# Right singular vectors (transposed): one row per component, one column per movie.
vt.shape

(6, 9724)

In [18]:
from sklearn.neighbors import NearestNeighbors


# Use the rows of v (vt transposed) as 6-dimensional movie vectors and index
# them for neighbour search with the estimator's default metric.
v = vt.transpose()
nn = NearestNeighbors(n_neighbors=10)
nn.fit(v)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [19]:
# Find the indices of the 10 nearest neighbours of every movie; the distances
# are discarded.
# NOTE: the query points are the fitted points themselves, so a movie is
# usually its own first neighbour, but not always: in the displayed `ind`
# array, rows 9721 and 9722 both start with 9340.
_, ind = nn.kneighbors(v, n_neighbors=10)

In [20]:
# One row per movie, one column per neighbour.
ind.shape

(9724, 10)

In [21]:
# Inspect the neighbour index table (values are encoded movie ids).
ind

array([[   0,  615,  546, ...,  815,   32,  325],
       [   1,  276,  197, ...,  131,   44,  302],
       [   2,  291,  630, ...,  144,  379,  364],
       ...,
       [9340, 9718, 9721, ..., 9720, 9439, 9716],
       [9340, 9718, 9721, ..., 9720, 9439, 9716],
       [9723, 7976, 8249, ..., 7218, 8410, 4210]], dtype=int64)

In [22]:
# Titles ordered by encoded movieId, so that array positions line up with the
# movie codes used as column indices of R and as values of `ind`.
movie_titles = movies_df.sort_values(by='movieId')['title'].values
movie_titles

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Flint (2017)', 'Bungo Stray Dogs: Dead Apple (2018)',
       'Andrew Dice Clay: Dice Rules (1991)'], dtype=object)

In [23]:
# Translate neighbour indices into titles: row i holds the titles of the
# 10 nearest neighbours found for movie i.
nn10 = pd.DataFrame(movie_titles[ind])

In [24]:
# Select rows by the movie's own title (row i of nn10 belongs to
# movie_titles[i]) instead of by column 0 of nn10: column 0 is the nearest
# neighbour, which is not guaranteed to be the movie itself.
# regex=False makes the search a plain substring match.
idx = pd.Series(movie_titles).str.contains('Shrek', regex=False)
nn10[idx].head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
3189,Shrek (2001),Finding Nemo (2003),"Monsters, Inc. (2001)",Pirates of the Caribbean: The Curse of the Bla...,"Incredibles, The (2004)",Shrek 2 (2004),Harry Potter and the Prisoner of Azkaban (2004),"Beautiful Mind, A (2001)",Harry Potter and the Sorcerer's Stone (a.k.a. ...,Ocean's Eleven (2001)


In [25]:
# Select rows by the movie's own title (row i of nn10 belongs to
# movie_titles[i]) instead of by column 0 of nn10: column 0 is the nearest
# neighbour, which is not guaranteed to be the movie itself.
# regex=False makes the search a plain substring match.
idx = pd.Series(movie_titles).str.contains('Star Wars', regex=False)
nn10[idx].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
224,Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back...,Raiders of the Lost Ark (Indiana Jones and the...,Saving Private Ryan (1998),Star Wars: Episode VI - Return of the Jedi (1983),"Sixth Sense, The (1999)",Indiana Jones and the Last Crusade (1989),"Terminator, The (1984)",Back to the Future (1985),Die Hard (1988)
897,Star Wars: Episode V - The Empire Strikes Back...,Star Wars: Episode IV - A New Hope (1977),Raiders of the Lost Ark (Indiana Jones and the...,Star Wars: Episode VI - Return of the Jedi (1983),Indiana Jones and the Last Crusade (1989),Saving Private Ryan (1998),"Terminator, The (1984)","Sixth Sense, The (1999)",Back to the Future (1985),Die Hard (1988)
910,Star Wars: Episode VI - Return of the Jedi (1983),Indiana Jones and the Last Crusade (1989),Star Wars: Episode V - The Empire Strikes Back...,Raiders of the Lost Ark (Indiana Jones and the...,Star Wars: Episode IV - A New Hope (1977),Back to the Future (1985),"Terminator, The (1984)",Die Hard (1988),"Fifth Element, The (1997)",Saving Private Ryan (1998)
1978,Star Wars: Episode I - The Phantom Menace (1999),"Fifth Element, The (1997)",Men in Black (a.k.a. MIB) (1997),Indiana Jones and the Temple of Doom (1984),Austin Powers: The Spy Who Shagged Me (1999),Ghostbusters (a.k.a. Ghost Busters) (1984),Back to the Future Part II (1989),Austin Powers: International Man of Mystery (1...,Back to the Future Part III (1990),"South Park: Bigger, Longer and Uncut (1999)"
3827,Star Wars: Episode II - Attack of the Clones (...,X2: X-Men United (2003),Dogma (1999),"Patriot, The (2000)",Spider-Man 2 (2004),American Pie (1999),"Bourne Supremacy, The (2004)",Mission: Impossible II (2000),Shaun of the Dead (2004),Back to the Future Part III (1990)


In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform


# User-user cosine similarity computed over the full rows of R; movies a user
# has not rated are not stored in the sparse matrix and therefore act as zeros.
D = cosine_similarity(R)
D.shape

(610, 610)

In [27]:
# Inspect the user-user similarity matrix.
D

array([[1.        , 0.02728287, 0.05972026, ..., 0.29109737, 0.09357193,
        0.14532081],
       [0.02728287, 1.        , 0.        , ..., 0.04621095, 0.0275654 ,
        0.10242675],
       [0.05972026, 0.        , 1.        , ..., 0.02112846, 0.        ,
        0.03211875],
       ...,
       [0.29109737, 0.04621095, 0.02112846, ..., 1.        , 0.12199271,
        0.32205486],
       [0.09357193, 0.0275654 , 0.        , ..., 0.12199271, 1.        ,
        0.05322546],
       [0.14532081, 0.10242675, 0.03211875, ..., 0.32205486, 0.05322546,
        1.        ]])

In [28]:
# "схожесть" пользователей нужно рассчитывать только по тем фильмам, которые ими оценены.
# u - рейтинги пользователя u
# v - рейтинги пользователя v
def similarity(u, v):
    """Return the cosine similarity of two rating vectors over co-rated items.

    Only the positions where both vectors are non-zero take part in the
    computation, so items rated by just one of the two users are ignored.

    Args:
        u: 1-D array-like with the ratings of the first user (0 = not rated).
        v: 1-D array-like with the ratings of the second user, same length
            as ``u``.

    Returns:
        float: ``1 - cosine distance`` of the co-rated sub-vectors, or ``0.0``
        when the two vectors have no position where both are non-zero.
    """
    # Coerce to arrays so that plain lists / tuples can be masked as well.
    u = np.asarray(u)
    v = np.asarray(v)

    both_rated = (u != 0) & (v != 0)
    if not both_rated.any():
        # No overlap at all: report zero similarity (as a float, like the
        # regular branch).
        return 0.0
    return float(1 - cosine(u[both_rated], v[both_rated]))

In [30]:
# Evaluate the custom co-rated-only metric for every pair of users on the
# dense form of R; the result is a condensed (flat) vector of pair values.
# NOTE(review): the callback returns a similarity, so the values in `d` are
# similarities rather than distances.
d = pdist(R.toarray(), metric=similarity)

In [31]:
# One entry per unordered pair of users: 610 * 609 / 2 = 185745.
d.shape

(185745,)

In [32]:
# Expand the condensed vector into a symmetric user-by-user matrix.
# NOTE(review): the diagonal of the result is 0 (see the displayed D), so a
# user's similarity to themselves reads as 0 rather than 1 - confirm this is
# what downstream code expects.
D = squareform(d)
D.shape

(610, 610)

In [33]:
# Inspect the co-rated-only similarity matrix.
D

array([[0.        , 1.        , 0.79190331, ..., 0.95657271, 0.97706762,
        0.97688086],
       [1.        , 0.        , 0.        , ..., 0.99007734, 1.        ,
        0.99050108],
       [0.79190331, 0.        , 0.        , ..., 0.61178709, 0.        ,
        0.8602937 ],
       ...,
       [0.95657271, 0.99007734, 0.61178709, ..., 0.        , 0.9723029 ,
        0.95789507],
       [0.97706762, 1.        , 0.        , ..., 0.9723029 , 0.        ,
        0.96310385],
       [0.97688086, 0.99050108, 0.8602937 , ..., 0.95789507, 0.96310385,
        0.        ]])