In [70]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
data_movies = pd.read_csv('ml-latest-small/movies.csv')
data_ratings = pd.read_csv('ml-latest-small/ratings.csv')
data_tags = pd.read_csv('ml-latest-small/tags.csv')

In [4]:
print(data_movies)
print(data_ratings)
print(data_tags)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                  

In [13]:
all_movies = data_movies['movieId'].unique()
print(len(all_movies))
users_all = data_ratings['userId'].unique()
print(len(users_all))

9742
610


In [20]:
data_movies['genres'] = data_movies['genres'].apply(
    lambda x: [] if [genre.strip() for genre in x.split('|')] == ['(no genres listed)'] else [genre.strip() for genre in x.split('|')]
)   

In [21]:
data_movies['genres']

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [9]:
all_genres = set()
for i in data_movies['genres']:
    all_genres.update(i)

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [12]:
all_movies

array([     1,      2,      3, ..., 193585, 193587, 193609], dtype=int64)

### Реализация item-based

In [22]:
# Создание фрейма по жанрам и среднему рейтингу
mean_ratings = data_ratings.groupby('movieId')['rating'].mean().reset_index()

mean_ratings.rename(columns={'rating': 'mean_rating'}, inplace=True)

print(mean_ratings)

item_matrix = pd.DataFrame(0, columns=list(all_genres), index=all_movies)

      movieId  mean_rating
0           1     3.920930
1           2     3.431818
2           3     3.259615
3           4     2.357143
4           5     3.071429
...       ...          ...
9719   193581     4.000000
9720   193583     3.500000
9721   193585     3.500000
9722   193587     3.500000
9723   193609     4.000000

[9724 rows x 2 columns]


In [24]:
for _, row in data_movies.iterrows():
    movie_id = row['movieId']
    genres = row['genres']  

    for genre in genres:
        if genre in item_matrix.columns:
            item_matrix.loc[movie_id, genre] = 1

print(item_matrix)

        IMAX  Horror  Western  Sci-Fi  Animation  Film-Noir  Mystery  Action  \
1          0       0        0       0          1          0        0       0   
2          0       0        0       0          0          0        0       0   
3          0       0        0       0          0          0        0       0   
4          0       0        0       0          0          0        0       0   
5          0       0        0       0          0          0        0       0   
...      ...     ...      ...     ...        ...        ...      ...     ...   
193581     0       0        0       0          1          0        0       1   
193583     0       0        0       0          1          0        0       0   
193585     0       0        0       0          0          0        0       0   
193587     0       0        0       0          1          0        0       1   
193609     0       0        0       0          0          0        0       0   

        Crime  War  Romance  Documentar

In [38]:
df_index = item_matrix.reset_index()
df_index.rename(columns={'index': 'movieId'}, inplace=True)

df_finish= df_index.merge(mean_ratings, on='movieId', how='left')
print(df_finish)

      movieId  IMAX  Horror  Western  Sci-Fi  Animation  Film-Noir  Mystery  \
0           1     0       0        0       0          1          0        0   
1           2     0       0        0       0          0          0        0   
2           3     0       0        0       0          0          0        0   
3           4     0       0        0       0          0          0        0   
4           5     0       0        0       0          0          0        0   
...       ...   ...     ...      ...     ...        ...        ...      ...   
9737   193581     0       0        0       0          1          0        0   
9738   193583     0       0        0       0          1          0        0   
9739   193585     0       0        0       0          0          0        0   
9740   193587     0       0        0       0          1          0        0   
9741   193609     0       0        0       0          0          0        0   

      Action  Crime  ...  Romance  Documentary  Mus

In [39]:
df_indexes = df_finish.pop('movieId')

In [40]:
df_finish

Unnamed: 0,IMAX,Horror,Western,Sci-Fi,Animation,Film-Noir,Mystery,Action,Crime,War,Romance,Documentary,Musical,Thriller,Fantasy,Comedy,Children,Drama,Adventure,mean_rating
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,3.920930
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,3.431818
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,3.259615
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,2.357143
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3.071429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,4.000000
9738,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,3.500000
9739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.500000
9740,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.500000


In [52]:
df_finish['mean_rating'] = df_finish['mean_rating'].fillna(0)


In [53]:
df_columns = df_finish.columns
print(list(df_columns))

['IMAX', 'Horror', 'Western', 'Sci-Fi', 'Animation', 'Film-Noir', 'Mystery', 'Action', 'Crime', 'War', 'Romance', 'Documentary', 'Musical', 'Thriller', 'Fantasy', 'Comedy', 'Children', 'Drama', 'Adventure', 'mean_rating']


In [58]:
df_numpy = df_finish.to_numpy()

In [59]:
# Вычисление матрицы сходства
cosinuse_similar = cosine_similarity(df_numpy)

In [63]:
print(len(cosinuse_similar))
print(cosinuse_similar)

9742
[[1.         0.94839359 0.85925067 ... 0.83524586 0.86409509 0.89646547]
 [0.94839359 1.         0.81898262 ... 0.8583921  0.82772528 0.86608624]
 [0.85925067 0.81898262 1.         ... 0.88208245 0.85056927 0.95824767]
 ...
 [0.83524586 0.8583921  0.88208245 ... 1.         0.89149871 0.93281525]
 [0.86409509 0.82772528 0.85056927 ... 0.89149871 1.         0.89948959]
 [0.89646547 0.86608624 0.95824767 ... 0.93281525 0.89948959 1.        ]]


In [64]:
df_indexes

0            1
1            2
2            3
3            4
4            5
         ...  
9737    193581
9738    193583
9739    193585
9740    193587
9741    193609
Name: movieId, Length: 9742, dtype: int64

In [68]:
df_indexes[df_indexes == 5].index[0]

4

In [85]:
def find_k_similar_films(movie_id, k=5):
    target_index = df_indexes[df_indexes == movie_id].index[0]
    print(target_index)

    similarities = cosinuse_similar[target_index]

    # Ищем индексы топ-k значений (исключая сам объект)
    top_k_indices = np.argsort(similarities)[::-1]
    top_k_indices = top_k_indices[top_k_indices != target_index][:k:]
    print(top_k_indices)
    return target_index, top_k_indices
    

In [86]:
target_index, top_k_indexes = find_k_similar_films(5)

4
[2129  644 1528 2960 3459]


In [93]:
def print_item_based_result(target_index, top_k_indexes):
    target_movieId = df_indexes.loc[target_index]
    # print(target_movieId)

    top_k_moviesId = df_indexes.loc[top_k_indexes].values
    moviesId = np.insert(top_k_moviesId, 0, target_movieId)
    # print(moviesId)
    # return moviesId

    found_items = data_movies[data_movies['movieId'].isin(moviesId)]

    print(found_items)

print_item_based_result(target_index, top_k_indexes)

      movieId                               title    genres
4           5  Father of the Bride Part II (1995)  [Comedy]
644       830        First Wives Club, The (1996)  [Comedy]
1528     2060                  BASEketball (1998)  [Comedy]
2129     2829                    Muse, The (1999)  [Comedy]
2960     3968                    Bedazzled (2000)  [Comedy]
3459     4718               American Pie 2 (2001)  [Comedy]


### user-based

In [119]:
df_matrix_user = pd.DataFrame(0, index=all_movies, columns=users_all)

df_pivot = data_ratings.pivot(index='movieId', columns='userId', values='rating')

# Заполняем пропуски значением 0
df_pivot_filled = df_pivot.fillna(0)
del df_pivot

In [120]:
print(df_matrix_user)
print(df_pivot_filled)

        1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
1         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
2         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
3         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
5         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
193581    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
193583    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
193585    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
193587    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
193609    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

        604  605  606  607  608  609  6

In [121]:
df_matrix_user = df_matrix_user.astype('float64') 
df_matrix_user.update(df_pivot_filled)

In [125]:
df_matrix_user

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
df_reset = df_matrix_user.reset_index()
df_reset.rename(columns={'index': 'movieId'}, inplace=True)



In [131]:
df_indexes_u_m = df_reset.pop('movieId')

In [135]:
numpy_user_based = df_reset.T.to_numpy()
print(numpy_user_based)

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


In [137]:
cosine_sim_user_based = cosine_similarity(numpy_user_based)
print(len(cosine_sim_user_based))
print(cosine_sim_user_based)

610
[[1.         0.02728287 0.05972026 ... 0.29109737 0.09357193 0.14532081]
 [0.02728287 1.         0.         ... 0.04621095 0.0275654  0.10242675]
 [0.05972026 0.         1.         ... 0.02112846 0.         0.03211875]
 ...
 [0.29109737 0.04621095 0.02112846 ... 1.         0.12199271 0.32205486]
 [0.09357193 0.0275654  0.         ... 0.12199271 1.         0.05322546]
 [0.14532081 0.10242675 0.03211875 ... 0.32205486 0.05322546 1.        ]]


In [162]:
indexes_user_list = df_matrix_user.columns
print(indexes_user_list)

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       601, 602, 603, 604, 605, 606, 607, 608, 609, 610],
      dtype='int64', length=610)


In [163]:
def find_similar_k_users(userId, k=5):
    target_index = np.where(indexes_user_list == userId)[0][0]
    print(target_index)
    similarities = cosine_sim_user_based[target_index]

    # Ищем индексы топ-k значений (исключая сам объект)
    top_k_indices = np.argsort(similarities)[::-1]
    top_k_indices = top_k_indices[top_k_indices != target_index][:k:]
    print(top_k_indices)
    return target_index, top_k_indices

target_id_user, top_k_id_users = find_similar_k_users(1)

0
[265 312 367  56  90]


In [164]:
def get_rated_movies(df, user_id):
    user_ratings = df[user_id]
    rated_movie_ids = user_ratings[user_ratings != 0].index

    return rated_movie_ids.tolist()

In [165]:
def print_films_user_based_result(target_index, top_k_indexes):
    target_userId = indexes_user_list[target_index]
    print(target_userId)

    top_k_userId = list(indexes_user_list[list(top_k_indexes)])
    print(top_k_userId)

    target_movies_id = get_rated_movies(df_matrix_user, target_userId)
    print(target_movies_id)

    set_movies_id_recommended = set()
    for i in top_k_userId:
        set_movies_id_recommended.update(get_rated_movies(df_matrix_user, i))

    print(set_movies_id_recommended)

    set_movies_id_recommended.difference_update(target_movies_id)

    print('Фильмы, которые посмотрел пользователь:')

    user_movies = data_movies[data_movies['movieId'].isin(target_movies_id)]
    print(user_movies, '\n')

    print('Рекомендованные фильмы, которые смотрели похожие пользователи:')
    recommended_movies = data_movies[data_movies['movieId'].isin(set_movies_id_recommended)]
    print(recommended_movies, '\n')

print_films_user_based_result(target_id_user, top_k_id_users)

1
[266, 313, 368, 57, 91]
[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163, 216, 223, 231, 235, 260, 296, 316, 333, 349, 356, 362, 367, 423, 441, 457, 480, 500, 527, 543, 552, 553, 590, 592, 593, 596, 608, 648, 661, 673, 733, 736, 780, 804, 919, 923, 940, 943, 954, 1009, 1023, 1024, 1025, 1029, 1030, 1031, 1032, 1042, 1049, 1060, 1073, 1080, 1089, 1090, 1092, 1097, 1127, 1136, 1196, 1197, 1198, 1206, 1208, 1210, 1213, 1214, 1219, 1220, 1222, 1224, 1226, 1240, 1256, 1258, 1265, 1270, 1275, 1278, 1282, 1291, 1298, 1348, 1377, 1396, 1408, 1445, 1473, 1500, 1517, 1552, 1573, 1580, 1587, 1617, 1620, 1625, 1644, 1676, 1732, 1777, 1793, 1804, 1805, 1920, 1927, 1954, 1967, 2000, 2005, 2012, 2018, 2028, 2033, 2046, 2048, 2054, 2058, 2078, 2090, 2093, 2094, 2096, 2099, 2105, 2115, 2116, 2137, 2139, 2141, 2143, 2161, 2174, 2193, 2253, 2268, 2273, 2291, 2329, 2338, 2353, 2366, 2387, 2389, 2395, 2406, 2414, 2427, 2450, 2459, 2470, 2478, 2492, 2502, 2528, 2529, 2542, 2571, 2580, 2596, 2616, 2617, 2628,

## Модель подсчета рейтинга с использованием кластеризации