In [1]:
# эти библиотеки нам уже знакомы
import pandas as pd
 
# модуль sparse библиотеки scipy понадобится 
# для работы с разреженными матрицами (об этом ниже)
from scipy.sparse import csr_matrix
 
# из sklearn мы импортируем алгоритм k-ближайших соседей
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the raw anime catalogue and the user reviews.
animes = pd.read_csv('animes.csv')
reviews = pd.read_csv('reviews.csv')

# Drop the review columns that are not needed here.
unused_review_columns = ['uid', 'text', 'scores', 'link']
reviews = reviews.drop(columns=unused_review_columns)
reviews.head(3)

Unnamed: 0,profile,anime_uid,score
0,DesolatePsyche,34096,8
1,baekbeans,34599,10
2,skrn,28891,7


In [3]:
# Drop the catalogue columns that are not needed here.
unused_anime_columns = [
    'synopsis', 'genre', 'aired', 'episodes', 'members',
    'popularity', 'ranked', 'score', 'img_url', 'link',
]
animes = animes.drop(columns=unused_anime_columns)
animes.head(3)

Unnamed: 0,uid,title
0,28891,Haikyuu!! Second Season
1,23273,Shigatsu wa Kimi no Uso
2,34599,Made in Abyss


In [4]:
# Build the preference matrix: one row per anime (anime_uid), one column per
# user (profile), cells hold the score. Pairs without a review become NaN;
# repeated (anime, user) pairs are averaged (pivot_table's default aggfunc).
user_item_matrix = reviews.pivot_table(
    values='score',
    index='anime_uid',
    columns='profile',
)
user_item_matrix.head()

profile,-----noname-----,---SnowFlake---,---was-----,--EYEPATCH--,--Mizu--,--Sunclaudius,--animeislife--,--d41,--mimika--,--skeletor--,...,zz980718,zzSorazz,zzeroparticle,zzs,zzxcvb,zzyamuraihazz,zzz-anime,zzzb,zzzjynne,zzzzzzzz
anime_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Replace NaN (no review) with 0 and keep the result under the same name.
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

profile,-----noname-----,---SnowFlake---,---was-----,--EYEPATCH--,--Mizu--,--Sunclaudius,--animeislife--,--d41,--mimika--,--skeletor--,...,zz980718,zzSorazz,zzeroparticle,zzs,zzxcvb,zzyamuraihazz,zzz-anime,zzzb,zzzjynne,zzzzzzzz
anime_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Thresholds: keep only users with more than 50 scores
# and anime with more than 10 scores.
min_user_votes = 50
min_anime_votes = 10

# Number of scores per user and per anime.
users_votes = reviews.groupby('profile')['score'].count()
animes_votes = reviews.groupby('anime_uid')['score'].count()

# Labels of the users / anime that pass the thresholds.
user_mask = users_votes.index[users_votes > min_user_votes]
anime_mask = animes_votes.index[animes_votes > min_anime_votes]

# Select the surviving rows (anime) and columns (users) in one step.
user_item_matrix = user_item_matrix.loc[anime_mask, user_mask]
user_item_matrix.shape

(2362, 269)

In [7]:
# Store the mostly-zero preference matrix in compressed sparse row format.
dense_scores = user_item_matrix.to_numpy()
csr_data = csr_matrix(dense_scores)

In [8]:
# Remove the name of the columns axis, then turn the anime_uid index into a
# regular column so that rows are numbered 0..n-1.
unnamed_columns = user_item_matrix.rename_axis(None, axis='columns')
user_item_matrix = unnamed_columns.reset_index()
user_item_matrix.head()

Unnamed: 0,anime_uid,0MAN0,11chadwick,5camp,711nono,84DaysWithout,8thSin,AbsarNaeem,Agent_Redacted,Alpharon,...,starshinesMonet,sushiisawesome,themegamancave,tinypinkghostie,tuva,usaking,vigorousjammer,vitriolcocktail,wolfwing,xcomprr
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Configure a brute-force nearest-neighbours search with cosine distance;
# n_jobs=-1 uses all available cores.
knn_params = {
    'n_neighbors': 20,
    'algorithm': 'brute',
    'metric': 'cosine',
    'n_jobs': -1,
}
knn = NearestNeighbors(**knn_params)

# Fit on the sparse preference matrix.
knn.fit(csr_data)

In [10]:
# Number of recommendations to produce and the title fragment to look for.
recommendations = 10
search_word = 'Haikyuu'

# Find every anime whose title contains the fragment (case-sensitive).
# regex=False treats the fragment as plain text, so characters such as
# '(', '[', '+' or '?' in a title do not break or distort the search;
# na=False treats missing titles as "no match" instead of putting NaN into
# the boolean mask (which would make the indexing fail).
title_matches = animes['title'].str.contains(search_word, regex=False, na=False)
anime_search = animes[title_matches]
anime_search

Unnamed: 0,uid,title
0,28891,Haikyuu!! Second Season
10,32935,Haikyuu!!: Karasuno Koukou vs. Shiratorizawa G...
30,30364,Haikyuu!! Movie 2: Shousha to Haisha
86,29755,Haikyuu!! Movie 1: Owari to Hajimari
134,35111,Haikyuu!! Movie 4: Concept no Tatakai
163,35110,Haikyuu!! Movie 3: Sainou to Sense
735,20583,Haikyuu!!
867,25303,Haikyuu!!: Lev Genzan!
1675,40776,Haikyuu!!: To the Top 2nd Season
1676,38883,Haikyuu!!: To the Top


In [11]:
# Several titles may match; for simplicity always take the first one.
if anime_search.empty:
    raise ValueError(f"No anime title contains {search_word!r}")

# uid (column 'uid' of animes) of the first match
searched_uid = anime_search['uid'].iloc[0]

# Rows of the preference matrix that belong to this uid. The matrix only
# contains anime that passed the activity filter, so there may be none.
matrix_rows = user_item_matrix.index[user_item_matrix['anime_uid'] == searched_uid]
if len(matrix_rows) == 0:
    raise ValueError(
        f"Anime uid {searched_uid} is not present in the preference matrix"
    )

# NOTE: despite its name, from here on anime_uid holds the ROW INDEX of the
# anime in the preference matrix (the name is kept because later cells use it).
anime_uid = matrix_rows[0]
anime_uid

1678

In [12]:
# Query with the chosen anime's row; ask for one extra neighbour because the
# anime itself is expected to be among the results.
query_row = csr_data[anime_uid]
neighbours_wanted = recommendations + 1
distances, indices = knn.kneighbors(query_row, n_neighbors=neighbours_wanted)

In [13]:
# kneighbors returns 2-D arrays with one row per query. A single row was
# queried, so take row 0 explicitly (unlike squeeze(), this also works when
# only one neighbour is returned) and convert to plain lists.
indices_list = indices[0].tolist()
distances_list = distances[0].tolist()

# Pair each neighbour's matrix row index with its distance -> list of tuples.
indices_distances = list(zip(indices_list, distances_list))

# Show the element type and the first three pairs.
print(type(indices_distances[0]))
print(indices_distances[:3])

# Sort by distance (second tuple element) in ascending order.
indices_distances_sorted = sorted(indices_distances, key=lambda pair: pair[1])

# Remove the queried anime itself. Filter by row index instead of dropping the
# first element: when another anime is at distance 0 as well, the query is not
# guaranteed to come first. Then keep at most `recommendations` neighbours.
indices_distances_sorted = [
    pair for pair in indices_distances_sorted if pair[0] != anime_uid
][:recommendations]
indices_distances_sorted

<class 'tuple'>
[(1678, 0.0), (1892, 0.37211274378955894), (1449, 0.4681043755617966)]


[(1892, 0.37211274378955894),
 (1449, 0.4681043755617966),
 (1568, 0.499481534075483),
 (1175, 0.5607328686644573),
 (1915, 0.5697636184263161),
 (1750, 0.5735016248224336),
 (1762, 0.5835598075040634),
 (2262, 0.5969556505636742),
 (1799, 0.6117390408503449),
 (1166, 0.6306415724234411)]

In [14]:
# List of {'Title', 'Distance'} dicts, one per recommended anime.
recom_list = []

# Each tuple is (row index in the preference matrix, distance to the query).
for matrix_row, dist in indices_distances_sorted:

    # anime_uid stored in that row of the preference matrix; reading only the
    # column avoids materialising the whole row (and upcasting the uid to float)
    matrix_anime_id = user_item_matrix['anime_uid'].iloc[matrix_row]

    # Title of that anime in the catalogue. A boolean mask with .loc does not
    # depend on animes having a default 0..n-1 index; if the catalogue lists
    # the uid more than once, the first entry is used.
    title = animes.loc[animes['uid'] == matrix_anime_id, 'title'].iloc[0]

    # Store the pair as a dict; the list later becomes a DataFrame.
    recom_list.append({'Title': title, 'Distance': dist})

In [15]:
# Number the rows from 1, as a ranking should be. The index length is taken
# from the list itself, so the cell also works when fewer neighbours than
# `recommendations` were available (a fixed-length index would raise).
recom_df = pd.DataFrame(recom_list, index=range(1, len(recom_list) + 1))
recom_df

Unnamed: 0,Title,Distance
1,Haikyuu!!: Karasuno Koukou vs. Shiratorizawa G...,0.372113
2,Haikyuu!!,0.468104
3,Kuroko no Basket 3rd Season,0.499482
4,Kuroko no Basket,0.560733
5,Bungou Stray Dogs: Hitori Ayumu,0.569764
6,Kuroko no Basket: Saikou no Present Desu,0.573502
7,Servamp,0.58356
8,Tokyo Ghoul:re 2nd Season,0.596956
9,Kuroko no Basket Movie 4: Last Game,0.611739
10,Kimi to Boku. 2,0.630642
