In [243]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from pandas.api.types import CategoricalDtype 
import implicit
import tqdm


In [130]:
# Загрузим датасет
ratings = pd.read_csv('ratings.csv')
ratings

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
...,...,...,...
5976474,49925,510,5
5976475,49925,528,4
5976476,49925,722,4
5976477,49925,949,5


In [132]:
# Зададим бинарные оценки 2 - книга понравилась, 1 - не понравилась, если задать, как в задании, то книги, которые не читали и книги, 
# которые не понравились, не будут отличаться
ratings.loc[ratings['rating'] <= 3, 'rating'] = 1
ratings.loc[ratings['rating'] > 3, 'rating'] = 2
ratings

Unnamed: 0,user_id,book_id,rating
0,1,258,2
1,2,4081,2
2,2,260,2
3,2,9296,2
4,2,2318,1
...,...,...,...
5976474,49925,510,2
5976475,49925,528,2
5976476,49925,722,2
5976477,49925,949,2


In [134]:
# Пронумеруем прочитанные книги
ratings = ratings.sort_values(by=['user_id'])
number = []
n = 1
for i in range(0, len(ratings)):
    if i == 0:
        number.append(n)
        continue
    if ratings.iloc[i,0] != ratings.iloc[i-1,0]:
        n = 1
        number.append(n)
        continue
    else:
        n += 1
        number.append(n)
        continue
ratings['number'] = number
ratings  
    

Unnamed: 0,user_id,book_id,rating,number
0,1,258,2,1
999485,1,140,1,2
999486,1,869,2,3
999487,1,2679,1,4
999488,1,1310,2,5
...,...,...,...,...
5404305,53424,5500,2,129
5404304,53424,4214,2,130
5404302,53424,2032,2,131
5404318,53424,41,2,132


In [136]:
# Посчитаем количество прочитанных
count = ratings.groupby('user_id').max().reset_index()[['user_id', 'number']]
count.rename(columns={'number': 'count'}, inplace=True)
ratings = ratings.merge(count, how='left')
ratings

Unnamed: 0,user_id,book_id,rating,number,count
0,1,258,2,1,117
1,1,140,1,2,117
2,1,869,2,3,117
3,1,2679,1,4,117
4,1,1310,2,5,117
...,...,...,...,...,...
5976474,53424,5500,2,129,133
5976475,53424,4214,2,130,133
5976476,53424,2032,2,131,133
5976477,53424,41,2,132,133


In [138]:
# Переведем номера прочитанных книг в доли
ratings['fractions'] = ratings['number'] / ratings['count']
ratings

Unnamed: 0,user_id,book_id,rating,number,count,fractions
0,1,258,2,1,117,0.008547
1,1,140,1,2,117,0.017094
2,1,869,2,3,117,0.025641
3,1,2679,1,4,117,0.034188
4,1,1310,2,5,117,0.042735
...,...,...,...,...,...,...
5976474,53424,5500,2,129,133,0.969925
5976475,53424,4214,2,130,133,0.977444
5976476,53424,2032,2,131,133,0.984962
5976477,53424,41,2,132,133,0.992481


In [140]:
# Разобъем на трейн
train = ratings[ratings.fractions <= 0.7].drop(columns=['number', 'count', 'fractions'])
train    

Unnamed: 0,user_id,book_id,rating
0,1,258,2
1,1,140,1
2,1,869,2
3,1,2679,1
4,1,1310,2
...,...,...,...
5976434,53424,518,2
5976435,53424,257,2
5976436,53424,332,2
5976437,53424,245,2


In [142]:
# train.to_csv('train for Collabaration.csv', index=False)

In [144]:
# Получим матрицу для трейна
user_index = train.user_id.unique()
books_index = train.book_id.unique()

rows = train['user_id'].astype(CategoricalDtype(categories=user_index)).cat.codes
cols = train['book_id'].astype(CategoricalDtype(categories=books_index)).cat.codes

matrix = sparse.csr_matrix((train.rating, (rows, cols)), shape=(len(user_index), len(books_index)))
matrix = matrix.toarray()
matrix.shape

(53424, 10000)

In [154]:
# Получим матрицу для теста
test = ratings[ratings.fractions > 0.7].drop(columns=['number', 'count', 'fractions'])
# test.to_csv('test for Collabaration.csv', index=False)
user_index1 = test.user_id.unique()
books_index1 = test.book_id.unique()

rows1 = test['user_id'].astype(CategoricalDtype(categories=user_index1)).cat.codes
cols1 = test['book_id'].astype(CategoricalDtype(categories=books_index1)).cat.codes

matrix1 = sparse.csr_matrix((test.rating, (rows1, cols1)), shape=(len(user_index1), len(books_index1)))
matrix1 = matrix1.toarray()
matrix1.shape

(53424, 10000)

In [146]:
# Выведем список названий книг с высокой оценкой для пользователя 29104
books = pd.read_csv('books.csv')
user_number = 29104
for ind, score in enumerate(matrix[user_number]):
    if score>0:
        books_id = books_index[ind]
        print(score, books[books['book_id']==books_id].original_title.values[0])

2 The Last Lecture
2 The Great Gatsby
2 Het Achterhuis: Dagboekbrieven 14 juni 1942 - 1 augustus 1944
2 Velvet Elvis: Repainting the Christian Faith
1 Where the Sidewalk Ends: The Poems and Drawings of Shel Silverstein
1 The Hobbit or There and Back Again
2 Hatchet
1 The Fault in Our Stars
2 Maus: A Survivor's Tale : My Father Bleeds History
2 The Shack: Where Tragedy Confronts Eternity
2 バトル・ロワイアル
2 Artemis Fowl
2 The Arctic Incident
2 The Opal Deception
2 The Eternity Code
1 Ensaio Sobre a Cegueira
2 Fahrenheit 451
2 Freakonomics: A Rogue Economist Explores the Hidden Side of Everything
1 Persepolis
1 The Graveyard Book
1 Gregor the Overlander
2 The Outsiders
2 The Lost Hero
2 Legend
1 The Art of Fielding
1 Insurgent
1 Liar & Spy
2 Lost in Shangri-la
2 Unbroken: A World War II Story of Survival, Resilience, and Redemption
2 The Mark of Athena
2 The Chosen
1 Drama
1 The Forever War
1 Pursuit of God
2 The Litigators
2 Fablehaven
2 The Stonekeeper
2 Darth Paper Strikes Back
2 The Wednes

In [186]:
# Функция, которая подсчитывает метрику AP@K
def average_precision_at_k(recommendations, positive_books, k):
    found = 0
    accumulated_precision = 0
    if len(positive_books) > 0:
        for i, book in enumerate(recommendations):
            if book in positive_books:
                found += 1
                precision_at_k = found / (i + 1)
                accumulated_precision += precision_at_k
        
        average_precision = accumulated_precision / min(k, len(positive_books))
    else:
        average_precision = 0
    
    return average_precision

In [180]:
# случайный список из 500 пользовательских индексов
user_random = np.random.randint(matrix.shape[0], size=500)

In [188]:
# Посчитаем метрики случайного бейзлайна
baseline_ap_list = []
for user in user_random:
    recommendations = set(np.random.randint(matrix.shape[1], size=20))
    user_books = set(np.where(matrix[user]>0)[0])
    recommendations = recommendations - user_books
    positive_books = []
    
    for recommendation in recommendations:
        if recommendation in np.where(matrix1[user]>1)[0]:
            positive_books.append(recommendation)

    AP = average_precision_at_k(recommendations,  positive_books, 10)
    baseline_ap_list.append(AP) 
            
mean_ap_at_10 = sum(baseline_ap_list) / len(baseline_ap_list)

print("Средняя метрика AP@10 для случайного бейзлайна:", mean_ap_at_10)    

Средняя метрика AP@10 для случайного бейзлайна: 0.010286600972127287


In [239]:
# Посчитаем метрики бейзлайна «Популярные айтемы»
baseline_ap_list = []
popular_books = set(train.groupby('book_id').count().reset_index()[['book_id', 'user_id']].sort_values(by=['user_id'], ascending=False).iloc[:20,0])
for user in user_random:
    user_books = set(np.where(matrix[user]>0)[0])
    recommendations = popular_books - user_books
    positive_books = []
    
    for recommendation in recommendations:
        if recommendation in np.where(matrix1[user]>1)[0]:
            positive_books.append(recommendation)

    AP = average_precision_at_k(recommendations,  positive_books, 10)
    baseline_ap_list.append(AP) 
            
mean_ap_at_10 = sum(baseline_ap_list) / len(baseline_ap_list)

print("Средняя метрика AP@10 для бейзлайна «Популярные айтемы»:", mean_ap_at_10)    

Средняя метрика AP@10 для бейзлайна «Популярные айтемы»: 0.05060265909271327


In [249]:
# Обучим модель с базовыми парамтерами
model = implicit.als.AlternatingLeastSquares(factors=100, iterations=15, num_threads=7)

model.fit(sparse.csr_matrix(matrix))

  0%|          | 0/15 [00:00<?, ?it/s]

In [255]:
# Получим список рекомендаций для пользователя 29104
recommendations = model.recommend(user_number, sparse.csr_matrix(matrix)[user_number], N=20)
for ind in recommendations[0]: 
   print(books[books['book_id']==ind].original_title.values[0])

The Lost Hero
61 Hours
Obsidian
Prince of Thorns
Blood of the Fold
He's Just Not That Into You: The No-Excuses Truth to Understanding Guys
Same Kind of Different as Me
The Red Badge of Courage
The Curious Incident of the Dog in the Night-Time
George's Marvellous Medicine
One for the Money
Jurassic Park
The Hobbit or There and Back Again
Batman: Year One
Illusions: The Adventures of a Reluctant Messiah
Ramona the Pest
The Story of My Life
Black Hawk Down
Freakonomics: A Rogue Economist Explores the Hidden Side of Everything
Sushi for Beginners


In [265]:
# Посчитаем метрики модели Implicit
baseline_ap_list = []

for user in tqdm.tqdm(user_random):
    recomended_books = set(model.recommend(user_number, sparse.csr_matrix(matrix)[user_number], N=20)[0])
    user_books = set(np.where(matrix[user]>0)[0])
    recommendations = recomended_books - user_books
    positive_books = []
    
    for recommendation in recommendations:
        if recommendation in np.where(matrix1[user]>1)[0]:
            positive_books.append(recommendation)

    AP = average_precision_at_k(recommendations,  positive_books, 10)
    baseline_ap_list.append(AP) 
            
mean_ap_at_10 = sum(baseline_ap_list) / len(baseline_ap_list)

print("Средняя метрика AP@10 для модели Implicit:", mean_ap_at_10)   

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [23:07<00:00,  2.77s/it]

Средняя метрика AP@10 для модели Implicit: 0.02918189098769594





In [271]:
# Посчитаем несколько версий с разными параметрами модели Implicit
for iteration in tqdm.tqdm(range(15,100,30)):
    for factor in tqdm.tqdm(range(100,300,100)):
        if iteration > 15 or factor > 100:
            model = implicit.als.AlternatingLeastSquares(factors=100, iterations=15, num_threads=7)
    
            model.fit(sparse.csr_matrix(matrix))
            baseline_ap_list = []
            for user in user_random:
                recomended_books = set(model.recommend(user_number, sparse.csr_matrix(matrix)[user_number], N=20)[0])
                user_books = set(np.where(matrix[user]>0)[0])
                recommendations = recomended_books - user_books
                positive_books = []
                
                for recommendation in recommendations:
                    if recommendation in np.where(matrix1[user]>1)[0]:
                        positive_books.append(recommendation)
            
                AP = average_precision_at_k(recommendations,  positive_books, 10)
                baseline_ap_list.append(AP) 
                        
            mean_ap_at_10 = sum(baseline_ap_list) / len(baseline_ap_list)
    
            print(f"Средняя метрика AP@10 для модели Implicit c factors = {factor} и iterations={iteration}:", mean_ap_at_10) 
        

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/15 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [23:47<00:00, 713.69s/it][A
 33%|███████████████████████████▎                                                      | 1/3 [23:47<47:34, 1427.39s/it]

Средняя метрика AP@10 для модели Implicit c factors = 200 и iterations=15: 0.030396550903516846



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/15 [00:00<?, ?it/s]


 50%|█████████████████████████████████████████                                         | 1/2 [24:05<24:05, 1445.32s/it][A

Средняя метрика AP@10 для модели Implicit c factors = 100 и iterations=45: 0.034454256705340296


  0%|          | 0/15 [00:00<?, ?it/s]


100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [48:16<00:00, 1448.44s/it][A
 67%|█████████████████████████████████████████████████████▎                          | 2/3 [1:12:04<38:11, 2291.80s/it]

Средняя метрика AP@10 для модели Implicit c factors = 200 и iterations=45: 0.025759885986046976



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/15 [00:00<?, ?it/s]


 50%|█████████████████████████████████████████                                         | 1/2 [24:11<24:11, 1451.95s/it][A

Средняя метрика AP@10 для модели Implicit c factors = 100 и iterations=75: 0.029469270555725666


  0%|          | 0/15 [00:00<?, ?it/s]


100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [48:22<00:00, 1451.46s/it][A
100%|████████████████████████████████████████████████████████████████████████████████| 3/3 [2:00:27<00:00, 2409.07s/it]

Средняя метрика AP@10 для модели Implicit c factors = 200 и iterations=75: 0.02845836181362497





Наилучшего результата удалось достичь при параметрах factors = 100 и iterations=45, при этом mAP=0.0345, что ниже 
, чем у бейзлайна «Популярные айтемы». Данную модель нельзя назвать удачной, т.к. гораздо менее трудозатратный подход с самымыми популярными айтемами даёт к тому же лучшие метрики.