In [34]:
import pandas as pd
import numpy as np
from scipy import sparse
import gc
import psutil
import implicit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from pandas.api.types import CategoricalDtype
from collections import Counter
from itertools import product
from tqdm import tqdm

In [1]:
import random

In [9]:
a = random.choice([1,2,3,4,5,6])
a

3

In [2]:
def memory():
    gc.collect()
    print(f"все свободно памяти: {psutil.virtual_memory().available/1024**3: .2f}\nкод занимает: {psutil.Process().memory_info().rss/1024**3:.2f}")

In [3]:
memory()

все свободно памяти:  18.84
код занимает: 0.18


In [4]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
ratings["rating_bin"] = ratings["rating"].apply(lambda x : 1 if x>3 else 0)
ratings["rating_bin"].value_counts(normalize=True)

rating_bin
1    0.689722
0    0.310278
Name: proportion, dtype: float64

In [6]:
ratings[ratings["user_id"]==1]["book_id"]

0           258
75          268
76         5556
77         3638
78         1796
           ... 
5704475     142
5704476     642
5704477     901
5704479     212
5704480     231
Name: book_id, Length: 117, dtype: int64

In [7]:
ratings.sort_values("user_id")

Unnamed: 0,user_id,book_id,rating,rating_bin
0,1,258,5,1
999485,1,140,3,0
999486,1,869,4,1
999487,1,2679,3,0
999488,1,1310,4,1
...,...,...,...,...
5404305,53424,5500,4,1
5404304,53424,4214,5,1
5404302,53424,2032,4,1
5404318,53424,41,5,1


In [8]:
ratings["number_of_books"] = ratings.groupby("user_id").cumcount()+1

In [9]:
users = ratings.groupby("user_id").agg(counts_book = ("book_id", "count"))

In [10]:
users

Unnamed: 0_level_0,counts_book
user_id,Unnamed: 1_level_1
1,117
2,65
3,91
4,134
5,100
...,...
53420,110
53421,110
53422,130
53423,77


In [11]:
ratings = ratings.merge(users, on = "user_id", how = "inner")

In [12]:
ratings

Unnamed: 0,user_id,book_id,rating,rating_bin,number_of_books,counts_book
0,1,258,5,1,1,117
1,2,4081,4,1,1,65
2,2,260,5,1,2,65
3,2,9296,5,1,3,65
4,2,2318,3,0,4,65
...,...,...,...,...,...,...
5976474,49925,510,5,1,131,135
5976475,49925,528,4,1,132,135
5976476,49925,722,4,1,133,135
5976477,49925,949,5,1,134,135


In [13]:
ratings["ratio_book"] = ratings["number_of_books"]/ratings["counts_book"]

In [14]:
ratings = ratings.drop(columns = ["number_of_books", "counts_book"])

In [15]:
ratings

Unnamed: 0,user_id,book_id,rating,rating_bin,ratio_book
0,1,258,5,1,0.008547
1,2,4081,4,1,0.015385
2,2,260,5,1,0.030769
3,2,9296,5,1,0.046154
4,2,2318,3,0,0.061538
...,...,...,...,...,...
5976474,49925,510,5,1,0.970370
5976475,49925,528,4,1,0.977778
5976476,49925,722,4,1,0.985185
5976477,49925,949,5,1,0.992593


In [16]:
ratings[ratings["ratio_book"]<=0.8]

Unnamed: 0,user_id,book_id,rating,rating_bin,ratio_book
0,1,258,5,1,0.008547
1,2,4081,4,1,0.015385
2,2,260,5,1,0.030769
3,2,9296,5,1,0.046154
4,2,2318,3,0,0.061538
...,...,...,...,...,...
5976428,49802,9148,5,1,0.757576
5976429,49802,1086,4,1,0.767677
5976430,49802,1553,4,1,0.777778
5976431,49802,2090,4,1,0.787879


In [17]:
train = ratings[ratings["ratio_book"]<=0.75][["user_id", "book_id", "rating_bin"]]
test = ratings[ratings["ratio_book"]>0.75][["user_id", "book_id", "rating_bin"]]

In [18]:
train.shape, test.shape

((4462310, 3), (1514169, 3))

In [19]:
user_index = ratings["user_id"].unique()
book_index = ratings["book_id"].unique()
user_cat = CategoricalDtype(categories=user_index)
book_cat = CategoricalDtype(categories=book_index)
rows_train = train["user_id"].astype(user_cat).cat.codes
cols_train = train["book_id"].astype(book_cat).cat.codes
rows_test = test["user_id"].astype(user_cat).cat.codes
cols_test = test["book_id"].astype(book_cat).cat.codes

In [20]:
train_matrix = sparse.csr_matrix((train["rating_bin"], (rows_train, cols_train)), shape = (len(user_index), len(book_index)))
test_matrix = sparse.csr_matrix((test["rating_bin"], (rows_test, cols_test)), shape = (len(user_index), len(book_index)))

In [21]:
train_matrix_np = train_matrix.toarray()
test_matrix_np = test_matrix.toarray()
train_matrix_np.shape, test_matrix_np.shape

((53424, 10000), (53424, 10000))

In [22]:
train["user_id"].nunique(), train["book_id"].nunique()

(53424, 10000)

In [23]:
books = pd.read_csv("books.csv")
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [24]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   book_id                    10000 non-null  int64  
 1   goodreads_book_id          10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [25]:
train_matrix_np[42][train_matrix_np[42]>0]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [26]:
user = train_matrix_np[42]

In [27]:
user_books = []
for i in range(len(user)):
    if user[i]>0:
        user_books.append(books[books["book_id"]==i]["title"].values[0])
user_books[:10]        

['The Great Gatsby',
 'The Girl with the Dragon Tattoo (Millennium, #1)',
 'Catching Fire (The Hunger Games, #2)',
 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',
 'Mockingjay (The Hunger Games, #3)',
 'Of Mice and Men',
 'Fifty Shades of Grey (Fifty Shades, #1)',
 'A Game of Thrones (A Song of Ice and Fire, #1)',
 'Eclipse (Twilight, #3)',
 "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)"]

In [28]:
len(user_books)

68

In [29]:
train_matrix_np[0], test_matrix_np[0]

(array([1, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64))

In [30]:
model = NearestNeighbors(n_neighbors=200,  algorithm="brute", metric="cosine")
model.fit(train_matrix)

0,1,2
,n_neighbors,200
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [31]:
def ap_counting(user_id, k):
    if user_id < 0:
        return "ошибка: параметр user_id  должнен быть больше или равен 0"
    if k <= 0:
        return "ошибка: параметр k  должнен быть больше  0"
    a = user_id
    _, b = model.kneighbors(train_matrix[a])
    q = b.flatten()[1:]
    q_books = np.where(train_matrix_np[q]>0)[1]
    a_books = np.where(train_matrix_np[a]>0)[0]
    r_books =q_books[~np.isin(q_books, a_books)]
    d_books = Counter(r_books)
    df = pd.DataFrame(list(d_books.values()), index = list(d_books.keys()))
    max_c = df[0].max()
    #recommendations = df[df[0]>(int(max_c*0.4))].sort_values(0, ascending = False).index.tolist()
    recommendations = df.sort_values(0, ascending = False).index.tolist()[:k]
    l_test = np.where(test_matrix_np[a]>0)[0]
    #l_test= [x for x in range(len(test_matrix_np[a])) if test_matrix_np[a][x]>0]
    if len(l_test)== 0:
        return 0
    l_ap = []
    iterations = 0
    for i in range(len(recommendations)):
        if recommendations[i] in l_test:
            l_ap.append((1+iterations)/(i+1))
            iterations +=1       
    apk = sum(l_ap)/len(l_test)
    return apk

In [32]:
user = np.random.randint(train_matrix_np.shape[0])
ap_counting(user, 10)

0.006802721088435374

In [35]:
ap_list = []
users = np.random.randint(0, train_matrix_np.shape[0], 500)
for user in tqdm(users):
    result = ap_counting(user, 10)
    ap_list.append(result)
mAP = sum(ap_list)/len(ap_list)
mAP   

100%|████████████████████████████████████████| 500/500 [02:57<00:00,  2.82it/s]


0.020294440845614187

In [36]:
l_book = np.where(train_matrix_np>0)[1]
users_book = Counter(l_book)
df_books = pd.DataFrame({"count_books": users_book.values()}, index = users_book.keys())

In [37]:
df_books_sort = df_books.sort_values("count_books", ascending = False).head(150)

In [38]:
df_books_sort["books"] = [books[books["book_id"]==book_index[x]]["title"].values[0] for x in df_books_sort.index.tolist()]
df_books_sort

Unnamed: 0,count_books,books
18,16819,Harry Potter and the Sorcerer's Stone (Harry P...
5694,16574,"The Hunger Games (The Hunger Games, #1)"
161,14263,To Kill a Mockingbird
15,11584,Harry Potter and the Prisoner of Azkaban (Harr...
1345,11415,Harry Potter and the Deathly Hallows (Harry Po...
...,...,...
540,2807,The Bell Jar
279,2788,Divine Secrets of the Ya-Ya Sisterhood
5650,2786,"A Dance with Dragons (A Song of Ice and Fire, #5)"
110,2777,Wicked: The Life and Times of the Wicked Witch...


In [39]:
df_books_sort

Unnamed: 0,count_books,books
18,16819,Harry Potter and the Sorcerer's Stone (Harry P...
5694,16574,"The Hunger Games (The Hunger Games, #1)"
161,14263,To Kill a Mockingbird
15,11584,Harry Potter and the Prisoner of Azkaban (Harr...
1345,11415,Harry Potter and the Deathly Hallows (Harry Po...
...,...,...
540,2807,The Bell Jar
279,2788,Divine Secrets of the Ya-Ya Sisterhood
5650,2786,"A Dance with Dragons (A Song of Ice and Fire, #5)"
110,2777,Wicked: The Life and Times of the Wicked Witch...


In [40]:
book_index

array([ 258, 4081,  260, ..., 9580, 8892, 9548], dtype=int64)

In [41]:
books.loc[books["book_id"]==book_index[161],"title"].values[0]

'To Kill a Mockingbird'

In [42]:
rec = df_books_sort.index.tolist()
apk_list = []
for user in users:
    l_test = np.where(test_matrix_np[user]>0)[0]
    l_train = np.where(train_matrix_np[user]>0)[0]
    notintrain = [x for x in rec if x not in l_train]
    notintrain = notintrain [:10]
    iterations = 0
    l_ap = []
    if len(l_test)!=0:
        for i in range(len(notintrain)):
            if notintrain[i] in l_test:
                l_ap.append((1+iterations)/(i+1))
                iterations +=1       
        apk = sum(l_ap)/len(l_test)
        apk_list.append(apk)
    else:
        apk_list.append(0)
mAP1 = sum(apk_list)/len(apk_list)
mAP1   

0.011958986900959447

In [74]:
factors = [20, 40, 70, 100]
iteration_s = [20, 40, 70, 100]

In [75]:
pairs = product(factors, iteration_s)
pairs = list(pairs)

In [76]:
len(pairs)

16

In [77]:
results = {}
for factor, iters in pairs:
    model = implicit.als.AlternatingLeastSquares(factors=factor, iterations=iters)
    model.fit(train_matrix)
    apk_list = []
    for user in users:
        rec = model.recommend(userid = user, filter_already_liked_items=True, N=10,user_items=train_matrix[user])[0]
        l_test = np.where(test_matrix_np[user]>0)[0][:10]
        iterations = 0
        l_ap = []
        if len(l_test)!=0:
            for i in range(len(rec)):
                if rec[i] in l_test:
                    l_ap.append((1+iterations)/(i+1))
                    iterations +=1       
            apk = sum(l_ap)/len(l_test)
            apk_list.append(apk)
        else:
            apk_list.append(0)
    mAP1 = sum(apk_list)/len(apk_list)
    results[mAP1]=[factor,iters]
    print(mAP1)

  0%|          | 0/20 [00:00<?, ?it/s]

0.031181126228269088


  0%|          | 0/40 [00:00<?, ?it/s]

0.029951587301587302


  0%|          | 0/70 [00:00<?, ?it/s]

0.028982063492063493


  0%|          | 0/100 [00:00<?, ?it/s]

0.02934164021164021


  0%|          | 0/20 [00:00<?, ?it/s]

0.03216230158730159


  0%|          | 0/40 [00:00<?, ?it/s]

0.03160440035273369


  0%|          | 0/70 [00:00<?, ?it/s]

0.03159613756613757


  0%|          | 0/100 [00:00<?, ?it/s]

0.03167793650793651


  0%|          | 0/20 [00:00<?, ?it/s]

0.03418272486772486


  0%|          | 0/40 [00:00<?, ?it/s]

0.033588606701940034


  0%|          | 0/70 [00:00<?, ?it/s]

0.03379867724867725


  0%|          | 0/100 [00:00<?, ?it/s]

0.03407401234567901


  0%|          | 0/20 [00:00<?, ?it/s]

0.03404628747795414


  0%|          | 0/40 [00:00<?, ?it/s]

0.03345544123204837


  0%|          | 0/70 [00:00<?, ?it/s]

0.034246428571428573


  0%|          | 0/100 [00:00<?, ?it/s]

0.034181635487528346


In [90]:
df_results = pd.DataFrame({"mAP": results.keys(), "parameters(factors, iterations)":results.values()})
df_results.sort_values("mAP", ascending = False).head(1)

Unnamed: 0,mAP,"parameters(factors, iterations)"
14,0.034246,"[100, 70]"


In [85]:
model1 = implicit.als.AlternatingLeastSquares(factors=50, iterations=50)
model1.fit(train_matrix)

  0%|          | 0/50 [00:00<?, ?it/s]

In [87]:
pred_matrix = model1.recommend_all(train_matrix, N=10, filter_already_liked_items=True)

In [88]:

apk_list = []
for user in users:
    rec = pred_matrix[user]
    l_test = np.where(test_matrix_np[user]>0)[0][:10]
    iterations = 0
    l_ap = []
    if len(l_test)!=0:
        for i in range(len(rec)):
            if rec[i] in l_test:
                l_ap.append((1+iterations)/(i+1))
                iterations +=1       
        apk = sum(l_ap)/len(l_test)
        apk_list.append(apk)
    else:
        apk_list.append(0)
mAP1 = sum(apk_list)/len(apk_list)
mAP1   

0.032492819349962204

In [89]:
apk_list = []
for user in users:
    rec = model1.recommend(userid = user, filter_already_liked_items=True, N=10,user_items=train_matrix[user])[0]
    l_test = np.where(test_matrix_np[user]>0)[0][:10]
    iterations = 0
    l_ap = []
    if len(l_test)!=0:
        for i in range(len(rec)):
            if rec[i] in l_test:
                l_ap.append((1+iterations)/(i+1))
                iterations +=1       
        apk = sum(l_ap)/len(l_test)
        apk_list.append(apk)
    else:
        apk_list.append(0)
mAP1 = sum(apk_list)/len(apk_list)
mAP1   

0.032492819349962204