In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import implicit
from scipy import sparse
from pandas.api.types import CategoricalDtype
import gc
import psutil
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from itertools import combinations
import pickle
import warnings
from sklearn.exceptions import DataConversionWarning

In [2]:
def memory():
    gc.collect()
    print(f"все свободно памяти: {psutil.virtual_memory().available/1024**3: .2f}\nкод занимает: {psutil.Process().memory_info().rss/1024**3: .2f}")

In [3]:
memory()

все свободно памяти:  19.89
код занимает:  0.21


## ALS

In [4]:
raitings_full = pd.read_csv("ratings.csv")
raitings_full.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
genres = pd.read_json("goodreads_book_genres_initial.json", lines=True)
genres.head()

Unnamed: 0,book_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"
1,1333909,"{'fiction': 219, 'history, historical fiction,..."
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,287140,{'non-fiction': 3}


In [6]:
raitings = raitings_full.merge(genres["book_id"], on = "book_id", how = "inner")
raitings.shape

(3330234, 3)

In [7]:
raitings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,26,4
2,2,33,4
3,2,301,5
4,2,2686,5


In [8]:
users_list = raitings["user_id"].unique()
train_index, test_index = [],[]
for i in tqdm(users_list):
    a = raitings[raitings["user_id"]==i].index.tolist()
    for j in range(len(a)):
        if (j+1)/len(a)>=0.8:
            test_index.append(a[j])
        else:
            train_index.append(a[j])

100%|███████████████████████████████████| 53424/53424 [08:30<00:00, 104.62it/s]


In [9]:
train = raitings.loc[train_index,:]
test = raitings.loc[test_index, :]
train.shape, test.shape

((2632060, 3), (698174, 3))

In [10]:
train.loc[:,"rating_bin"] = np.where(train["rating"]> 3, 1, 0)
test.loc[:,"rating_bin"] = np.where(test["rating"]> 3, 1, 0)

In [11]:
train.head(), test.head()

(        user_id  book_id  rating  rating_bin
 0             1      258       5           1
 50            1      268       3           0
 51            1     3638       3           0
 52            1     1796       5           1
 135377        1     4691       4           1,
          user_id  book_id  rating  rating_bin
 572788         1      354       3           0
 1100414        1     1030       4           1
 1100790        1     1761       4           1
 1100791        1     1942       3           0
 1100793        1       81       5           1)

In [12]:
train["user_id"].nunique(), test["user_id"].nunique()

(53424, 53424)

In [13]:
train["book_id"].nunique(), test["book_id"].nunique()

(5123, 5123)

In [14]:
users_index = raitings["user_id"].unique()
items_index = raitings["book_id"].unique()
users_cat = CategoricalDtype(users_index)
items_cat = CategoricalDtype(items_index)
train_rows = train["user_id"].astype(users_cat).cat.codes
train_cols = train["book_id"].astype(items_cat).cat.codes
test_rows = test["user_id"].astype(users_cat).cat.codes
test_cols = test["book_id"].astype(items_cat).cat.codes

In [15]:
train_matrix = sparse.csr_matrix((train["rating_bin"], (train_rows, train_cols)), shape= (len(users_index), len(items_index)))
test_matrix = sparse.csr_matrix((test["rating_bin"], (test_rows, test_cols)), shape= (len(users_index), len(items_index)))
train_matrix.shape, test_matrix.shape

((53424, 5123), (53424, 5123))

In [34]:
train_matrix_np = train_matrix.toarray()
test_matrix_np = test_matrix.toarray()

In [35]:
memory()

все свободно памяти:  15.82
код занимает:  4.02


In [30]:
model_als0 = implicit.als.AlternatingLeastSquares()
model_als0.fit(train_matrix)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [31]:
recommendations = model_als0.recommend_all(train_matrix, N = 30, filter_already_liked_items=True)

In [36]:

def mAP_count(users, recommendations, k, print_apk):
    apk_list = []
    for user in tqdm(users):
        rec= recommendations[user][:k]
        test = np.where(test_matrix_np[user]>0)[0]
        ap = []
        if len(test)==0:
            apk_list.append(0)
        else:
            iters = 0
            for i, book in enumerate(rec):
                if book in test:
                    ap.append((iters+1)/(i+1))
                    iters += 1
            aPK = sum(ap)/len(test)
            apk_list.append(aPK)
            if len(users)<=10 and print_apk == True:
                print(f"for user_id: {user} aPK: {aPK: .2f}")            
    if len(users)>1:
        if sum(apk_list)>0:
            mAP = sum(apk_list)/len(apk_list)
        else:
            print("нет совпадений")
    else:
        mAP=aPK
    return mAP        


In [39]:
val_users = np.random.randint(0, train_matrix.shape[0], 10)
mAP_als = mAP_count(val_users,recommendations, 10, print_apk =True)
mAP_als

100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2499.73it/s]

for user_id: 35137 aPK:  0.00
for user_id: 24871 aPK:  0.01
for user_id: 16507 aPK:  0.00
for user_id: 34417 aPK:  0.00
for user_id: 11033 aPK:  0.00
for user_id: 44439 aPK:  0.01
for user_id: 6640 aPK:  0.00
for user_id: 22462 aPK:  0.00
for user_id: 19565 aPK:  0.00
for user_id: 40686 aPK:  0.07





0.009015567765567766

## LGBMClassifier

In [46]:
memory()

все свободно памяти:  15.44
код занимает:  3.99


In [47]:
books = pd.read_csv("books.csv")
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [48]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   book_id                    10000 non-null  int64  
 1   goodreads_book_id          10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [49]:
nul_title = books[books["original_title"].isna()].index.tolist()
books.loc[nul_title, "original_title"] = books.loc[nul_title, "title"].str.split("(").str[0]
books.loc[:,"original_title"] = books.loc[:, "original_title"].str.lower().str.strip()
books.loc[:,"authors"] = books.loc[:, "authors"].str.lower().str.strip()

In [50]:
books.loc[nul_title, "original_title"]

74                                  bridget jones's diary
142                           all the light we cannot see
209                                       vampire academy
214                                      ready player one
256                                   alice in wonderland
                              ...                        
9942    sherlock holmes and the case of the hound of t...
9944                                          rock bottom
9956      inside of a dog: what dogs see, smell, and know
9957                                          the pacific
9970                                 the tycoon's revenge
Name: original_title, Length: 585, dtype: object

In [51]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,suzanne collins,2008.0,the hunger games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"j.k. rowling, mary grandpré",1997.0,harry potter and the philosopher's stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,stephenie meyer,2005.0,twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,harper lee,1960.0,to kill a mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,f. scott fitzgerald,1925.0,the great gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [52]:
books["book_number"] = books["title"].str.lower().str.split("(").str[1].str.split("#").str[1].str.replace(r"[^0-9\s]", " ", regex= True).str.replace(r"\s+", "", regex = True).str.strip().fillna("0")
books["series"] = books["title"].str.lower().str.split("(").str[1].str.split("#").str[0].str.replace(r"[^0-9a-zA-Z\s]", " ", regex = True).str.replace(r"\s+", " ", regex = True).str.strip().fillna("single")

In [53]:
books[["book_number", "series"]]

Unnamed: 0,book_number,series
0,1,the hunger games
1,1,harry potter
2,1,twilight
3,0,single
4,0,single
...,...,...
9995,2,the edge
9996,2,the years of lyndon johnson
9997,0,single
9998,0,single


In [54]:
genres = pd.read_json("goodreads_book_genres_initial.json", lines=True)
genres.head()

Unnamed: 0,book_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"
1,1333909,"{'fiction': 219, 'history, historical fiction,..."
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,287140,{'non-fiction': 3}


In [55]:
genres.shape

(2360655, 2)

In [56]:
books = books.merge(genres, on = "book_id", how = "inner")

In [57]:
books.columns

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url', 'book_number', 'series', 'genres'],
      dtype='object')

In [58]:
len(books.loc[:,"genres"])==0

False

In [59]:
a = np.where(books["genres"].apply(len)==0)[0]
print(a)
len(a)

[   8   14   94  104  155  156  188  291  313  317  344  353  366  388
  423  438  535  545  552  575  588  605  606  619  646  654  655  679
  714  769  776  793  794  927 1009 1127 1204 1205 1246 1303 1314 1315
 1319 1377 1387 1593 1613 1750 1757 1771 1815 1927 2037 2064 2093 2101
 2124 2156 2182 2185 2244 2251 2286 2313 2406 2408 2409 2415 2416 2432
 2550 2554 2623 2895 2909 2915 2916 2921 2992 3060 3090 3106 3117 3127
 3128 3132 3163 3167 3185 3225 3283 3347 3351 3358 3359 3450 3460 3487
 3530 3573 3580 3650 3678 3694 3743 3755 3761 3796 3808 3814 3915 3929
 3960 4046 4079 4178 4232 4242 4260 4286 4307 4311 4372 4711 4713 4744
 4746 4783 4804 4814 4882 4952 4995 5098 5103]


135

In [60]:
books.loc[8, "genres"]

{}

In [61]:
books1 = pd.json_normalize(books["genres"]).fillna(0).astype(int)
books1

Unnamed: 0,"fantasy, paranormal",young-adult,fiction,children,"mystery, thriller, crime",romance,"history, historical fiction, biography",non-fiction,"comics, graphic",poetry
0,42143,14393,11308,6907,467,340,0,0,0,0
1,8249,3157,2807,4168,451,214,0,0,0,0
2,54156,17058,15016,11213,668,0,0,0,0,0
3,45018,14422,12224,8879,614,0,0,0,0,0
4,4639,1513,12103,8558,537,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
5118,13,0,70,0,0,0,0,0,0,0
5119,0,0,110,0,42,0,0,0,0,0
5120,21,0,653,0,6,0,0,0,0,0
5121,3,0,177,0,0,0,0,0,0,0


In [62]:
b = np.where(books1<0)[0]
print(b)

[ 120  330  421  658  711  770  772  822 1010 1191 1243 1550 1766 2045
 2298 2411 2577 2718 2734 3018 3130 3355 3659 3967 3980 4037 4083 4087
 4100 4122 4245 4294 4300 4300 4454 4476 4540]


In [63]:
books1.loc[b, :].head(10)

Unnamed: 0,"fantasy, paranormal",young-adult,fiction,children,"mystery, thriller, crime",romance,"history, historical fiction, biography",non-fiction,"comics, graphic",poetry
120,0,0,-1,0,0,0,26,18,1,0
330,0,-1,0,0,0,1,2,11,0,0
421,0,0,0,0,0,0,0,-1,0,0
658,1,0,17,-1,1,0,9,0,0,0
711,0,0,0,0,0,0,-1,20,0,0
770,0,0,1,0,0,0,-2,0,0,0
772,0,0,1,0,0,0,-1,0,0,0
822,0,0,2,0,0,0,-1,0,0,0
1010,0,0,-1,0,0,0,10,6,0,0
1191,1,0,0,0,0,0,0,-1,0,0


In [64]:
def normalize(data):
    data["total"] = data.sum(axis =1)
    data["total"] = data["total"].apply(lambda x : x if x>0 else 1)
    for column in data.columns:
        data[column] = data[column].apply(lambda x: x if x>=0 else 0)
        data[column] = data[column]/data["total"]
    data = data.astype("float16")    
    data = data.drop(columns = "total")
    return data

In [65]:
books1 = normalize(books1)
books1

Unnamed: 0,"fantasy, paranormal",young-adult,fiction,children,"mystery, thriller, crime",romance,"history, historical fiction, biography",non-fiction,"comics, graphic",poetry
0,0.557617,0.190430,0.149658,0.091431,0.006180,0.004501,0.0,0.0,0.0,0.0
1,0.433105,0.165771,0.147339,0.218872,0.023682,0.011238,0.0,0.0,0.0,0.0
2,0.551758,0.173828,0.153076,0.114319,0.006809,0.000000,0.0,0.0,0.0,0.0
3,0.554688,0.177734,0.150635,0.109375,0.007565,0.000000,0.0,0.0,0.0,0.0
4,0.169556,0.055328,0.442627,0.312988,0.019638,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5118,0.156616,0.000000,0.843262,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
5119,0.000000,0.000000,0.723633,0.000000,0.276367,0.000000,0.0,0.0,0.0,0.0
5120,0.030884,0.000000,0.960449,0.000000,0.008827,0.000000,0.0,0.0,0.0,0.0
5121,0.016663,0.000000,0.983398,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [66]:
books1.describe()

Unnamed: 0,"fantasy, paranormal",young-adult,fiction,children,"mystery, thriller, crime",romance,"history, historical fiction, biography",non-fiction,"comics, graphic",poetry
count,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0
mean,0.046112,0.017365,0.332764,0.055023,0.055389,0.023132,0.141602,0.259033,0.015991,0.02684
std,0.131714,0.061951,0.347168,0.172729,0.169678,0.092407,0.208984,0.368164,0.101135,0.126099
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.195068,0.0,0.0,0.0,0.029907,0.0,0.0,0.0
75%,0.003569,0.0,0.652832,0.0,0.0,0.0,0.216797,0.506348,0.0,0.0
max,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.166992,1.0,1.0


In [67]:
books_total = pd.concat([books[["book_id",'books_count','authors','original_title','ratings_count','work_text_reviews_count','series', 'book_number']] , books1], axis = 1)
books_total.shape

(5123, 18)

In [68]:
books_total.columns

Index(['book_id', 'books_count', 'authors', 'original_title', 'ratings_count',
       'work_text_reviews_count', 'series', 'book_number',
       'fantasy, paranormal', 'young-adult', 'fiction', 'children',
       'mystery, thriller, crime', 'romance',
       'history, historical fiction, biography', 'non-fiction',
       'comics, graphic', 'poetry'],
      dtype='object')

In [69]:
def change_columns_name(data, prefix):
    data = data.rename(columns = {x: re.sub(r"[^\w]+", "_", x).strip() for x in data.columns})
    data = data.rename(columns = {x : f"{prefix}_{x}" if prefix not in x else x for x in data.columns})
    return data
                       

In [70]:
books_features = change_columns_name(books_total, "book")
books_features.head()

Unnamed: 0,book_id,books_count,book_authors,book_original_title,book_ratings_count,book_work_text_reviews_count,book_series,book_number,book_fantasy_paranormal,book_young_adult,book_fiction,book_children,book_mystery_thriller_crime,book_romance,book_history_historical_fiction_biography,book_non_fiction,book_comics_graphic,book_poetry
0,1,272,suzanne collins,the hunger games,4780653,155254,the hunger games,1,0.557617,0.19043,0.149658,0.091431,0.00618,0.004501,0.0,0.0,0.0,0.0
1,2,491,"j.k. rowling, mary grandpré",harry potter and the philosopher's stone,4602479,75867,harry potter,1,0.433105,0.165771,0.147339,0.218872,0.023682,0.011238,0.0,0.0,0.0,0.0
2,3,226,stephenie meyer,twilight,3866839,95009,twilight,1,0.551758,0.173828,0.153076,0.114319,0.006809,0.0,0.0,0.0,0.0,0.0
3,4,487,harper lee,to kill a mockingbird,3198671,72586,single,0,0.554688,0.177734,0.150635,0.109375,0.007565,0.0,0.0,0.0,0.0,0.0
4,5,1356,f. scott fitzgerald,the great gatsby,2683664,51992,single,0,0.169556,0.055328,0.442627,0.312988,0.019638,0.0,0.0,0.0,0.0,0.0


In [71]:
books_features.to_csv("vectors_module8/books_features.csv", index = False)

In [72]:
memory()

все свободно памяти:  14.62
код занимает:  4.77


In [73]:
users = pd.json_normalize(books["genres"]).fillna(0)
users = change_columns_name(users, "user") 
users = normalize(users)
users = pd.concat([books["book_id"], users], axis = 1)
users

Unnamed: 0,book_id,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,1,0.557617,0.190430,0.149658,0.091431,0.006180,0.004501,0.0,0.0,0.0,0.0
1,2,0.433105,0.165771,0.147339,0.218872,0.023682,0.011238,0.0,0.0,0.0,0.0
2,3,0.551758,0.173828,0.153076,0.114319,0.006809,0.000000,0.0,0.0,0.0,0.0
3,4,0.554688,0.177734,0.150635,0.109375,0.007565,0.000000,0.0,0.0,0.0,0.0
4,5,0.169556,0.055328,0.442627,0.312988,0.019638,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
5118,9996,0.156616,0.000000,0.843262,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
5119,9997,0.000000,0.000000,0.723633,0.000000,0.276367,0.000000,0.0,0.0,0.0,0.0
5120,9998,0.030884,0.000000,0.960449,0.000000,0.008827,0.000000,0.0,0.0,0.0,0.0
5121,9999,0.016663,0.000000,0.983398,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [74]:
users.describe()

Unnamed: 0,book_id,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
count,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0,5123.0
mean,5001.165723,0.046112,0.017365,0.332764,0.055023,0.055389,0.023132,0.141602,0.259033,0.015991,0.02684
std,2882.312766,0.131714,0.061951,0.347168,0.172729,0.169678,0.092407,0.208984,0.368164,0.101135,0.126099
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2532.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5045.0,0.0,0.0,0.195068,0.0,0.0,0.0,0.029907,0.0,0.0,0.0
75%,7479.5,0.003569,0.0,0.652832,0.0,0.0,0.0,0.216797,0.506348,0.0,0.0
max,10000.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.166992,1.0,1.0


In [75]:
raitings = pd.read_csv("ratings.csv")
raitings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [76]:
raitings = raitings.merge(users, on = "book_id", how = "inner")
raitings.shape

(3330234, 13)

In [77]:
raitings.head()

Unnamed: 0,user_id,book_id,rating,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,1,258,5,0.0,0.0,0.0,0.0,0.0,0.0,0.856934,0.142822,0.0,0.0
1,2,26,4,0.0,0.0,0.01268,0.0,0.0,0.0,0.078369,0.908691,0.0,0.0
2,2,33,4,0.773926,0.013298,0.212769,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,301,5,0.047699,0.089539,0.493896,0.244385,0.005424,0.0,0.118896,0.0,0.0,0.0
4,2,2686,5,0.0,0.0,0.868652,0.0,0.0,0.021881,0.109375,0.0,0.0,0.0


In [78]:
users_activity = raitings.drop(columns = ["book_id", "rating"]).groupby("user_id").sum().reset_index()
users_activity

Unnamed: 0,user_id,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,1,11.378357,1.120872,23.835442,2.217911,1.513325,1.061968,5.610268,14.644333,0.738037,0.877197
1,2,3.532860,0.839363,10.658649,2.194305,0.603302,0.619385,4.728386,9.759766,0.056480,1.006836
2,3,4.583302,0.738621,16.375912,0.876907,2.439732,0.448774,5.682619,16.321800,0.565804,1.965607
3,4,11.300301,1.984688,29.156443,3.419891,3.250870,1.118942,9.154797,23.397018,0.514214,0.701599
4,5,2.217377,1.095039,17.326725,3.572632,1.505829,2.244362,5.496758,10.977890,0.000000,2.563698
...,...,...,...,...,...,...,...,...,...,...,...
53419,53420,9.620451,1.853882,20.513374,4.216080,2.484951,0.680809,8.200439,15.495819,0.922668,1.008667
53420,53421,8.054855,1.570335,24.019882,1.823761,0.726456,2.036320,10.018692,17.044012,0.463326,2.240265
53421,53422,8.569559,1.363434,28.551876,2.792824,3.574135,1.323818,9.387726,14.628750,1.719879,1.086243
53422,53423,5.597801,1.075691,14.809097,1.364563,1.331940,0.351543,6.077877,13.341602,0.235062,1.889648


In [79]:
users_activity = normalize(users_activity.drop(columns = "user_id"))
users_activity

Unnamed: 0,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,0.180664,0.017792,0.378418,0.035217,0.024017,0.016861,0.089050,0.232422,0.011719,0.013924
1,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,0.139038,0.287109,0.001661,0.029617
2,0.091675,0.014771,0.327637,0.017532,0.048798,0.008972,0.113647,0.326416,0.011314,0.039307
3,0.134521,0.023621,0.347168,0.040710,0.038696,0.013321,0.109009,0.278564,0.006123,0.008354
4,0.047180,0.023300,0.368652,0.075989,0.032043,0.047760,0.116943,0.233521,0.000000,0.054535
...,...,...,...,...,...,...,...,...,...,...
53419,0.148071,0.028519,0.315674,0.064880,0.038239,0.010475,0.126221,0.238403,0.014198,0.015518
53420,0.118469,0.023087,0.353271,0.026825,0.010681,0.029953,0.147339,0.250732,0.006813,0.032959
53421,0.117371,0.018677,0.391113,0.038269,0.048950,0.018127,0.128662,0.200439,0.023560,0.014877
53422,0.121521,0.023346,0.321533,0.029617,0.028915,0.007629,0.131958,0.289551,0.005100,0.041016


In [80]:
users_activity

Unnamed: 0,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,0.180664,0.017792,0.378418,0.035217,0.024017,0.016861,0.089050,0.232422,0.011719,0.013924
1,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,0.139038,0.287109,0.001661,0.029617
2,0.091675,0.014771,0.327637,0.017532,0.048798,0.008972,0.113647,0.326416,0.011314,0.039307
3,0.134521,0.023621,0.347168,0.040710,0.038696,0.013321,0.109009,0.278564,0.006123,0.008354
4,0.047180,0.023300,0.368652,0.075989,0.032043,0.047760,0.116943,0.233521,0.000000,0.054535
...,...,...,...,...,...,...,...,...,...,...
53419,0.148071,0.028519,0.315674,0.064880,0.038239,0.010475,0.126221,0.238403,0.014198,0.015518
53420,0.118469,0.023087,0.353271,0.026825,0.010681,0.029953,0.147339,0.250732,0.006813,0.032959
53421,0.117371,0.018677,0.391113,0.038269,0.048950,0.018127,0.128662,0.200439,0.023560,0.014877
53422,0.121521,0.023346,0.321533,0.029617,0.028915,0.007629,0.131958,0.289551,0.005100,0.041016


In [81]:
users_activity.describe()

Unnamed: 0,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
count,53424.0,53424.0,53424.0,53424.0,53424.0,53424.0,53424.0,53424.0,53424.0,53424.0
mean,0.095215,0.0215,0.337402,0.041321,0.04245,0.018448,0.127563,0.276611,0.012039,0.027267
std,0.035339,0.008232,0.044617,0.018433,0.019867,0.010376,0.026474,0.049316,0.011238,0.016479
min,0.001214,0.0,0.095886,0.0,0.0,0.0,0.022583,0.000556,0.0,0.0
25%,0.069336,0.015823,0.307617,0.028473,0.028488,0.010948,0.109436,0.244263,0.00243,0.015411
50%,0.09375,0.02095,0.335938,0.039062,0.040466,0.016739,0.126221,0.276611,0.009727,0.025223
75%,0.119202,0.026459,0.365479,0.051666,0.054321,0.024048,0.144287,0.309082,0.018127,0.036682
max,0.295166,0.074829,0.665039,0.18457,0.197144,0.202026,0.307617,0.588867,0.102905,0.153564


In [82]:
users_activity1 = raitings.groupby("user_id").agg(user_books_count = ("book_id", "count")).reset_index()
users_activity1

Unnamed: 0,user_id,user_books_count
0,1,66
1,2,36
2,3,53
3,4,87
4,5,49
...,...,...
53419,53420,68
53420,53421,71
53421,53422,77
53422,53423,47


In [83]:
users_features = pd.concat([users_activity1, users_activity], axis = 1)
users_features

Unnamed: 0,user_id,user_books_count,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,1,66,0.180664,0.017792,0.378418,0.035217,0.024017,0.016861,0.089050,0.232422,0.011719,0.013924
1,2,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,0.139038,0.287109,0.001661,0.029617
2,3,53,0.091675,0.014771,0.327637,0.017532,0.048798,0.008972,0.113647,0.326416,0.011314,0.039307
3,4,87,0.134521,0.023621,0.347168,0.040710,0.038696,0.013321,0.109009,0.278564,0.006123,0.008354
4,5,49,0.047180,0.023300,0.368652,0.075989,0.032043,0.047760,0.116943,0.233521,0.000000,0.054535
...,...,...,...,...,...,...,...,...,...,...,...,...
53419,53420,68,0.148071,0.028519,0.315674,0.064880,0.038239,0.010475,0.126221,0.238403,0.014198,0.015518
53420,53421,71,0.118469,0.023087,0.353271,0.026825,0.010681,0.029953,0.147339,0.250732,0.006813,0.032959
53421,53422,77,0.117371,0.018677,0.391113,0.038269,0.048950,0.018127,0.128662,0.200439,0.023560,0.014877
53422,53423,47,0.121521,0.023346,0.321533,0.029617,0.028915,0.007629,0.131958,0.289551,0.005100,0.041016


In [84]:
users_features.to_csv("vectors_module8/users_features.csv", index = False)

In [85]:
raitings_total = raitings[["user_id","book_id", "rating"]].merge(users_features, on = "user_id", how = "inner").merge(books_features, on = "book_id", how = "inner")
raitings_total

Unnamed: 0,user_id,book_id,rating,user_books_count,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,...,book_fantasy_paranormal,book_young_adult,book_fiction,book_children,book_mystery_thriller_crime,book_romance,book_history_historical_fiction_biography,book_non_fiction,book_comics_graphic,book_poetry
0,1,258,5,66,0.180664,0.017792,0.378418,0.035217,0.024017,0.016861,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.856934,0.142822,0.000000,0.000000
1,2,26,4,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.000000,0.000000,0.012680,0.000000,0.000000,0.000000,0.078369,0.908691,0.000000,0.000000
2,2,33,4,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.773926,0.013298,0.212769,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2,301,5,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.047699,0.089539,0.493896,0.244385,0.005424,0.000000,0.118896,0.000000,0.000000,0.000000
4,2,2686,5,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.000000,0.000000,0.868652,0.000000,0.000000,0.021881,0.109375,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3330229,50229,8137,2,66,0.048279,0.012299,0.306396,0.049286,0.057312,0.039093,...,0.000000,0.365967,0.105713,0.284668,0.000000,0.065063,0.109741,0.000000,0.000000,0.069092
3330230,23863,8020,4,89,0.116882,0.024933,0.307617,0.036774,0.025925,0.028534,...,0.029419,0.029419,0.088257,0.853027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3330231,49925,330,5,70,0.038940,0.004631,0.354248,0.017258,0.034912,0.008614,...,0.008171,0.005451,0.065369,0.866699,0.000000,0.000000,0.000000,0.000000,0.008171,0.046326
3330232,49925,528,4,70,0.038940,0.004631,0.354248,0.017258,0.034912,0.008614,...,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000


In [86]:
raitings_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3330234 entries, 0 to 3330233
Data columns (total 31 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   user_id                                    int64  
 1   book_id                                    int64  
 2   rating                                     int64  
 3   user_books_count                           int64  
 4   user_fantasy_paranormal                    float16
 5   user_young_adult                           float16
 6   user_fiction                               float16
 7   user_children                              float16
 8   user_mystery_thriller_crime                float16
 9   user_romance                               float16
 10  user_history_historical_fiction_biography  float16
 11  user_non_fiction                           float16
 12  user_comics_graphic                        float16
 13  user_poetry                               

In [87]:
raitings_total.columns.tolist()

['user_id',
 'book_id',
 'rating',
 'user_books_count',
 'user_fantasy_paranormal',
 'user_young_adult',
 'user_fiction',
 'user_children',
 'user_mystery_thriller_crime',
 'user_romance',
 'user_history_historical_fiction_biography',
 'user_non_fiction',
 'user_comics_graphic',
 'user_poetry',
 'books_count',
 'book_authors',
 'book_original_title',
 'book_ratings_count',
 'book_work_text_reviews_count',
 'book_series',
 'book_number',
 'book_fantasy_paranormal',
 'book_young_adult',
 'book_fiction',
 'book_children',
 'book_mystery_thriller_crime',
 'book_romance',
 'book_history_historical_fiction_biography',
 'book_non_fiction',
 'book_comics_graphic',
 'book_poetry']

In [88]:
raitings_total.shape

(3330234, 31)

In [89]:
raitings_total["raiting_bin"] = np.where(raitings_total["rating"]>3,1,0)
raitings_total["raiting_bin"].value_counts(normalize = True)

raiting_bin
1    0.693815
0    0.306185
Name: proportion, dtype: float64

In [90]:
raitings_total.to_parquet("vectors_module8/ratings_total.parquet", index = False, engine="fastparquet")

In [91]:
memory()

все свободно памяти:  13.89
код занимает:  5.46


In [92]:
raitings_total = pd.read_parquet("vectors_module8/ratings_total.parquet", engine = "fastparquet")
a = raitings_total.select_dtypes(include = "float32").columns
raitings_total[a] = raitings_total[a].astype("float16")

In [93]:
raitings_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3330234 entries, 0 to 3330233
Data columns (total 32 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   user_id                                    int64  
 1   book_id                                    int64  
 2   rating                                     int64  
 3   user_books_count                           int64  
 4   user_fantasy_paranormal                    float16
 5   user_young_adult                           float16
 6   user_fiction                               float16
 7   user_children                              float16
 8   user_mystery_thriller_crime                float16
 9   user_romance                               float16
 10  user_history_historical_fiction_biography  float16
 11  user_non_fiction                           float16
 12  user_comics_graphic                        float16
 13  user_poetry                               

In [94]:
df = raitings_total.copy()
df["cumcount"] = df.groupby("user_id").cumcount()+1
users = df.groupby("user_id").agg(count = ("user_id", "count"))
df = df.merge(users, on = "user_id", how = "inner")
df["ratio"]=df["cumcount"]/df["count"]
df

Unnamed: 0,user_id,book_id,rating,user_books_count,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,...,book_mystery_thriller_crime,book_romance,book_history_historical_fiction_biography,book_non_fiction,book_comics_graphic,book_poetry,raiting_bin,cumcount,count,ratio
0,1,258,5,66,0.180664,0.017792,0.378418,0.035217,0.024017,0.016861,...,0.000000,0.000000,0.856934,0.142822,0.000000,0.000000,1,1,66,0.015152
1,2,26,4,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.000000,0.000000,0.078369,0.908691,0.000000,0.000000,1,1,36,0.027778
2,2,33,4,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,2,36,0.055556
3,2,301,5,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.005424,0.000000,0.118896,0.000000,0.000000,0.000000,1,3,36,0.083333
4,2,2686,5,36,0.103882,0.024689,0.313477,0.064514,0.017746,0.018219,...,0.000000,0.021881,0.109375,0.000000,0.000000,0.000000,1,4,36,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3330229,50229,8137,2,66,0.048279,0.012299,0.306396,0.049286,0.057312,0.039093,...,0.000000,0.065063,0.109741,0.000000,0.000000,0.069092,0,66,66,1.000000
3330230,23863,8020,4,89,0.116882,0.024933,0.307617,0.036774,0.025925,0.028534,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,89,89,1.000000
3330231,49925,330,5,70,0.038940,0.004631,0.354248,0.017258,0.034912,0.008614,...,0.000000,0.000000,0.000000,0.000000,0.008171,0.046326,1,68,70,0.971429
3330232,49925,528,4,70,0.038940,0.004631,0.354248,0.017258,0.034912,0.008614,...,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,1,69,70,0.985714


In [95]:
x_train = df[df["ratio"]<0.8].drop(columns = ["cumcount", "count", "ratio","rating", "raiting_bin"])
y_train = df[df["ratio"]<0.8][["rating", "raiting_bin"]]
x_test = df[df["ratio"]>=0.8].drop(columns = ["cumcount", "count", "ratio","rating", "raiting_bin"])
y_test = df[df["ratio"]>=0.8][["rating", "raiting_bin"]]

In [96]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2632060 entries, 0 to 3330210
Data columns (total 30 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   user_id                                    int64  
 1   book_id                                    int64  
 2   user_books_count                           int64  
 3   user_fantasy_paranormal                    float16
 4   user_young_adult                           float16
 5   user_fiction                               float16
 6   user_children                              float16
 7   user_mystery_thriller_crime                float16
 8   user_romance                               float16
 9   user_history_historical_fiction_biography  float16
 10  user_non_fiction                           float16
 11  user_comics_graphic                        float16
 12  user_poetry                                float16
 13  books_count                                int6

In [97]:
x_train.shape, x_test.shape

((2632060, 30), (698174, 30))

In [98]:
x_train["user_id"].nunique(), x_test["user_id"].nunique()

(53424, 53424)

In [99]:
x_train = x_train.drop(columns = ["user_id", "book_id"])
x_test = x_test.drop(columns = ["user_id", "book_id"])

In [100]:
num_features = x_train.select_dtypes(include = "int64").columns.tolist()
cat_features = x_train.select_dtypes(include = "object").columns.tolist()
num_features, cat_features

(['user_books_count',
  'books_count',
  'book_ratings_count',
  'book_work_text_reviews_count'],
 ['book_authors', 'book_original_title', 'book_series', 'book_number'])

In [101]:
memory()

все свободно памяти:  12.67
код занимает:  7.03


In [102]:
l = []
for i in range(0,5):
    pairs = combinations(cat_features, i)
    a = list(pairs)
    l.extend(a)
drop_f = {f'drop_{"_".join(i)}' if len(i) > 0 else "null_cat_features": list(i)  for i in l}
print(drop_f)

{'null_cat_features': [], 'drop_book_authors': ['book_authors'], 'drop_book_original_title': ['book_original_title'], 'drop_book_series': ['book_series'], 'drop_book_number': ['book_number'], 'drop_book_authors_book_original_title': ['book_authors', 'book_original_title'], 'drop_book_authors_book_series': ['book_authors', 'book_series'], 'drop_book_authors_book_number': ['book_authors', 'book_number'], 'drop_book_original_title_book_series': ['book_original_title', 'book_series'], 'drop_book_original_title_book_number': ['book_original_title', 'book_number'], 'drop_book_series_book_number': ['book_series', 'book_number'], 'drop_book_authors_book_original_title_book_series': ['book_authors', 'book_original_title', 'book_series'], 'drop_book_authors_book_original_title_book_number': ['book_authors', 'book_original_title', 'book_number'], 'drop_book_authors_book_series_book_number': ['book_authors', 'book_series', 'book_number'], 'drop_book_original_title_book_series_book_number': ['book_

In [103]:
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [104]:
model_als0

<implicit.cpu.als.AlternatingLeastSquares at 0x194c1cceb40>

In [105]:
models = {"model_als" : model_als0}
for k, v in drop_f.items():
    x_train1 = x_train.drop(columns = v)
    x_test1 = x_test.drop(columns = v)
    cat_features1 = x_train1.select_dtypes("object").columns.tolist()
    transformer = ColumnTransformer(transformers=[
        ("scaler", StandardScaler(), num_features),
        ("ohe", OneHotEncoder(max_categories=100, dtype=np.int8, handle_unknown="ignore", sparse_output=False), cat_features1)], remainder="passthrough")
    pipeline = Pipeline(steps = [
        ("transformer", transformer),
        ("model", LGBMClassifier(verbose = -1, objective="binary"))])
    pipeline.fit(x_train1,y_train["raiting_bin"])
    models[k] = pipeline
    pred = pipeline.predict_proba(x_test1)[:,1]
    roc = roc_auc_score(y_test["raiting_bin"], pred)
    print(f" при удалении колонок {'всех категориальных' if len(v) == 0 else k} roc_auc: {roc: .4f}")

 при удалении колонок всех категориальных roc_auc:  0.6165
 при удалении колонок drop_book_authors roc_auc:  0.6148
 при удалении колонок drop_book_original_title roc_auc:  0.6161
 при удалении колонок drop_book_series roc_auc:  0.6157
 при удалении колонок drop_book_number roc_auc:  0.6126
 при удалении колонок drop_book_authors_book_original_title roc_auc:  0.6163
 при удалении колонок drop_book_authors_book_series roc_auc:  0.6160
 при удалении колонок drop_book_authors_book_number roc_auc:  0.6143
 при удалении колонок drop_book_original_title_book_series roc_auc:  0.6163
 при удалении колонок drop_book_original_title_book_number roc_auc:  0.6148
 при удалении колонок drop_book_series_book_number roc_auc:  0.6140
 при удалении колонок drop_book_authors_book_original_title_book_series roc_auc:  0.6180
 при удалении колонок drop_book_authors_book_original_title_book_number roc_auc:  0.6141
 при удалении колонок drop_book_authors_book_series_book_number roc_auc:  0.6134
 при удалении 

In [106]:
len(models)

17

In [108]:
with open("vectors_module8/models1.pkl", "wb") as file:
    pickle.dump(models, file)

## Gibrid

In [40]:
with open("vectors_module8/models1.pkl", "rb") as file:
    models = pickle.load(file)

In [41]:
for model in models:
    print(model)

model_als
null_cat_features
drop_book_authors
drop_book_original_title
drop_book_series
drop_book_number
drop_book_authors_book_original_title
drop_book_authors_book_series
drop_book_authors_book_number
drop_book_original_title_book_series
drop_book_original_title_book_number
drop_book_series_book_number
drop_book_authors_book_original_title_book_series
drop_book_authors_book_original_title_book_number
drop_book_authors_book_series_book_number
drop_book_original_title_book_series_book_number
drop_book_authors_book_original_title_book_series_book_number


In [42]:
models["model_als"]

<implicit.cpu.als.AlternatingLeastSquares at 0x18866e4e750>

In [43]:
memory()

все свободно памяти:  15.73
код занимает:  4.04


In [44]:
books_features = pd.read_csv("vectors_module8/books_features.csv")
books_features["book_number"]=books_features["book_number"].astype("object")
books_features

Unnamed: 0,book_id,books_count,book_authors,book_original_title,book_ratings_count,book_work_text_reviews_count,book_series,book_number,book_fantasy_paranormal,book_young_adult,book_fiction,book_children,book_mystery_thriller_crime,book_romance,book_history_historical_fiction_biography,book_non_fiction,book_comics_graphic,book_poetry
0,1,272,suzanne collins,the hunger games,4780653,155254,the hunger games,1,0.55760,0.19040,0.1497,0.09143,0.006180,0.00450,0.0,0.0,0.0,0.0
1,2,491,"j.k. rowling, mary grandpré",harry potter and the philosopher's stone,4602479,75867,harry potter,1,0.43300,0.16580,0.1473,0.21890,0.023680,0.01124,0.0,0.0,0.0,0.0
2,3,226,stephenie meyer,twilight,3866839,95009,twilight,1,0.55200,0.17380,0.1531,0.11430,0.006810,0.00000,0.0,0.0,0.0,0.0
3,4,487,harper lee,to kill a mockingbird,3198671,72586,single,0,0.55470,0.17770,0.1506,0.10940,0.007565,0.00000,0.0,0.0,0.0,0.0
4,5,1356,f. scott fitzgerald,the great gatsby,2683664,51992,single,0,0.16960,0.05533,0.4426,0.31300,0.019640,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5118,9996,19,ilona andrews,bayou moon,17204,1180,the edge,2,0.15660,0.00000,0.8433,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0
5119,9997,19,robert a. caro,means of ascent,12582,395,the years of lyndon johnson,2,0.00000,0.00000,0.7236,0.00000,0.276400,0.00000,0.0,0.0,0.0,0.0
5120,9998,60,patrick o'brian,the mauritius command,9421,374,single,0,0.03088,0.00000,0.9604,0.00000,0.008830,0.00000,0.0,0.0,0.0,0.0
5121,9999,7,peggy orenstein,cinderella ate my daughter: dispatches from th...,11279,1988,single,0,0.01666,0.00000,0.9834,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0


In [45]:
books_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5123 entries, 0 to 5122
Data columns (total 18 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   book_id                                    5123 non-null   int64  
 1   books_count                                5123 non-null   int64  
 2   book_authors                               5123 non-null   object 
 3   book_original_title                        5120 non-null   object 
 4   book_ratings_count                         5123 non-null   int64  
 5   book_work_text_reviews_count               5123 non-null   int64  
 6   book_series                                5122 non-null   object 
 7   book_number                                5123 non-null   object 
 8   book_fantasy_paranormal                    5123 non-null   float64
 9   book_young_adult                           5123 non-null   float64
 10  book_fiction            

In [46]:
users_features = pd.read_csv("vectors_module8/users_features.csv")
users_features

Unnamed: 0,user_id,user_books_count,user_fantasy_paranormal,user_young_adult,user_fiction,user_children,user_mystery_thriller_crime,user_romance,user_history_historical_fiction_biography,user_non_fiction,user_comics_graphic,user_poetry
0,1,66,0.18070,0.01779,0.3784,0.03522,0.02402,0.016860,0.08905,0.2324,0.011720,0.013920
1,2,36,0.10390,0.02469,0.3135,0.06450,0.01775,0.018220,0.13900,0.2870,0.001661,0.029620
2,3,53,0.09170,0.01477,0.3276,0.01753,0.04880,0.008970,0.11365,0.3264,0.011314,0.039300
3,4,87,0.13450,0.02362,0.3472,0.04070,0.03870,0.013320,0.10900,0.2786,0.006123,0.008354
4,5,49,0.04718,0.02330,0.3687,0.07600,0.03204,0.047760,0.11694,0.2335,0.000000,0.054530
...,...,...,...,...,...,...,...,...,...,...,...,...
53419,53420,68,0.14810,0.02852,0.3157,0.06490,0.03824,0.010475,0.12620,0.2384,0.014200,0.015520
53420,53421,71,0.11847,0.02309,0.3533,0.02682,0.01068,0.029950,0.14730,0.2507,0.006813,0.032960
53421,53422,77,0.11740,0.01868,0.3910,0.03827,0.04895,0.018130,0.12870,0.2004,0.023560,0.014880
53422,53423,47,0.12150,0.02335,0.3215,0.02962,0.02892,0.007630,0.13200,0.2896,0.005100,0.041020


In [47]:
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [48]:
def gibrid_recommendations(users, items, model, recommendations):
    book_index = {items_index1[i] : i for i in range(len(items_index1))}
    users_dict = {}
    for user in users:
        rec_list = recommendations[user]
        users_dict[user] = [items[i] for i in rec_list]
    user_recs = {}
    for k, v in users_dict.items():
        user_prof = users_features[users_features["user_id"]==k]
        books_prof = books_features[books_features["book_id"].isin(v)]
        pred1, book1 = [], []
        for book in books_prof["book_id"].unique():
            a = books_prof[books_prof["book_id"]==book]
            a.loc[:,user_prof.columns] = user_prof.values
            df_val = a[model.feature_names_in_]
            pred = model.predict_proba(df_val)[:,1]
            pred1.extend(pred)
            #book1.append(book)
        #index1 = [book_index[book] for book in books_prof["book_id"].unique()]
        df = pd.DataFrame({"book_id":books_prof["book_id"].unique(), "pred": pred1, "index":[book_index[book] for book in books_prof["book_id"].unique()]})
        df = df.sort_values("pred", ascending = False)
        recs = df["index"].values
        user_recs[k]=recs

    return user_recs, df


In [49]:
recommendations1 = models["model_als"].recommend_all(train_matrix, N = 30, filter_already_liked_items=True)
items_index1 = raitings["book_id"].unique()

In [53]:
val_users = np.random.randint(0, train_matrix.shape[0], 10)
b = np.random.randint(len(val_users))
total_recs = list(recommendations1[val_users[b]])
best_map, best_model = 0, 0
for k, v in models.items():
    if k != "model_als":
        users_recs,_ = gibrid_recommendations(val_users, items_index1, v, recommendations1)
        mp = mAP_count(users_recs.keys(), users_recs, 10, print_apk=False)
        total_recs.extend(users_recs[val_users[b]])
        if mp>best_map:
            best_map = mp
            best_model = k
        print(f" для модели {k} значение mAP@10:{mp: .4f}")
    else:
        mp = mAP_count(val_users, recommendations1, 10, print_apk=False)
        best_map = mp
        best_model = k
        print(f" для модели {k} значение mAP@10:{mp: .4f}")
if len(set(total_recs)) != 30:
    print(f"ошибка: неправильно определены индексы книг")
print(f"лучший результат метрики mAP@10: {best_map: .4f} показала модель с удаленными колонками: {best_model}")


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3333.05it/s]


 для модели model_als значение mAP@10: 0.0430


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 1999.29it/s]


 для модели null_cat_features значение mAP@10: 0.0277


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 1997.57it/s]


 для модели drop_book_authors значение mAP@10: 0.0283


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2446.94it/s]


 для модели drop_book_original_title значение mAP@10: 0.0291


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2442.81it/s]


 для модели drop_book_series значение mAP@10: 0.0255


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2494.23it/s]


 для модели drop_book_number значение mAP@10: 0.0279


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 4992.62it/s]


 для модели drop_book_authors_book_original_title значение mAP@10: 0.0299


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2506.16it/s]


 для модели drop_book_authors_book_series значение mAP@10: 0.0213


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2549.26it/s]


 для модели drop_book_authors_book_number значение mAP@10: 0.0232


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3331.99it/s]


 для модели drop_book_original_title_book_series значение mAP@10: 0.0286


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3321.43it/s]


 для модели drop_book_original_title_book_number значение mAP@10: 0.0265


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2545.24it/s]


 для модели drop_book_series_book_number значение mAP@10: 0.0267


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3329.87it/s]


 для модели drop_book_authors_book_original_title_book_series значение mAP@10: 0.0222


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3335.70it/s]


 для модели drop_book_authors_book_original_title_book_number значение mAP@10: 0.0237


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3407.79it/s]


 для модели drop_book_authors_book_series_book_number значение mAP@10: 0.0236


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2499.44it/s]


 для модели drop_book_original_title_book_series_book_number значение mAP@10: 0.0326


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3331.19it/s]

 для модели drop_book_authors_book_original_title_book_series_book_number значение mAP@10: 0.0261
лучший результат метрики mAP@10:  0.0430 показала модель с удаленными колонками: model_als





In [55]:
val_users = np.random.randint(0, train_matrix.shape[0], 10)
b = np.random.randint(len(val_users))
total_recs = list(recommendations1[val_users[b]])
best_map, best_model = 0, 0
for k, v in models.items():
    if k != "model_als":
        users_recs,_ = gibrid_recommendations(val_users, items_index1, v, recommendations1)
        mp = mAP_count(users_recs.keys(), users_recs, 10, print_apk=False)
        total_recs.extend(users_recs[val_users[b]])
        if mp>best_map:
            best_map = mp
            best_model = k
        print(f" для модели {k} значение mAP@10:{mp: .4f}")
    else:
        mp = mAP_count(val_users, recommendations1, 10, print_apk=False)
        best_map = mp
        best_model = k
        print(f" для модели {k} значение mAP@10:{mp: .4f}")
if len(set(total_recs)) != 30:
    print(f"ошибка: неправильно определены индексы книг")
print(f"лучший результат метрики mAP@10: {best_map: .4f} показала модель: {best_model}")


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2499.73it/s]


 для модели model_als значение mAP@10: 0.0367


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3405.57it/s]


 для модели null_cat_features значение mAP@10: 0.0459


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3333.84it/s]


 для модели drop_book_authors значение mAP@10: 0.0407


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2513.82it/s]


 для модели drop_book_original_title значение mAP@10: 0.0439


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2327.20it/s]


 для модели drop_book_series значение mAP@10: 0.0269


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2505.71it/s]


 для модели drop_book_number значение mAP@10: 0.0371


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3334.90it/s]


 для модели drop_book_authors_book_original_title значение mAP@10: 0.0414


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3334.37it/s]


 для модели drop_book_authors_book_series значение mAP@10: 0.0121


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3343.41it/s]


 для модели drop_book_authors_book_number значение mAP@10: 0.0390


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3324.85it/s]


 для модели drop_book_original_title_book_series значение mAP@10: 0.0215


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2493.49it/s]


 для модели drop_book_original_title_book_number значение mAP@10: 0.0397


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3332.25it/s]


 для модели drop_book_series_book_number значение mAP@10: 0.0324


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2021.06it/s]


 для модели drop_book_authors_book_original_title_book_series значение mAP@10: 0.0142


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3334.90it/s]


 для модели drop_book_authors_book_original_title_book_number значение mAP@10: 0.0383


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 3337.02it/s]


 для модели drop_book_authors_book_series_book_number значение mAP@10: 0.0312


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2497.35it/s]


 для модели drop_book_original_title_book_series_book_number значение mAP@10: 0.0337


100%|████████████████████████████████████████| 10/10 [00:00<00:00, 2498.39it/s]

 для модели drop_book_authors_book_original_title_book_series_book_number значение mAP@10: 0.0247
лучший результат метрики mAP@10:  0.0459 показала модель: null_cat_features



