In [1]:
from implicit.nearest_neighbours import CosineRecommender
from implicit.als import AlternatingLeastSquares
import pandas as pd
import os

In [2]:
# Load the ratings table. The path is a raw string because it contains a
# backslash ("Courses\datascience_netology"); in a normal string literal "\d"
# is an invalid escape sequence that newer Python versions warn about.
ratings = pd.read_csv(r'C:/Users/adwiz/Documents/Courses\datascience_netology/datasets/ml-data/ratings.csv')

In [3]:
from scipy.sparse import coo_matrix
import numpy as np

# Binary "liked" signal: 1.0 where the rating is at least 4, otherwise 0.0.
liked_flags = ratings['rating'].ge(4).astype(np.float32)

# Sparse matrix indexed by (userId, movieId); the raw ids are used directly
# as row / column indices.
user_item_matrix = coo_matrix(
    (liked_flags, (ratings['userId'], ratings['movieId']))
)
# Drop the explicitly stored 0.0 entries so only the liked pairs remain.
user_item_matrix.eliminate_zeros()

In [4]:
# Split the stored entries of the sparse matrix into a training part (80%)
# and a test part (the remaining 20%).
total_len = user_item_matrix.data.size
train_len = int(total_len * .8)
all_indices = np.arange(total_len)
np.random.seed(42)  # fixed seed so the split is reproducible
train_indices = np.random.choice(all_indices, train_len, replace=False)
# Boolean mask over the stored entries: True -> entry belongs to the training set.
# Because all_indices is simply 0..total_len-1, marking the sampled positions
# directly is equivalent to the membership test previously done with the
# deprecated np.in1d, without its sort-based lookup.
train_mask = np.zeros(total_len, dtype=bool)
train_mask[train_indices] = True

In [5]:
def get_masked(arr, mask):
    """Build a COO matrix containing only the entries of ``arr`` selected by ``mask``.

    Parameters
    ----------
    arr : scipy.sparse.coo_matrix
        Source matrix; its ``data``, ``row`` and ``col`` arrays are indexed
        with ``mask``.
    mask : array-like
        Boolean mask (or index array) over the stored entries of ``arr``.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with the same shape as ``arr`` whose values are float32.
    """
    # Cast in one vectorised step instead of converting element by element;
    # this also keeps the dtype float32 when the selection is empty.
    values = np.asarray(arr.data[mask], dtype=np.float32)
    return coo_matrix(
        (values, (arr.row[mask], arr.col[mask])),
        shape=arr.shape,
    )

In [6]:
# Held-out part of the split: kept both as COO (its row indices are read
# later) and as CSR (for per-row lookups).
test_coo = get_masked(user_item_matrix, ~train_mask)
test_csr = test_coo.tocsr()

# Training part of the split in CSR form, plus its transpose, which is the
# matrix the models are fitted on.
train_csr = get_masked(user_item_matrix, train_mask).tocsr()
train = train_csr.transpose()

In [7]:
# Limit BLAS-internal multithreading before the models are created.
# The library emits:
#   "UserWarning: Intel MKL BLAS detected. Its highly recommend to set the
#    environment variable 'export MKL_NUM_THREADS=1' to disable its internal
#    multithreading"
# so the value must be '1' (the previous '-1' did not satisfy that check).
# NOTE(review): the BLAS library itself may only read these variables when it
# is first loaded - consider setting them before the numpy / implicit imports.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

cosine = CosineRecommender()
als = AlternatingLeastSquares(factors=10, iterations=10)



In [None]:
%%time
# Fit the CosineRecommender on `train` (the transpose of the training CSR matrix).
cosine.fit(train)

  0%|          | 0/283229 [00:00<?, ?it/s]

In [None]:
# Fit the AlternatingLeastSquares model on the same `train` matrix as the cosine model.
als.fit(train)

In [None]:
# Distinct row indices (user ids) that have at least one stored entry in the
# held-out test matrix.
test_user_ids = set(test_coo.row)
users = list(test_user_ids)

# Work with at most the first 10000 of them
# (presumably to keep recommendation time manageable - TODO confirm).
small_users = users[:10000]

def get_recs(users, model, user_items=None, n_recs=50):
    """Collect recommendations from ``model`` for every user in ``users``.

    Parameters
    ----------
    users : iterable
        User ids to produce recommendations for.
    model : object
        Fitted recommender exposing ``recommend(userid=..., user_items=..., N=...)``.
    user_items : sparse matrix, optional
        Interaction matrix passed through to ``model.recommend``. Defaults to
        the notebook-level ``train_csr``.
    n_recs : int, optional
        Number of recommendations requested per user (default 50).

    Returns
    -------
    dict
        Maps each user id to whatever ``model.recommend`` returned for it.
    """
    if user_items is None:
        user_items = train_csr
    # Iterate over the `users` argument; the previous version ignored it and
    # always looped over the global `small_users`.
    return {
        user: model.recommend(userid=user, user_items=user_items, N=n_recs)
        for user in users
    }

In [None]:
def hitrate(k, recs, users, test_items=None):
    """Fraction of ``users`` with at least one test item among their top-``k`` recommendations.

    Parameters
    ----------
    k : int
        Number of leading recommendations considered per user.
    recs : dict
        Maps a user id to a sequence of ``(item, score)`` pairs; an empty
        sequence counts as a miss.
    users : sequence
        User ids to evaluate; every id must be a key of ``recs``.
    test_items : CSR matrix, optional
        Held-out interactions; row ``user`` holds that user's test items as
        column indices. Defaults to the notebook-level ``test_csr``.

    Returns
    -------
    float
        ``hits / len(users)``, or 0.0 when ``users`` is empty.
    """
    if test_items is None:
        test_items = test_csr
    if len(users) == 0:
        # Avoid a ZeroDivisionError when there is nobody to evaluate.
        return 0.0

    hits = 0
    for user in users:
        user_recs = recs[user]
        if not user_recs:
            continue
        rec_items, _ = zip(*user_recs)
        relevant = test_items[user].indices.tolist()
        # A "hit" means the top-k list shares at least one item with the test set.
        if not set(rec_items[:k]).isdisjoint(relevant):
            hits += 1
    return hits / len(users)

In [None]:
%%time
als = get_recs(small_users, als)

In [None]:
%%time
# Print the hit rate over `small_users` using the top 50 entries of `als_recs`.
print(hitrate(50, als_recs, small_users))

In [None]:
%%time
# Collect recommendations from the cosine model for `small_users`.
cosine_recs = get_recs(small_users, cosine)

In [None]:
%%time
# Print the hit rate over `small_users` using the top 50 entries of `cosine_recs`.
print(hitrate(50, cosine_recs, small_users))

In [None]:
from itertools import groupby


def blend_recs(first, second, top_n=50):
    """Merge two recommendation lists by averaging the scores of each item.

    Parameters
    ----------
    first, second : sequence of (item, score) pairs
        Recommendation lists for the same user.
    top_n : int, optional
        Maximum number of blended recommendations to keep (default 50).

    Returns
    -------
    list of (item, mean_score)
        Sorted by descending mean score. An item present in only one list
        keeps its single score; an item present in both gets the mean.
    """
    # groupby only groups *consecutive* equal keys, so sort by item id first.
    merged = sorted(list(first) + list(second), key=lambda pair: pair[0])
    blended = []
    for item, group in groupby(merged, key=lambda pair: pair[0]):
        # The group is a one-shot iterator: materialise it exactly once.
        scores = [score for _, score in group]
        blended.append((item, sum(scores) / len(scores)))
    return sorted(blended, key=lambda pair: -pair[1])[:top_n]


# Recommendations were computed for `small_users` only, so blend over those.
# NOTE(review): cosine and ALS scores may be on different scales; consider
# normalising them before averaging.
new_recs = {
    user: blend_recs(cosine_recs[user], als_recs[user])
    for user in small_users
}

In [None]:
%%time
from pickle import dump

with open(r'C:\Users\adwiz\Documents\Courses\datascience_netology\models\als_recs.pkl', 'wb') as f:
    dump(als_recs, f)
    
with open(r'C:\Users\adwiz\Documents\Courses\datascience_netology\models\cosine_recs.pkl', 'wb') as f:
    dump(cosine_recs, f)

with open(r'C:\Users\adwiz\Documents\Courses\datascience_netology\models\test_csr.pkl', 'wb') as f:
    dump(test_csr, f)
