In [1]:
import pandas as pd

# Load the movie catalogue from the local CSV file.
movies_csv_path = 'C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/ml-data/movies.csv'
movies = pd.read_csv(movies_csv_path)

In [2]:
# Build a long-format (genre, movieId) table: one row per genre of each movie.
# `genres` holds '|'-separated labels, so split + explode replaces the
# row-by-row Python loop with vectorized pandas operations.
movies_genres = (
    movies[['genres', 'movieId']]
    .assign(genre=lambda frame: frame['genres'].str.split('|'))
    .explode('genre')
    .loc[:, ['genre', 'movieId']]
    .reset_index(drop=True)  # explode repeats the source index; restore 0..n-1
)
# Integer code per distinct genre, taken from the pandas categorical codes.
movies_genres['genre_id'] = movies_genres['genre'].astype('category').cat.codes
movies_genres.head()

Unnamed: 0,genre,movieId,genre_id
0,Adventure,1,2
1,Animation,1,3
2,Children,1,4
3,Comedy,1,5
4,Fantasy,1,9


In [3]:
# Load the user ratings from the local CSV file.
ratings_csv_path = 'C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/ml-data/ratings.csv'
ratings = pd.read_csv(ratings_csv_path)

In [4]:
from scipy.sparse import coo_matrix
import numpy as np

# Binary feedback signal: 1.0 where the rating is at least 4, otherwise 0.0.
liked = (ratings['rating'] >= 4).astype(np.float32)

# Raw ids are used directly as row/column indices, so each dimension is max id + 1.
n_user_rows = ratings['userId'].max() + 1
n_item_cols = movies['movieId'].max() + 1

user_item_matrix = coo_matrix(
    (liked, (ratings['userId'], ratings['movieId'])),
    shape=(n_user_rows, n_item_cols),
)
# Drop the explicitly stored zeros so only the positive interactions remain.
user_item_matrix.eliminate_zeros()

In [5]:
# Split the stored entries of the sparse matrix into train (80%) and test (20%).
total_len = user_item_matrix.data.size
train_len = int(total_len * .8)
all_indices = np.arange(total_len)
np.random.seed(42)  # fixed seed so the split is reproducible
train_indices = np.random.choice(all_indices, train_len, replace=False)
# Boolean mask over the stored entries: True -> train, False -> test.
# Direct assignment yields the same mask as np.in1d(all_indices, train_indices)
# without relying on np.in1d, which is deprecated as of NumPy 2.0.
train_mask = np.zeros(total_len, dtype=bool)
train_mask[train_indices] = True

In [6]:
def get_masked(arr, mask):
    """Return a new COO matrix holding only the stored entries selected by ``mask``.

    Parameters
    ----------
    arr : scipy sparse matrix
        Source matrix. It is converted with ``tocoo()`` so that the
        ``data`` / ``row`` / ``col`` arrays are available for any sparse format.
    mask : array-like
        Selector applied to the stored entries (typically a boolean array
        with one element per stored entry).

    Returns
    -------
    scipy.sparse.coo_matrix
        float32 matrix with the same shape as ``arr`` containing only the
        selected entries.
    """
    coo = arr.tocoo()
    return coo_matrix(
        (
            # Vectorized cast instead of converting element by element in Python.
            coo.data[mask].astype(np.float32),
            (coo.row[mask], coo.col[mask]),
        ),
        shape=coo.shape,
    )

In [7]:
# Materialize the two splits: entries flagged in train_mask go to train, the rest to test.
test_mask = ~train_mask
train = get_masked(user_item_matrix, train_mask)
test = get_masked(user_item_matrix, test_mask)

In [8]:
from scipy.sparse import save_npz

# Persist both splits to disk with scipy's save_npz.
for npz_path, split_matrix in (
    ('C:/Users/adwiz/Documents/Courses/datascience_netology/other/lightfm_train.npz', train),
    ('C:/Users/adwiz/Documents/Courses/datascience_netology/other/lightfm_test.npz', test),
):
    save_npz(npz_path, split_matrix)

In [9]:
from lightfm import LightFM

# Instantiate LightFM with its default hyper-parameters (no arguments are passed).
# NOTE(review): the interaction matrix built earlier keeps only positive entries
# (rating >= 4, zeros eliminated); the default loss may not be the best choice for
# positive-only data — consider 'warp' or 'bpr' after checking the LightFM docs.
# NOTE(review): no random_state is given, so repeated runs may differ — confirm whether that matters.
fm = LightFM()



In [10]:
%%time
# Train the model on the training split for 100 epochs.
# NOTE(review): the recorded run reports CPU time roughly equal to wall time (~53 min)
# despite num_threads=12, which suggests training ran on a single thread —
# TODO confirm that multithreading is actually available in this LightFM build.
fm.fit(
    interactions=train,
    epochs=100,
    num_threads=12,
    verbose=True
)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [52:54<00:00, 31.75s/it]

CPU times: total: 52min 54s
Wall time: 52min 55s





<lightfm.lightfm.LightFM at 0x26610e3fd90>

In [11]:
# Display the hyper-parameters the model is configured with.
fm.get_params()

{'loss': 'logistic',
 'learning_schedule': 'adagrad',
 'no_components': 10,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': RandomState(MT19937) at 0x26610DB9840}

In [12]:
%%time
from lightfm.evaluation import reciprocal_rank

# Compute the reciprocal-rank scores of the trained model on the test split.
# train_interactions is passed alongside the test set; presumably LightFM uses it to
# exclude already-seen training items from the ranking — verify against the
# lightfm.evaluation docs.
rr = reciprocal_rank(
    model=fm,
    test_interactions=test,
    train_interactions=train,
    num_threads=12
)

CPU times: total: 1h 18min 58s
Wall time: 1h 18min 58s


In [13]:
# Average the reciprocal-rank scores into a single number.
rr.mean()

0.22738948

In [None]:
%%time
# Run another 100 epochs on the same training split via fit_partial.
# NOTE(review): presumably this resumes from the already-trained state instead of
# re-initialising the model — confirm against the LightFM docs.
# The saved output for this cell only shows progress up to 1/100 epochs.
fm.fit_partial(
    interactions=train,
    epochs=100,
    num_threads=12,
    verbose=True
)

Epoch:   1%|▊                                                                          | 1/100 [00:32<53:20, 32.33s/it]

In [None]:
# Fetch the learned user-side representations and print them.
# The result must be bound to `user_factors`, the name that is printed here and
# indexed in the following cell (it was previously assigned to a misspelled name).
user_factors = fm.get_user_representations()
print(user_factors)

In [None]:
# Unpack the two components of the user representations and report their sizes.
user_biases, user_embeddings = user_factors
print(len(user_biases))
print(user_embeddings.shape)

In [None]:
# Fetch the learned item-side representations and print them.
# NOTE(review): presumably a (biases, embeddings) pair — confirm against the LightFM docs.
item_factors = fm.get_item_representations()
print(item_factors)

In [None]:
from scipy.sparse import identity, hstack  # hstack concatenates matrices horizontally

# Item feature matrix = [genre indicator columns | one identity column per item].
# Rows are indexed by raw movieId, matching the columns of user_item_matrix.
genre_indicators = coo_matrix(
    (
        # One 1.0 entry per (movie, genre) pair.
        np.ones(len(movies_genres), dtype=np.float32),
        (movies_genres['movieId'], movies_genres['genre_id']),
    ),
    shape=(user_item_matrix.shape[1], movies_genres['genre_id'].nunique()),
)
item_feature_matrix = hstack([
    genre_indicators,
    # float32 identity keeps the stacked matrix in float32 instead of upcasting to float64.
    identity(user_item_matrix.shape[1], dtype=np.float32),
])
item_feature_matrix.shape

In [None]:
%%time
# Train on the training split again, this time supplying the item feature matrix
# (genre indicators + per-item identity) built above.
# NOTE(review): presumably fit() starts from a fresh model state rather than
# continuing the earlier training — confirm against the LightFM docs.
fm.fit(
    interactions=train,
    epochs=100,
    item_features=item_feature_matrix,
    num_threads=12,
    verbose=True
)

In [None]:
# Fetch the item-side representations from the current model and print them.
# NOTE(review): called without features here; presumably a (biases, embeddings) pair —
# confirm against the LightFM docs.
new_item_factors = fm.get_item_representations()
print(new_item_factors)

In [None]:
# Unpack the two components of the item representations and report their sizes.
new_item_biases, new_item_embeddings = new_item_factors
print(len(new_item_biases))
print(new_item_embeddings.shape)

In [None]:
# Compute the reciprocal-rank scores on the test split for the model trained with
# item features; the same item feature matrix is passed to the evaluation call.
# Note: this rebinds `rr`, replacing the scores of the earlier evaluation.
rr = reciprocal_rank(
    model=fm,
    test_interactions=test,
    train_interactions=train,
    item_features=item_feature_matrix,
    num_threads=12
)

In [None]:
# Average the reciprocal-rank scores of the feature-based model into a single number.
rr.mean()