In [1]:
import pandas as pd

# Load the movie catalogue; its `movieId` and pipe-delimited `genres` columns
# are the ones consumed by the cells below.
movies = pd.read_csv(
    filepath_or_buffer='C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/ml-data/movies.csv'
)

In [2]:
# Build a long-format (genre, movieId) table: one row per genre listed for a movie.
# `str.split` + `explode` replaces the row-by-row `iterrows` loop with vectorized
# pandas operations while producing the same rows in the same order.
movies_genres = (
    movies[['movieId', 'genres']]
    .assign(genre=lambda frame: frame['genres'].str.split('|'))
    .explode('genre')
    .loc[:, ['genre', 'movieId']]
    .reset_index(drop=True)  # explode repeats the source index; restore 0..n-1
)
# Encode each genre as an integer id taken from its pandas category code.
movies_genres['genre_id'] = movies_genres['genre'].astype('category').cat.codes
movies_genres.head()

Unnamed: 0,genre,movieId,genre_id
0,Adventure,1,2
1,Animation,1,3
2,Children,1,4
3,Comedy,1,5
4,Fantasy,1,9


In [3]:
# Load the ratings table; the `userId`, `movieId` and `rating` columns are
# used to build the interaction matrix.
ratings = pd.read_csv(
    filepath_or_buffer='C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/ml-data/ratings.csv'
)

In [4]:
from scipy.sparse import coo_matrix
import numpy as np

# Sparse user x movie matrix of "liked" flags (1.0 where rating >= 4, else 0.0).
# Raw ids are used directly as row/column indices, so each dimension is max id + 1.
user_item_matrix = coo_matrix(
    (
        ratings['rating'].ge(4).astype(np.float32),
        (ratings['userId'], ratings['movieId']),
    ),
    shape=(ratings['userId'].max() + 1, movies['movieId'].max() + 1),
)
# Drop the explicitly stored zeros (ratings below 4) so only the 1.0 entries remain.
user_item_matrix.eliminate_zeros()

In [5]:
# Split the stored entries of the sparse matrix into a training and a test part.
total_len = user_item_matrix.data.size
train_len = int(total_len * .8)  # 80% of the stored entries go to training
all_indices = np.arange(total_len)
np.random.seed(42)  # fixed seed so the sampled split is repeatable
train_indices = np.random.choice(all_indices, train_len, replace=False)
# Boolean membership mask built by direct index assignment. This yields the same
# mask as `np.in1d(all_indices, train_indices)` without relying on `np.in1d`,
# which is deprecated since NumPy 2.0.
train_mask = np.zeros(total_len, dtype=bool)
train_mask[train_indices] = True

In [6]:
def get_masked(arr, mask):
    """Return a new COO matrix holding only the entries of ``arr`` selected by ``mask``.

    Parameters
    ----------
    arr : scipy.sparse.coo_matrix
        Source matrix; its ``data``, ``row`` and ``col`` arrays are indexed with ``mask``.
    mask : array-like of bool
        Selector with one element per stored entry of ``arr``.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with the same shape as ``arr`` and ``float32`` data.
    """
    # Cast with one vectorized call instead of a Python-level loop over every
    # stored value; this also keeps the dtype float32 when the mask selects nothing.
    values = np.asarray(arr.data[mask], dtype=np.float32)
    return coo_matrix((values, (arr.row[mask], arr.col[mask])), shape=arr.shape)

In [7]:
# Materialize both halves of the split: entries selected by the mask form the
# training matrix, the complementary entries form the test matrix.
train, test = (
    get_masked(user_item_matrix, selector)
    for selector in (train_mask, np.logical_not(train_mask))
)

In [8]:
from scipy.sparse import save_npz

# Write both splits to disk in SciPy's .npz sparse-matrix format.
save_npz(
    file='C:/Users/adwiz/Documents/Courses/datascience_netology/other/lightfm_train.npz',
    matrix=train,
)
save_npz(
    file='C:/Users/adwiz/Documents/Courses/datascience_netology/other/lightfm_test.npz',
    matrix=test,
)

In [9]:
from lightfm import LightFM

# Baseline model. The arguments spell out the library defaults that
# `fm.get_params()` reports below (logistic loss, 10 latent components).
fm = LightFM(loss='logistic', no_components=10)



In [10]:
%%time
# Train on the training interactions for 100 epochs with 12 threads,
# showing a per-epoch progress bar.
fm.fit(train, epochs=100, num_threads=12, verbose=True)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [52:54<00:00, 31.75s/it]

CPU times: total: 52min 54s
Wall time: 52min 55s





<lightfm.lightfm.LightFM at 0x26610e3fd90>

In [11]:
# Display the hyperparameters the model is currently configured with.
fm.get_params()

{'loss': 'logistic',
 'learning_schedule': 'adagrad',
 'no_components': 10,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': RandomState(MT19937) at 0x26610DB9840}

In [12]:
%%time
from lightfm.evaluation import reciprocal_rank

# Reciprocal-rank scores on the held-out interactions. The training matrix is
# passed so interactions already seen in training are excluded from the ranking.
rr = reciprocal_rank(fm, test, train_interactions=train, num_threads=12)

CPU times: total: 1h 18min 58s
Wall time: 1h 18min 58s


In [13]:
# Mean of the reciprocal-rank scores computed in the previous cell.
rr.mean()

0.22738948

In [15]:
%%time
# Continue training the already-fitted model for another 100 epochs
# (fit_partial resumes from the current state instead of starting over).
fm.fit_partial(train, epochs=100, num_threads=12, verbose=True)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [52:50<00:00, 31.71s/it]

CPU times: total: 52min 50s
Wall time: 52min 50s





<lightfm.lightfm.LightFM at 0x26610e3fd90>

In [17]:
# Fetch the learned user-side parameters. The result is a 2-tuple of arrays:
# a 1-D array (one value per user row) and a 2-D array (one latent vector per user row).
# presumably (biases, embeddings) — confirm against the LightFM documentation.
user_factors = fm.get_user_representations()
print(user_factors)

(array([0.        , 0.14927335, 0.13578227, ..., 0.        , 0.3737401 ,
       1.0511205 ], dtype=float32), array([[ 0.03066379, -0.02371882,  0.04632885, ...,  0.01933927,
        -0.0109182 ,  0.00576386],
       [-0.05892237,  0.07533388, -0.00120902, ...,  0.0160636 ,
         0.0413982 ,  0.02173831],
       [ 0.01491145,  0.07681484,  0.04461513, ..., -0.05397052,
        -0.01886121,  0.0549685 ],
       ...,
       [ 0.03172434,  0.01704092,  0.00813045, ..., -0.00870013,
        -0.02654032,  0.01924318],
       [-0.1136448 ,  0.17010255,  0.05548843, ..., -0.06930283,
         0.05069719,  0.09762739],
       [-0.14724341,  0.2842345 ,  0.22940426, ..., -0.29796836,
         0.02811358,  0.50113076]], dtype=float32))


In [18]:
# Sanity-check the sizes: length of the 1-D array and shape of the 2-D array.
print(len(user_factors[0]))
print(user_factors[1].shape)

283229
(283229, 10)


In [19]:
# Fetch the learned item-side parameters; like the user side, this is a 2-tuple
# of a 1-D array and a 2-D array of latent vectors.
# presumably (biases, embeddings) — confirm against the LightFM documentation.
item_factors = fm.get_item_representations()
print(item_factors)

(array([0.       , 9.511578 , 8.276478 , ..., 0.       , 0.       ,
       0.5709119], dtype=float32), array([[ 0.0075431 , -0.04835035,  0.01449538, ...,  0.04319308,
        -0.01159711, -0.04510246],
       [-0.5281699 ,  0.85698104,  0.51154953, ..., -0.48383096,
         0.27811605,  1.0483782 ],
       [-0.45236868,  0.8806588 ,  0.36627984, ..., -0.36334985,
         0.28542385,  0.99988455],
       ...,
       [-0.00898536, -0.01118005,  0.02242933, ..., -0.03251198,
         0.04870926, -0.00577138],
       [ 0.01391027, -0.02967813,  0.04648434, ..., -0.00573243,
        -0.01182511, -0.01617796],
       [-0.15064882,  0.22714819, -0.01595487, ..., -0.03417111,
        -0.0020113 ,  0.12847221]], dtype=float32))


In [22]:
from scipy.sparse import identity, hstack  # hstack concatenates matrices horizontally

# Item feature matrix: one indicator column per genre, followed by an identity
# block that gives every item column index its own unique feature.
item_feature_matrix = hstack([
    coo_matrix(
        # One 1.0 per (movie, genre) row. `len(...)` replaces `count()[0]`, which
        # relied on deprecated positional indexing of a label-indexed Series.
        (np.ones(len(movies_genres), dtype=np.float32),
         (movies_genres['movieId'], movies_genres['genre_id'])),
        shape=(user_item_matrix.shape[1], movies_genres['genre_id'].unique().size)
    ),
    identity(user_item_matrix.shape[1])
])
item_feature_matrix.shape

(193887, 193907)

In [24]:
%%time
# Fit again with `fit` (not `fit_partial`), this time supplying the
# genre + identity item feature matrix built above.
fm.fit(
    train,
    item_features=item_feature_matrix,
    epochs=100,
    num_threads=12,
    verbose=True,
)

Epoch: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [1:05:08<00:00, 39.08s/it]

CPU times: total: 1h 5min 8s
Wall time: 1h 5min 8s





<lightfm.lightfm.LightFM at 0x26610e3fd90>

In [25]:
# Item-side parameters of the model trained with item features. With no feature
# matrix passed here, the arrays have one row per feature column (see the sizes
# printed in the next cell) rather than one row per item.
new_item_factors = fm.get_item_representations()
print(new_item_factors)

(array([7.818739e+00, 8.747635e+00, 6.981470e+00, ..., 0.000000e+00,
       0.000000e+00, 4.172325e-08], dtype=float32), array([[ 0.00799013,  0.11062282,  0.03665303, ...,  0.10267024,
         0.07506839,  0.19684379],
       [ 0.02109262,  0.01427401,  0.00129043, ..., -0.0263094 ,
         0.05165055,  0.05366145],
       [-0.03750359,  0.02375157,  0.04617911, ..., -0.01067868,
         0.02542773, -0.01425616],
       ...,
       [ 0.03673305,  0.04718608,  0.0454399 , ..., -0.04488411,
         0.00085725,  0.03290394],
       [-0.01459847,  0.01758425, -0.01163472, ..., -0.00642719,
        -0.0169201 ,  0.01033426],
       [ 0.0042342 ,  0.04146232, -0.02030915, ...,  0.01934129,
         0.04723404, -0.04406389]], dtype=float32))


In [26]:
# Sizes of the two returned arrays; the row count matches the number of columns
# in item_feature_matrix.
print(len(new_item_factors[0]))
print(new_item_factors[1].shape)

193907
(193907, 10)


In [27]:
# Evaluate the feature-based model; the same item feature matrix used for
# training has to be supplied at evaluation time.
rr = reciprocal_rank(
    fm,
    test,
    train_interactions=train,
    item_features=item_feature_matrix,
    num_threads=12,
)

In [28]:
# Mean reciprocal rank of the model trained with item features.
# NOTE(review): the recorded value is far below the score of the model trained
# without item features earlier in the notebook — worth investigating.
rr.mean()

0.0029796464

In [None]:
from lightfm import LightFM

# Replace the previous model with a fresh one that uses the 'bpr' loss.
fm = LightFM(loss='bpr')

In [None]:
%%time
# Train the BPR model on the training interactions: 100 epochs, 12 threads.
fm.fit(train, epochs=100, num_threads=12)

In [None]:
# Display the hyperparameters of the BPR model.
fm.get_params()

In [None]:
%%time
from lightfm.evaluation import reciprocal_rank

# Reciprocal-rank scores of the BPR model on the held-out interactions,
# with the training interactions supplied alongside.
rr = reciprocal_rank(fm, test, train_interactions=train, num_threads=12)

In [None]:
# Mean of the reciprocal-rank scores for the BPR model.
rr.mean()

In [None]:
# Fresh model with the 'bpr' loss.
# NOTE(review): this repeats the BPR experiment from the preceding cells with
# identical settings — confirm whether the duplicate is intentional.
fm = LightFM(loss='bpr')

In [None]:
%%time
# Fit the model on the training split for 100 epochs using 12 threads.
fm.fit(train, epochs=100, num_threads=12)

In [None]:
# Display the hyperparameters of the current model.
fm.get_params()

In [None]:
# Score the current model on the test split; the training split is passed too.
rr = reciprocal_rank(
    fm,
    test,
    train_interactions=train,
    num_threads=12,
)

In [None]:
# Mean of the reciprocal-rank scores from the previous cell.
rr.mean()

In [None]:
# Fresh model trained with the 'warp' loss. The stray leading '/' and trailing
# ',ю' were removed: they made the cell a syntax error.
fm = LightFM(loss='warp')

In [None]:
%%time
# Train the WARP model: 100 passes over the training interactions, 12 threads.
fm.fit(train, epochs=100, num_threads=12)

In [None]:
# Display the hyperparameters of the WARP model.
fm.get_params()

In [None]:
# Reciprocal-rank evaluation of the WARP model against the test interactions.
rr = reciprocal_rank(fm, test, train_interactions=train, num_threads=12)

In [None]:
# Mean of the reciprocal-rank scores for the WARP model.
rr.mean()

In [None]:
# Fresh model with the 'warp-kos' loss. k=5 and n=10 are passed explicitly; they
# equal the default values reported by get_params() earlier in the notebook.
fm = LightFM(loss='warp-kos', k=5, n=10)

In [None]:
%%time
# Train the warp-kos model on the training interactions (100 epochs, 12 threads).
fm.fit(train, epochs=100, num_threads=12)

In [None]:
# Display the hyperparameters of the warp-kos model.
fm.get_params()

In [None]:
# Reciprocal-rank evaluation of the warp-kos model on the held-out interactions.
rr = reciprocal_rank(
    fm,
    test,
    train_interactions=train,
    num_threads=12,
)

In [None]:
# Mean of the reciprocal-rank scores for the warp-kos model.
rr.mean()