# 基于 Surprise 的音乐推荐系统

learn:

 - https://zhuanlan.zhihu.com/p/56834797
 - https://surprise.readthedocs.io/en/stable/getting_started.html


In [18]:
DB = "/Volumes/shared/murecom/intro/spotify/playlists.db"
DATA_COUNT = 10000  # 取多少数据出来用

import sqlite3
import pandas as pd

conn = sqlite3.connect(DB)

pt = pd.read_sql(f"SELECT * FROM playlist_tracks LIMIT {DATA_COUNT}", conn)
pt

Unnamed: 0,playlist_id,track_id
0,37i9dQZF1DWZUozJiHy44Y,0gplL1WMoJ6iYaPgMCL0gX
1,37i9dQZF1DWZUozJiHy44Y,3Kkjo3cT83cw09VJyrLNwX
2,37i9dQZF1DWZUozJiHy44Y,6v0UJD4a2FtleHeSYVX02A
3,37i9dQZF1DWZUozJiHy44Y,6w8ZPYdnGajyfPddTWdthN
4,37i9dQZF1DWZUozJiHy44Y,10ImcQk9tihY1EKMDIbvXJ
...,...,...
9995,4DhjdVg3725DXyAtXYi7KB,18mmN3VrFWRi6SsSBJf6WJ
9996,4DhjdVg3725DXyAtXYi7KB,0YUiI4zdalScQmDUahywEg
9997,4DhjdVg3725DXyAtXYi7KB,23CfGZgeDJkBxuObB6KmmQ
9998,4DhjdVg3725DXyAtXYi7KB,2oAumpCOVTxRtzL3r7LIxJ


搞成 Surprise 能用的数据样式：

> user 对 item 评分 (rating)

In [19]:
# 播放列表 as 用户
# 曲目 as 电影
pt = pt.rename(columns={"playlist_id": "userID", "track_id": "itemID"})
# 歌在播放列表里，就是用户给歌打了一分
pt = pt.join(pd.Series([1] * len(pt), name="rating"))

pt

Unnamed: 0,userID,itemID,rating
0,37i9dQZF1DWZUozJiHy44Y,0gplL1WMoJ6iYaPgMCL0gX,1
1,37i9dQZF1DWZUozJiHy44Y,3Kkjo3cT83cw09VJyrLNwX,1
2,37i9dQZF1DWZUozJiHy44Y,6v0UJD4a2FtleHeSYVX02A,1
3,37i9dQZF1DWZUozJiHy44Y,6w8ZPYdnGajyfPddTWdthN,1
4,37i9dQZF1DWZUozJiHy44Y,10ImcQk9tihY1EKMDIbvXJ,1
...,...,...,...
9995,4DhjdVg3725DXyAtXYi7KB,18mmN3VrFWRi6SsSBJf6WJ,1
9996,4DhjdVg3725DXyAtXYi7KB,0YUiI4zdalScQmDUahywEg,1
9997,4DhjdVg3725DXyAtXYi7KB,23CfGZgeDJkBxuObB6KmmQ,1
9998,4DhjdVg3725DXyAtXYi7KB,2oAumpCOVTxRtzL3r7LIxJ,1


分一下训练集和测试集：

(TODO: 其实不用分，后面不用这个，而是用 surprise.Trainset.build_testset 和 build_anti_testset)

In [24]:
# 洗牌
pt = pt.sample(frac=1)

# 分 10% 出来做测试集
_test_data_rate = 0.1
train_end_idx = int(len(pt) * (1 - _test_data_rate))

pt_train = pt[:train_end_idx].reset_index(drop=True)
pt_test = pt[train_end_idx:].reset_index(drop=True)

print(f"{pt_train.shape=}\n{pt_test.shape=}")

pt_train.shape=(9000, 3)
pt_test.shape=(1000, 3)


在训练集上训练，针对歌单（user）推荐：

In [65]:
# https://surprise.readthedocs.io/en/stable/getting_started.html#use-a-custom-dataset

from surprise import KNNBaseline
from surprise import Reader, Dataset

# custom dataset
reader = Reader(rating_scale=(0, 1))
train_data = Dataset.load_from_df(
    pt_train[['userID', 'itemID', 'rating']],
    reader)
trainset = train_data.build_full_trainset()

# 算法、训练
algo = KNNBaseline()
algo.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x126aec1f0>

在测试集上测试：

In [66]:
from surprise import Trainset

reader = Reader(rating_scale=(0, 1.0))
test_data = Dataset.load_from_df(
    pt_test[['userID', 'itemID', 'rating']],
    reader)

In [67]:
testset = trainset.build_testset()
algo.test(testset, verbose=False)

[Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='26xQbFpVg8JCT5NKQFJXg8', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='2YekzGSNDRlEvRLmvfRZfb', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='5teILTGkJkSJXI03XeMIj2', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='3uUvQGiUZMj8WzG5Vo1MCz', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='3peoAySRE6rnMfKLdSYfpp', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='0k1wBT6Dy6gWHLgHZ7I3dG', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='4td9uuqgQFAzJ3qoJS6Ozx', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='37i

In [71]:
testset = trainset.build_anti_testset()
algo.test(testset, verbose=False)

[Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='6pKADEM7gx8EZ6UypMUzyd', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='2V68QXKzrgPDYOqMfwbp2X', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='1OkW0tv1k3kNB0IXDN2oPL', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='64yajM6CxtLghmgB53VeXT', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='0NJNQdvwxZTylydofoF76s', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='7cVNpJG6phyW7jF8GOswiY', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i9dQZF1DXdDoYRQ4LfQJ', iid='2llAgFuIcyxAacceqb1ZRo', r_ui=0.5, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='37i

In [79]:
pt_train.sample(1)

Unnamed: 0,userID,itemID,rating
4253,37i9dQZF1DX4LMp7Dggc2V,4NK1i0tWDdnuu93ijPsBGY,1


In [81]:
algo.get_neighbors(iid=algo.trainset.to_inner_uid("4NK1i0tWDdnuu93ijPsBGY"), k=5)

ValueError: User 4NK1i0tWDdnuu93ijPsBGY is not part of the trainset.