# 67373视频推荐系统

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import os

NUM_RECOMM_VIDEOS = 12 # Number of recommended videos to fetch per query

def save_model(fname, model):
    """Pickle ``model`` into the local ``models`` directory and report the path.

    The ``models`` directory is created on demand. An existing file with the
    same name is overwritten.

    Args:
        fname: Base name of the output file, without the ``.pkl`` suffix.
        model: Any picklable object.
    """
    path = f'models/{fname}.pkl'
    os.makedirs('models', exist_ok=True)
    with open(path, 'wb') as sink:
        sink.write(pickle.dumps(model))
    print(f'Model [{fname}] has been saved to path: {path}')

In [2]:
# Load the data: one comma-separated record per line of data/data.txt.
data = []
# The encoding is given explicitly because the titles are non-ASCII text and
# the platform default encoding is not UTF-8 everywhere.
# NOTE(review): assumes data/data.txt is stored as UTF-8 -- confirm.
with open('data/data.txt', encoding='utf-8') as f:
    for line in f.read().splitlines():
        if not line.strip():
            # A blank line (e.g. trailing newline noise) cannot be unpacked
            # into six fields, so skip it instead of crashing.
            continue
        # Each record must have exactly six fields; a field that itself
        # contains a comma raises ValueError here.
        [title, cate_name, cate_id, duration, creation_time, video_id] = line.split(',')
        data.append({
            'title': title,
            # Drops the first three characters of the category field.
            # NOTE(review): presumably a fixed prefix in the raw data -- confirm.
            'cate_name': cate_name[3:],
            'cate_id': cate_id,
            'duration': duration,
            'creation_time': creation_time,
            'video_id': video_id,
        })

print(f'Total: {len(data)}')

Total: 1072


In [3]:
# For now only title and cate_name are used: each document is the title
# immediately followed by the category name, with no separator.
corpus = [video['title'] + video['cate_name'] for video in data]

# Vectorization
def tokenizer(s):
    """Split ``s`` into a list of its individual characters."""
    return [ch for ch in s]

# TF-IDF over the tokens produced by `tokenizer`. token_pattern=None because a
# custom tokenizer is supplied: the default pattern would be ignored anyway and
# scikit-learn warns about the unused parameter.
# NOTE(review): pickle stores the tokenizer function by reference, so whatever
# loads models/vectorizer.pkl must be able to resolve a `tokenizer` function
# under the same module path -- confirm at the load site.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
X = vectorizer.fit_transform(corpus)

# +1 because the neighbours returned for an indexed video include the video itself.
knn = NearestNeighbors(n_neighbors=NUM_RECOMM_VIDEOS+1)
knn.fit(X)

save_model('vectorizer', vectorizer)
save_model('knn', knn)

Model [vectorizer] has been saved to path: models/vectorizer.pkl
Model [knn] has been saved to path: models/knn.pkl


In [4]:
# Smoke test: print the nearest neighbours of a single title.
# NOTE(review): knn is built without an explicit metric, so `dist` is presumably
# scikit-learn's default (Euclidean) distance; `1 - dist` is then only a rough
# closeness score, not a cosine similarity -- confirm before relying on it.
query = vectorizer.transform(['[2022-03-02] 《艾尔登法环》（第一期）'])
dist, nbrs = knn.kneighbors(query)

for idx, d in zip(nbrs[0], dist[0]):
    title = data[idx]['title']
    score = 1 - d
    print(f'({title}, {score:.3f})')

([2022-03-02] 《艾尔登法环》（第一期）, 0.747)
([2022-03-02] 《艾尔登法环》（第二期）, 0.650)
([2022-03-30] 《艾尔登法环》（第六期）, 0.574)
([2022-03-09] 《艾尔登法环》（第三期）, 0.562)
([2022-04-06] 《艾尔登法环》（第七期）, 0.464)
([2022-03-23] 《艾尔登法环》（第五期）高端女玩家, 0.397)
([2022-04-13] 《艾尔登法环》（第八期）高端女玩家, 0.322)
([2022-03-16] 《艾尔登法环》（第四期）平民女玩家vs碎星？, 0.234)
([2022-03-05] HI, 0.180)
([2022-04-20] 《艾尔登法环》（第九期）到底我先通关老头环还是小区先解封？, 0.165)
([2021-06-23] 《生化危机：村庄》（第一期）, 0.159)
([2022-05-04] 《艾尔登法环》（第十一期 · 上）女武神vs女武神好伐, 0.154)
([2022-05-04] 《艾尔登法环》（第十一期 · 下）女武神vs女武神好伐, 0.153)


In [5]:
# Write the recommendations for the first few videos to a local file.
NUM_TEST_SAMPLES = 20  # how many leading entries of `data` are used as test queries

# The encoding is given explicitly because the report contains non-ASCII text
# and the platform default encoding cannot always represent it.
with open('test_results.txt', 'w', encoding='utf-8') as f:
    # Slicing (instead of indexing 0..19) keeps this working when `data`
    # holds fewer than NUM_TEST_SAMPLES entries.
    for video in data[:NUM_TEST_SAMPLES]:
        title     = video['title']
        cate_name = video['cate_name']
        video_id  = video['video_id']
        f.write(f'测试数据: {title}\n')
        # Build the query text the same way the training corpus was built.
        query = vectorizer.transform([''.join([title, cate_name])])
        nbrs = knn.kneighbors(query, return_distance=False)
        recomms = [(data[idx]['title'], data[idx]['video_id']) for idx in nbrs[0]]
        # Filter the current test video out of its own recommendations (by video_id),
        # then cap the list at NUM_RECOMM_VIDEOS entries.
        recomms = [rec for rec in recomms if rec[1] != video_id]
        recomms = recomms[:NUM_RECOMM_VIDEOS]
        f.write('测试结果:\n')
        # The generator keeps the throwaway `_` out of the notebook namespace,
        # where it would shadow IPython's last-output variable.
        f.writelines(f'{rec_title}\n' for rec_title, _ in recomms)
        f.write('\n\n')