In [3]:
# 基于Surprise框架的推荐系统Demo
from surprise import KNNBasic
from surprise import Dataset
from surprise import evaluate, print_perf

# http://surprise.readthedocs.io/en/stable/index.html
# http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
# We'll use the famous KNNBasic algorithm.
knn = KNNBasic()
# Evaluate performances of our algorithm on the dataset.
perf = evaluate(knn, data, measures=['RMSE', 'MAE'])
print_perf(perf)



Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.


RMSE: 0.9889
MAE:  0.7803
------------
Fold 2
Computing the msd similarity matrix...


Done computing similarity matrix.


RMSE: 0.9908
MAE:  0.7832
------------
Fold 3


Computing the msd similarity matrix...
Done computing similarity matrix.


RMSE: 0.9878
MAE:  0.7816
------------
------------
Mean RMSE: 0.9892
Mean MAE : 0.7817
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
MAE     0.7803  0.7832  0.7816  0.7817  
RMSE    0.9889  0.9908  0.9878  0.9892  


In [5]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

# 所需参数候选值：epoch、学习率和正则化参数
# 共组合2*2*2=8种参数组合
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# 将各个参数组合的结果列表展示
import pandas as pd

results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df


0.9635950266798726
{'lr_all': 0.005, 'n_epochs': 10, 'reg_all': 0.4}


  return f(*args, **kwds)


  return f(*args, **kwds)


Unnamed: 0,mean_fit_time,mean_test_mae,mean_test_rmse,mean_test_time,param_lr_all,param_n_epochs,param_reg_all,params,rank_test_mae,rank_test_rmse,split0_test_mae,split0_test_rmse,split1_test_mae,split1_test_rmse,split2_test_mae,split2_test_rmse,std_fit_time,std_test_mae,std_test_rmse,std_test_time
0,0.99856,0.806102,0.997322,0.315428,0.002,5,0.4,"{'lr_all': 0.002, 'n_epochs': 5, 'reg_all': 0.4}",7,7,0.796968,0.988911,0.810063,1.002999,0.811276,1.000056,0.036593,0.006478,0.006067,0.061769
1,0.947777,0.814693,1.003356,0.301667,0.002,5,0.6,"{'lr_all': 0.002, 'n_epochs': 5, 'reg_all': 0.6}",8,8,0.805437,0.994681,0.818581,1.009099,0.820061,1.006288,0.016386,0.006573,0.00624,0.050943
2,2.018068,0.785961,0.977996,0.3089,0.002,10,0.4,"{'lr_all': 0.002, 'n_epochs': 10, 'reg_all': 0.4}",4,4,0.776766,0.969648,0.789723,0.983718,0.791393,0.980622,0.158187,0.006537,0.006037,0.004565
3,1.876242,0.796476,0.986164,0.277767,0.002,10,0.6,"{'lr_all': 0.002, 'n_epochs': 10, 'reg_all': 0.6}",6,6,0.787202,0.977569,0.800398,0.992143,0.801829,0.988781,0.036667,0.006584,0.006231,0.026832
4,1.052878,0.78178,0.97362,0.318643,0.005,5,0.4,"{'lr_all': 0.005, 'n_epochs': 5, 'reg_all': 0.4}",2,3,0.773064,0.965483,0.785179,0.979327,0.787096,0.976052,0.151448,0.006212,0.005908,0.064252
5,0.984598,0.792693,0.982333,0.289444,0.005,5,0.6,"{'lr_all': 0.005, 'n_epochs': 5, 'reg_all': 0.6}",5,5,0.78382,0.973903,0.796388,0.988327,0.797871,0.984769,0.007856,0.006303,0.006135,0.034606
6,2.050858,0.77235,0.963595,0.292109,0.005,10,0.4,"{'lr_all': 0.005, 'n_epochs': 10, 'reg_all': 0.4}",1,1,0.764088,0.955937,0.775624,0.969407,0.777338,0.965441,0.153612,0.005884,0.005652,0.045053
7,1.902498,0.784173,0.973518,0.275965,0.005,10,0.6,"{'lr_all': 0.005, 'n_epochs': 10, 'reg_all': 0.6}",3,2,0.775739,0.965547,0.787623,0.97947,0.789156,0.975536,0.026279,0.005996,0.00586,0.033275


In [8]:
def read_item_names():
    """
    将电影名字和ID进行映射
    :return:
    """
    import io

    file_name = ('/Users/wugang/code/python/rec-demo/data/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid

# test read_item_names
rid_to_name, name_to_rid = read_item_names()
toy_story_raw_id = name_to_rid['Now and Then (1995)']
print(toy_story_raw_id)
raw_id_toy_story = rid_to_name['1053']
print(raw_id_toy_story)


1053
Now and Then (1995)


In [10]:

# 推荐Demo
train_data = data.build_full_trainset()
# 皮尔逊系数计算相似度，基于物品的推荐
sim_options = {'name': 'pearson_baseline', 'user_based': False}
from surprise import KNNBaseline

knn = KNNBaseline(sim_options=sim_options)
knn.fit(train_data)

rid_to_name, name_to_rid = read_item_names()
toy_story_raw_id = name_to_rid['Now and Then (1995)']
print("数据中的ID：%s" % toy_story_raw_id)

# toy_story_inner_id在实际要计算的矩阵中的ID。
toy_story_inner_id = knn.trainset.to_inner_iid(toy_story_raw_id)
print("矩阵中的ID：%s" % toy_story_inner_id)
toy_story_neighbors = knn.get_neighbors(toy_story_inner_id, k=10)
print("最近邻：%s" % toy_story_neighbors)

# 对近邻集合中对ID进行转换为名字
toy_story_neighbors = (knn.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)
print()
print('推荐的Top 10 :')
for movie in toy_story_neighbors:
    print(movie)


Estimating biases using als...


Computing the pearson_baseline similarity matrix...


Done computing similarity matrix.
数据中的ID：1053
矩阵中的ID：564
最近邻：[518, 425, 90, 386, 476, 52, 569, 327, 291, 333]

推荐的Top 10 :
While You Were Sleeping (1995)
Batman (1989)
Dave (1993)
Mrs. Doubtfire (1993)
Groundhog Day (1993)
Raiders of the Lost Ark (1981)
Maverick (1994)
French Kiss (1995)
Stand by Me (1986)
Net, The (1995)
