In [1]:
# 参考链接 https://blog.csdn.net/qq_40006058/article/details/85345058
# 推荐算法理解 参考链接 http://tieba.baidu.com/p/5494769518?traceid=

In [54]:
import random
import os
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, evaluate, print_perf, model_selection, Reader
import surprise

In [17]:
random.seed(66)

In [22]:
data = Dataset.load_builtin('ml-100k')

In [25]:
# SVD矩阵分解
algo = surprise.SVD()
# pref = surprise.evaluate(algo, data, measures=['rmse', 'mae'])
# 3折交叉验证训练
pref = model_selection.cross_validate(algo, data, measures=['rmse', 'mae'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9460  0.9475  0.9397  0.9444  0.0034  
MAE (testset)     0.7452  0.7466  0.7432  0.7450  0.0014  
Fit time          3.64    3.75    3.70    3.70    0.04    
Test time         0.30    0.29    0.28    0.29    0.01    


In [43]:
### 划分训练集合测试集
data = Dataset.load_builtin('ml-100k')
train_data, test_data = model_selection.train_test_split(data, test_size=0.3, random_state=0)

In [44]:
algo = SVD()
algo.fit(train_data)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x119fbb588>

In [46]:
pred = algo.test(test_data)
surprise.accuracy.rmse(pred)

RMSE: 0.9527


0.9527068145147576

### Train on a whole trainset and the predict() method


In [47]:
data = Dataset.load_builtin('ml-100k')
train_data = data.build_full_trainset()

# 建立模型 训练
algo = surprise.KNNBasic()
# 训练模型
algo.fit(train_data)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x118e0b438>

In [49]:
uid = str(196)  # 原始用户id
iid = str(302)  # 原始物品ID
# 预测用户(uid) 对电影(iid)的评分 r_ui 真实得分
algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


Prediction(uid='196', iid='302', r_ui=4, est=4.06292421377939, details={'actual_k': 40, 'was_impossible': False})

### 加载自己的数据集

In [28]:
# 指定文件所在路径
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# 告诉文本阅读器，文本的格式是怎么样的
reader = Reader(line_format='user item rating timestamp', sep='\t')
# 加载数据
data = Dataset.load_from_file(file_path, reader=reader)


In [51]:
train_data, test_data = model_selection.train_test_split(data, test_size=0.3, random_state=0)
# 给定用户和Item，给出基于baseline的估计值
algo = surprise.BaselineOnly()
algo.fit(train_data)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x11b7a2278>

In [52]:
algo.predict(196, 302, 4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 3.53   {'was_impossible': False}


Prediction(uid=196, iid=302, r_ui=4, est=3.527671428571429, details={'was_impossible': False})

### 加载DataFrame格式的数据

In [56]:
# 制造数据
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)
df

Unnamed: 0,itemID,userID,rating
0,1,9,3
1,1,32,2
2,1,2,4
3,2,45,3
4,2,user_foo,1


In [59]:
reader = surprise.Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x11af61390>

In [60]:
# 根据训练集的分布特征随机给出一个预测值
model_selection.cross_validate(surprise.NormalPredictor(), data, measures=['rmse', 'mae'], cv=3)

{'test_rmse': array([0.90717444, 2.21240707, 3.        ]),
 'test_mae': array([0.81337504, 2.20379093, 3.        ]),
 'fit_time': (0.00016617774963378906,
  8.296966552734375e-05,
  5.5789947509765625e-05),
 'test_time': (0.00010800361633300781,
  6.461143493652344e-05,
  3.1948089599609375e-05)}

### K折交叉验证

In [63]:
# 加载数据
data = Dataset.load_builtin('ml-100k')
kf = model_selection.KFold(n_splits=3)
algo = surprise.SVDpp()

for train_data, test_data in kf.split(data):
    # 训练 预测
    algo.fit(train_data)
    pred = algo.test(test_data)
    
    surprise.accuracy.rmse(pred, verbose=True)

RMSE: 0.9372
RMSE: 0.9240
RMSE: 0.9199


### PredefinedKFold 加载多个文件目录, eg: [train0, test0, train1, test1,...]

In [64]:
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
reader = surprise.Reader('ml-100k')

train_files = files_dir + 'u%d.base'
test_files = files_dir + 'u%d.test'

# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
fold_files = [(train_files % i, test_files % i) for i in range(1, 6)]

In [66]:
data = Dataset.load_from_folds(fold_files, reader)
pkf = model_selection.PredefinedKFold()

In [67]:
algo = surprise.SVD()
for train_data, test_data in pkf.split(data):
    algo.fit(train_data)
    pred = algo.test(test_data)
    surprise.accuracy.rmse(pred, verbose=True)

RMSE: 0.9517
RMSE: 0.9382
RMSE: 0.9356
RMSE: 0.9326
RMSE: 0.9339


### 网格搜索寻找最优参数

In [71]:
data = Dataset.load_builtin('ml-100k')

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
grid_search = model_selection.GridSearchCV(SVD, param_grid=param_grid, measures=['rmse', 'mae'], cv=3)
grid_search.fit(data)

In [42]:
grid_search.best_params

{'rmse': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4},
 'mae': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}}

In [68]:
grid_search.cv_results

{'split0_test_rmse': array([1.00134143, 1.00765761, 0.97784742, 0.98658819, 0.98208853,
        0.99036983, 0.9674461 , 0.97729525]),
 'split1_test_rmse': array([0.99854436, 1.00430696, 0.97465316, 0.98303722, 0.97938324,
        0.98722469, 0.96479225, 0.97436013]),
 'split2_test_rmse': array([0.99181419, 0.9982668 , 0.96878332, 0.97786103, 0.97259605,
        0.98145909, 0.95921016, 0.96934216]),
 'mean_test_rmse': array([0.99723333, 1.00341046, 0.9737613 , 0.98249548, 0.97802261,
        0.9863512 , 0.96381617, 0.97366584]),
 'std_test_rmse': array([0.00399843, 0.00388584, 0.00375376, 0.00358339, 0.00399293,
        0.00368986, 0.00343242, 0.00328374]),
 'rank_test_rmse': array([7, 8, 3, 5, 4, 6, 1, 2]),
 'split0_test_mae': array([0.81041145, 0.81911603, 0.78598981, 0.79691577, 0.79026612,
        0.80083287, 0.77646284, 0.78807264]),
 'split1_test_mae': array([0.80597304, 0.81450348, 0.78135737, 0.79200482, 0.78591202,
        0.79605217, 0.77203078, 0.78356387]),
 'split2_test_mae

In [72]:
algo.fit(data.build_full_trainset())


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11af40cf8>

In [73]:
algo.predict(193, 302, 4, verbose=True)


user: 193        item: 302        r_ui = 4.00   est = 3.53   {'was_impossible': False}


Prediction(uid=193, iid=302, r_ui=4, est=3.52986, details={'was_impossible': False})

### ALS & SGD

In [75]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = surprise.BaselineOnly(bsl_options=bsl_options)


Using ALS


In [76]:
print('Using SGD')
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
algo = surprise.BaselineOnly(bsl_options=bsl_options)


Using SGD


### sim_options相似度参数

In [77]:
sim_options = {'name': 'cosine', # 相似度
               'user_based': False  # True计算用户之间的相似度 False计算的是Item之间的相似度
               }
algo = surprise.KNNBasic(sim_options=sim_options)


In [79]:
sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0  # no shrinkage
               }
algo = surprise.KNNBasic(sim_options=sim_options)


In [None]:
su