In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the toy ratings table: an `id` column plus item columns A..F,
# where blank cells are read as NaN (item not rated by that user).
df = pd.read_csv(filepath_or_buffer='../datasets/recdemo.csv')

In [3]:
# Show the whole frame; it is only 8 rows, so no .head() is needed.
df

Unnamed: 0,id,A,B,C,D,E,F
0,1,3.0,4.0,2.0,4.0,1.0,
1,2,3.0,4.0,2.0,4.0,,2.0
2,3,,2.0,5.0,5.0,,5.0
3,4,,,,,4.0,
4,5,3.0,,2.0,4.0,4.0,
5,6,,5.0,5.0,5.0,,5.0
6,7,1.0,,,2.0,,3.0
7,8,,,,,4.0,4.0


In [4]:
# Unpivot the wide table into long form: one row per (id, item column) pair,
# with the item label in `variable` and the cell content in `value`.
df_unpivot = pd.melt(frame=df, id_vars=['id'])
df_unpivot.head(n=5)

Unnamed: 0,id,variable,value
0,1,A,3.0
1,2,A,3.0
2,3,A,
3,4,A,
4,5,A,3.0


In [5]:
# Keep only the observed ratings and give the columns recommender-style names.
# A chained, non-inplace form is used so the cell builds its result from its
# input in one expression, and the rename is by name rather than by position,
# so a change in column order cannot silently mislabel the data.
df_unpivot = (
    df_unpivot
    .dropna()
    .rename(columns={'id': 'userID', 'variable': 'itemID', 'value': 'rating'})
)

In [6]:
from surprise import Dataset
from surprise import Reader

In [7]:
# Declare the rating scale (1 to 5) that Surprise should assume.
reader = Reader(rating_scale=(1, 5))
# Hand Surprise the long-format ratings as (user, item, rating) columns.
data = Dataset.load_from_df(
    df=df_unpivot[['userID', 'itemID', 'rating']],
    reader=reader,
)

In [8]:
# Train on every observed rating (no hold-out split).
trainset = data.build_full_trainset()
# The "anti" test set is built from the trainset itself; the completed table
# shown further down indicates it covers the (user, item) cells that had no rating.
testset = trainset.build_anti_testset()

In [9]:
from surprise import SVD

In [10]:
# Two latent factors are plenty for an 8 x 6 toy matrix; the seed is fixed
# so the learned factors are reproducible.
algo = SVD(n_factors=2, random_state=999)
algo.fit(trainset)
# Estimate a rating for every pair in the anti-test set.
predictions = algo.test(testset)

In [11]:
# Turn all predictions into one frame and concatenate once. Growing the frame
# row by row with DataFrame.append is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
predicted_ratings = pd.DataFrame(
    [{'userID': p.uid, 'itemID': p.iid, 'rating': p.est} for p in predictions],
    columns=['userID', 'itemID', 'rating'],
)
# ignore_index=True renumbers the rows 0..n-1, as append(..., ignore_index=True) did.
# df_unpivot itself is left untouched.
df_unpivot1 = pd.concat([df_unpivot, predicted_ratings], ignore_index=True)

In [12]:
# Reshape the long table back into a user x item grid: observed ratings are
# kept as they were, and the previously empty cells now hold the SVD estimates.
df_unpivot1.pivot(index='userID', columns='itemID', values='rating')

itemID,A,B,C,D,E,F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3.0,4.0,2.0,4.0,1.0,3.312456
2,3.0,4.0,2.0,4.0,3.210599,2.0
3,3.44435,2.0,5.0,5.0,3.684146,5.0
4,3.242607,3.599635,3.41523,3.784069,4.0,3.647434
5,3.0,3.490772,2.0,4.0,4.0,3.533499
6,3.676756,5.0,5.0,5.0,3.884204,5.0
7,1.0,3.194893,2.985395,2.0,3.010203,3.0
8,3.301363,3.630884,3.442314,3.798689,4.0,4.0


In [13]:
# User factor matrix: one row per user (8), one column per latent factor (n_factors=2).
algo.pu # users

array([[-0.02356038,  0.086795  ],
       [ 0.03311668, -0.06586878],
       [-0.01663012, -0.05196951],
       [ 0.17326108, -0.22112997],
       [ 0.11922751,  0.0574848 ],
       [-0.03847042, -0.02430361],
       [-0.10099924, -0.04709509],
       [-0.00974106,  0.08383197]])

In [14]:
# Item factor matrix: one row per item (6 items, A..F), one column per latent factor.
algo.qi # items ("films" in the original note)

array([[-0.0499725 ,  0.20377982],
       [ 0.07260589, -0.09175883],
       [-0.0048234 , -0.06959711],
       [-0.08501758, -0.10890099],
       [ 0.16616968,  0.1825492 ],
       [ 0.00912318, -0.15003955]])

In [15]:
# Transposed item factors (factors x items), the shape needed to multiply by algo.pu.
algo.qi.T # items ("films" in the original note)

array([[-0.0499725 ,  0.07260589, -0.0048234 , -0.08501758,  0.16616968,
         0.00912318],
       [ 0.20377982, -0.09175883, -0.06959711, -0.10890099,  0.1825492 ,
        -0.15003955]])

In [16]:
# Mean of all observed ratings; used as the constant term of the manual prediction.
mu = df_unpivot['rating'].mean()

In [17]:
# Interaction term for every (user, item) pair: user factors times item factors.
user_factors = algo.pu
item_factors = algo.qi
m = user_factors @ item_factors.T

In [18]:
# Sanity check: expect (number of users, number of items) = (8, 6).
m.shape

(8, 6)

In [19]:
# Row/column position of the cell to reproduce by hand. The result computed
# from (0, 5) matches the user 1 / item F entry of the completed table.
i, j = 0, 5

In [20]:
# Rebuild one SVD estimate by hand:
# global mean + factor interaction + user bias + item bias.
mu + m[i, j] + algo.bu[i] + algo.bi[j]

3.31245613193024

# Сравнение параметров

In [21]:
from surprise import KNNBaseline
from surprise.model_selection import cross_validate

In [22]:
# Switch to the built-in 'ml-100k' dataset for the parameter comparison.
# NOTE: this rebinds `data`, which held the toy dataset until now.
data = Dataset.load_builtin(name='ml-100k')

In [23]:
# Peek at the raw ratings as a frame.
# NOTE: this rebinds `df`, which held the toy table until now.
df = pd.DataFrame(
    data.raw_ratings,
    columns=['user', 'item', 'rating', 'timestamp'],
)
df.head(n=5)

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [24]:
# Compare KNNBaseline across neighbourhood sizes using the mean RMSE over 5 folds.
# NOTE: the loop rebinds `i` and `algo`, which earlier cells also use.
for i in (1, 3, 5, 7, 20):
    algo = KNNBaseline(k=i, verbose=False)
    cv = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    mean_rmse = np.mean(cv['test_rmse'])
    print(f'{i}NN:', mean_rmse)

1NN: 1.2295359154081826
3NN: 1.032548504304505
5NN: 0.9836027712152072
7NN: 0.9659690209525162
20NN: 0.9348287306914548


In [25]:
# Compare SVD across numbers of latent factors using the mean RMSE over 5 folds.
# NOTE: the loop rebinds `i` and `algo`, which earlier cells also use.
for i in (1, 2, 3, 5, 7, 10, 50, 70):
    algo = SVD(n_factors=i, random_state=999, verbose=False)
    cv = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    mean_rmse = np.mean(cv['test_rmse'])
    print(f'{i}-factors:', mean_rmse)

1-factors: 0.9419753545016203
2-factors: 0.9410188928226612
3-factors: 0.9402796951521835
5-factors: 0.9376826016201596
7-factors: 0.9386149657258895
10-factors: 0.9354505622534381
50-factors: 0.936888665176095
70-factors: 0.9354026437505212


In [26]:
 for i in [1, 2, 3, 5, 7, 10, 50, 70]:
    algo = SVD(n_factors=i, biased=False, random_state=999, verbose=False)
    cv = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    print(str(i)+'-factors:', np.mean(cv['test_rmse']))

1-factors: 0.9613491383394042
2-factors: 0.960088238338703
3-factors: 0.9470709073259945
5-factors: 0.95187898900611
7-factors: 0.9499247213351033
10-factors: 0.9429454719345713
50-factors: 0.9446334865203111
70-factors: 0.9440880137631391
