In [1]:
import pandas as pd

from surprise import SVD, Reader, Dataset, NormalPredictor, accuracy
from surprise.model_selection import cross_validate, KFold

# --- Notebook configuration -------------------------------------------------
import random
import warnings

import numpy as np

# Fixed seed so the randomly generated cross-validation folds and the random
# initialisation of the models give the same numbers on every
# "Restart Kernel -> Run All". Seeding both `random` and `numpy` is the
# approach recommended in the Surprise FAQ for reproducible experiments.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Show every column when a DataFrame is displayed.
# The full option name is used on purpose: an abbreviated pattern such as
# "max.columns" is resolved by regex and raises OptionError as soon as more
# than one registered option matches it (e.g. "styler.render.max_columns"
# next to "display.max_columns" in newer pandas releases).
pd.set_option("display.max_columns", None)

# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation notices from pandas/surprise - confirm this is wanted.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw ratings table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="ratings.csv")

In [3]:
# Display the loaded frame (pandas truncates the rendering to the head and tail rows).
df

Unnamed: 0,user_id,product_id,rating
0,8765713110,1.623200e+12,10
1,8765713110,1.623200e+12,10
2,8765713110,1.623200e+12,10
3,8765713110,1.623200e+12,10
4,8765713110,1.623200e+12,10
...,...,...,...
93324,339223994897,4.569589e+11,8
93325,339223994897,7.906555e+11,8
93326,339223994897,1.406024e+12,8
93327,339223994897,3.394940e+11,7


In [4]:
# Column dtypes and non-null counts; used here to spot missing values
# (product_id is the only column with fewer non-null entries than rows).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93329 entries, 0 to 93328
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_id     93329 non-null  int64  
 1   product_id  93296 non-null  float64
 2   rating      93329 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 2.1 MB


In [5]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
df.loc[df.duplicated(keep="first")]

Unnamed: 0,user_id,product_id,rating
1,8765713110,1.623200e+12,10
2,8765713110,1.623200e+12,10
3,8765713110,1.623200e+12,10
4,8765713110,1.623200e+12,10
8,8765713110,1.623200e+12,9
...,...,...,...
93100,96854724780,6.652259e+11,10
93150,440533159092,1.663416e+10,7
93196,410360450104,1.300653e+12,8
93200,395679429243,9.718234e+09,8


In [6]:
# Keep the first occurrence of every fully identical row and discard the repeats.
is_repeat = df.duplicated(keep="first")
df = df.loc[~is_repeat]
del is_repeat  # temporary mask only; keep the notebook namespace unchanged

In [7]:
# Inspect the ratings whose product_id is missing.
df.loc[df["product_id"].isnull()]

Unnamed: 0,user_id,product_id,rating
4375,190944716400,,6
4477,53064088275,,7
8680,156087835176,,7
9317,34803492348,,8
15267,24257413050,,7
17627,213903919380,,7
18173,94512206592,,6
32269,433781750208,,6
32791,34096916028,,2
35220,183795222312,,8


In [8]:
# Discard every row that has a missing value in any column
# (per the info() output, only product_id contains missing entries).
df = df.dropna(axis="index", how="any")

In [9]:
# Frequency of each rating value, most common first.
df.loc[:, "rating"].value_counts()

8     36182
7     19979
6     11337
9     10236
10     7056
5      2517
4      1674
2       656
3       334
Name: rating, dtype: int64

In [10]:
# Declare the rating scale as 1-10 for Surprise
# (the value counts above show observed ratings from 2 to 10).
reader = Reader(rating_scale=(1, 10))

In [11]:
# Build the Surprise dataset from the cleaned frame; the three columns are
# passed in user / item / rating order.
data = Dataset.load_from_df(
    df.loc[:, ["user_id", "product_id", "rating"]],
    reader,
)

In [12]:
# Baseline: 5-fold cross-validation of NormalPredictor, giving reference
# RMSE/MAE figures to compare the SVD model against.
cross_validate(
    algo=NormalPredictor(),
    data=data,
    cv=5,
)

{'fit_time': (0.11062026023864746,
  0.1472489833831787,
  0.15941572189331055,
  0.14889883995056152,
  0.1404130458831787),
 'test_mae': array([1.51982835, 1.50913353, 1.50342744, 1.49806474, 1.50844934]),
 'test_rmse': array([1.92743268, 1.91134321, 1.91514192, 1.90597049, 1.91474617]),
 'test_time': (0.21341490745544434,
  0.19649410247802734,
  0.19779014587402344,
  0.18596506118774414,
  0.1930680274963379)}

In [13]:
# Manual 5-fold evaluation of SVD on `data`, printing the RMSE of each fold.

# define a cross-validation iterator
kf = KFold(n_splits=5)

# A single SVD instance is re-fit on every fold's training portion below.
algo = SVD()

for trainset, testset in kf.split(data):

    # Train on this fold's trainset, then predict its held-out testset.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1989
RMSE: 1.1952
RMSE: 1.1934
RMSE: 1.1880
RMSE: 1.1943


In [14]:
# Cross-validate the SVD model with Surprise's helper, reporting RMSE and MAE
# per fold together with fit/test timings.
# NOTE(review): this uses 2 folds while the NormalPredictor baseline used 5,
# so the two sets of scores are not measured on like-for-like splits - confirm
# this is intentional.
cross_validate(
    algo,
    data,
    measures=["RMSE", "MAE"],
    cv=2,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    1.2147  1.2103  1.2125  0.0022  
MAE (testset)     0.8866  0.8864  0.8865  0.0001  
Fit time          2.63    2.58    2.60    0.02    
Test time         0.41    0.40    0.40    0.00    


{'fit_time': (2.6282269954681396, 2.581207036972046),
 'test_mae': array([0.88661933, 0.88635799]),
 'test_rmse': array([1.21467896, 1.21027856]),
 'test_time': (0.40659499168395996, 0.40102076530456543)}