In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


# Load the built-in MovieLens-1M ratings (surprise downloads them if they are
# not available locally).
data = Dataset.load_builtin(name='ml-1m')

# Baseline model: SVD with the library's default hyper-parameters.
algo = SVD()

# 5-fold cross-validation. verbose=True prints the per-fold table; the dict of
# per-fold metrics and timings returned by the call is the cell's output.
cross_validate(algo, data, cv=5, measures=['RMSE', 'MAE'], verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8721  0.8703  0.8757  0.8750  0.8747  0.8736  0.0021  
MAE (testset)     0.6840  0.6836  0.6870  0.6876  0.6863  0.6857  0.0016  
Fit time          67.06   67.43   67.57   67.48   67.34   67.38   0.17    
Test time         3.33    3.28    3.25    3.26    3.27    3.28    0.03    


{'test_rmse': array([0.87205648, 0.87027723, 0.87574227, 0.87503408, 0.87470333]),
 'test_mae': array([0.68402233, 0.68355301, 0.68699288, 0.68756463, 0.68632696]),
 'fit_time': (67.0614914894104,
  67.4294822216034,
  67.56535530090332,
  67.4849362373352,
  67.34282064437866),
 'test_time': (3.3287510871887207,
  3.2797861099243164,
  3.252803087234497,
  3.26479434967041,
  3.267077684402466)}

In [3]:
from surprise.model_selection import train_test_split

# Split the ratings into a training set and a 20% held-out test set; the fixed
# random_state keeps the split identical between runs.
trainset, testset = train_test_split(data, random_state=42, test_size=0.2)

In [4]:
%%time
from surprise import KNNBasic

# Train a basic k-NN collaborative filter on the training split. The arguments
# spell out the values surprise documents as defaults for a bare KNNBasic()
# (user-based neighbours, MSD similarity, k=40, min_k=1).
algo = KNNBasic(k=40, min_k=1, sim_options={'name': 'msd', 'user_based': True})
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.
Wall time: 55.6 s


<surprise.prediction_algorithms.knns.KNNBasic at 0x25073f699a0>

In [7]:
%%time
# Score every entry of the held-out test set with the fitted model
# (verbose=False: do not print each individual prediction).
prediction = algo.test(testset, verbose=False)

Wall time: 2min 30s


In [9]:
%%time
from surprise import accuracy

# Quality check: RMSE of the test-set predictions. verbose=True prints the
# value, and the returned float is displayed as the cell's output.
accuracy.rmse(prediction, verbose=True)

RMSE: 0.9231
Wall time: 227 ms


0.9231258721322069

In [None]:
 # Создаем spark сессию
import findspark
# Kept ahead of the pyspark import, as in the original cell -- presumably so
# that findspark can make the local Spark installation importable first.
findspark.init()
from pyspark.sql import SparkSession

# Local Spark session using all available cores.
#  - spark.driver.memory: heap size requested for the driver JVM.
#  - spark.sql.analyzer.failAmbiguousSelfJoin: switched off; the notebook later
#    joins the ratings table with itself.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.sql.analyzer.failAmbiguousSelfJoin", "False")
    .getOrCreate()
)

In [None]:
# файл с оценками - user * item матрица
import os
import pyspark.sql.functions as sql_func

# Folder holding the MovieLens "ml-latest-small" CSV files. Set the
# MOVIELENS_DIR environment variable to run the notebook on another machine;
# the original hard-coded location is kept as the fallback.
DATA_DIR = os.environ.get('MOVIELENS_DIR', 'D:/Datasets/ml-latest-small/')

# Ratings table: one row per (userId, movieId, rating). The timestamp column is
# dropped because nothing below uses it, and the frame is cached because it is
# reused by several later cells.
ratings = (
    spark
    .read
    .csv(
        os.path.join(DATA_DIR, 'ratings.csv'),
        header=True,
        inferSchema=True
    )
    .drop('timestamp')
    .cache()
)

In [None]:
%%time
# Pairwise item statistics: join the ratings with themselves on userId so that
# every row pairs two movies rated by the same user, then aggregate per pair.
(
    ratings
    .selectExpr("userId", "movieId AS movieId1", "rating AS rating1")
    .join(
        ratings.selectExpr("userId", "movieId AS movieId2", "rating AS rating2"),
        on="userId",
    )
    # The measure is symmetric, so only one ordering of each pair is kept
    # (this also removes a movie paired with itself).
    .where(sql_func.col("movieId1") > sql_func.col("movieId2"))
    .groupBy("movieId1", "movieId2")
    .agg(
        # number of users who rated both movies
        sql_func.count("userId").alias("watched_both"),
        # dot product of the two movies' rating vectors over those users
        sql_func.sum(
            sql_func.col("rating1") * sql_func.col("rating2")
        ).alias("inner_product"),
    )
    .write
    .parquet("half_cooccurrences.parquet", mode="overwrite")
)

In [None]:
# Per-movie aggregates used later to normalise the pairwise statistics:
#   sum_of_squares - sum of squared ratings the movie received
#   watched_one    - number of ratings (users) the movie has
# Cached because it is joined twice in a later cell.
popularities = (
    ratings
    .groupBy("movieId")
    .agg(
        sql_func.expr("sum(pow(rating, 2))").alias("sum_of_squares"),
        sql_func.expr("count(userId)").alias("watched_one"),
    )
    .cache()
)

In [None]:
# Re-open the pairwise statistics that an earlier cell wrote to parquet.
half_cooccurrences = spark.read.format("parquet").load("half_cooccurrences.parquet")

In [None]:
# Attach the per-movie aggregates to both sides of every movie pair and store
# the result; these are the inputs for the distance computation.
# The original author's note says this step takes about 15 minutes.
(
    half_cooccurrences
    .join(
        popularities.selectExpr(
            "movieId AS movieId1",
            "watched_one AS watched1",
            "sum_of_squares AS sum_of_squares1",
        ),
        on="movieId1",
    )
    .join(
        popularities.selectExpr(
            "movieId AS movieId2",
            "watched_one AS watched2",
            "sum_of_squares AS sum_of_squares2",
        ),
        on="movieId2",
    )
    # Fix the column order of the stored table.
    .select(
        "movieId1",
        "movieId2",
        "watched1",
        "watched2",
        "sum_of_squares1",
        "sum_of_squares2",
        "inner_product",
        "watched_both",
    )
    .write
    .parquet("pre_distance_matrix.parquet", mode="overwrite")
)

In [None]:
# Re-open the per-pair aggregates that an earlier cell wrote to parquet.
pre_distance_matrix = spark.read.format("parquet").load("pre_distance_matrix.parquet")

In [None]:
# Movie catalogue, used only to turn movie ids into titles.
# Original author's note: with less memory available a small sample can be
# taken here instead of the full data; even at fraction=.01 the qualitative
# picture does not change.
movies = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(os.path.join(DATA_DIR, 'movies.csv'))
    .select('movieId', 'title')
    .cache()
)

In [None]:
# Query: the 10 movies closest to `movieId`.
movieId = 4896
(
    pre_distance_matrix
    # Each unordered pair is stored once, so the query movie may sit in either
    # column.
    .where("movieId1 == {} OR movieId2 == {}".format(movieId, movieId))
    .selectExpr(
        # the *other* movie of the pair
        """
        CASE
            WHEN movieId1 == {}
            THEN movieId2
            ELSE movieId1
        END movieId
        """.format(movieId),
        # Cosine distance between the two movies' rating vectors (a missing
        # rating counts as 0): 1 - <a, b> / (|a| * |b|). This fills in the
        # placeholder the original cell left open; other measures (e.g. Jaccard
        # from watched_both / watched1 / watched2) can be swapped in here.
        """
        1 - inner_product / (sqrt(sum_of_squares1) * sqrt(sum_of_squares2)) AS distance
        """
    )
    # Order BEFORE limiting, otherwise the 10 rows kept are arbitrary.
    .orderBy("distance")
    .limit(10)
    .join(movies, "movieId")
    # The join does not guarantee row order, so sort again for display.
    .orderBy("distance")
    .toPandas()
)