In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


# Built-in MovieLens 1M ratings (downloaded if not already present).
# 'ml-100k' is the smaller built-in alternative.
data = Dataset.load_builtin('ml-1m')

# Surprise's SVD recommender with its default hyperparameters.
algo = SVD()

# 5-fold cross-validation. verbose=True prints the per-fold table; the
# returned dict of scores and timings is displayed as the cell result.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8747  0.8755  0.8738  0.8734  0.8729  0.8741  0.0009  
MAE (testset)     0.6869  0.6876  0.6866  0.6853  0.6849  0.6863  0.0010  
Fit time          67.44   67.33   67.34   67.93   67.12   67.43   0.27    
Test time         3.36    3.32    3.33    3.29    3.28    3.31    0.03    


{'test_rmse': array([0.8747261 , 0.8754963 , 0.87384452, 0.87344182, 0.87290242]),
 'test_mae': array([0.68689874, 0.6876363 , 0.68659226, 0.68534285, 0.68494085]),
 'fit_time': (67.4418158531189,
  67.33148431777954,
  67.33739066123962,
  67.92663192749023,
  67.12149715423584),
 'test_time': (3.360177993774414,
  3.3171985149383545,
  3.3265841007232666,
  3.2877726554870605,
  3.2787773609161377)}

In [2]:
from surprise.model_selection import train_test_split

# Split the ratings into a training set and a held-out test set
# (20% of the data goes to the test set; the fixed random_state keeps the
# split repeatable).
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [3]:
%%time
from surprise import KNNBasic

# Train a basic k-NN recommender using the 'pearson_baseline' similarity
# with shrinkage set to 0. Note that this rebinds `algo`.
algo = KNNBasic(
    sim_options=dict(name='pearson_baseline', shrinkage=0),
)
# fit() is the last expression, so the fitted model is echoed as the result.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Wall time: 2min 37s


<surprise.prediction_algorithms.knns.KNNBasic at 0x21d476ff790>

In [4]:
%%time
# Run the fitted model over the held-out test set to obtain predictions.
prediction = algo.test(testset=testset)

Wall time: 2min 27s


In [5]:
%%time
from surprise import accuracy

# Evaluate the predictions: rmse() prints the RMSE and also returns it,
# so the value appears a second time as the cell result.
accuracy.rmse(predictions=prediction)

RMSE: 0.9486
Wall time: 222 ms


0.9486364645953307

In [6]:
# Create the Spark session
import findspark
# Called before the pyspark import below (presumably so that findspark can
# make the local Spark installation importable).
findspark.init()
from pyspark.sql import SparkSession

# Local-mode session ("local[*]") with a 16G driver; the analyzer's
# ambiguous-self-join failure is switched off via configuration.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.sql.analyzer.failAmbiguousSelfJoin", "False")
    .getOrCreate()
)

In [7]:
# Ratings file: the user x item matrix in long form
# (the columns used later are userId, movieId and rating).
import os
import pyspark.sql.functions as sql_func

# Directory holding the MovieLens CSV files. Set the MOVIELENS_DATA_DIR
# environment variable to use another location; when it is not set, the
# original hard-coded path is used, so existing setups behave as before.
DATA_DIR = os.environ.get('MOVIELENS_DATA_DIR', 'D:/Datasets/ml-latest-small/')

# Load the ratings with a header row and inferred column types, drop the
# unused timestamp column, and cache the frame.
ratings = (
    spark
    .read
    .csv(
        os.path.join(DATA_DIR, 'ratings.csv'),
        header=True,
        inferSchema=True
    )
    .drop('timestamp')
    .cache()
)

In [8]:
%%time
# Pairwise movie statistics: self-join the ratings on userId so each row
# pairs two movies rated by the same user, then aggregate per movie pair and
# store the result as parquet.
(
    ratings.alias("one")
    .join(ratings.alias("two"), on="userId")
    # The measure is symmetric, so only one ordering of each pair is kept
    # (the strict inequality also excludes a movie paired with itself).
    .filter("one.movieId > two.movieId")
    .groupBy("one.movieId", "two.movieId")
    .agg(
        # dot product of the two movies' ratings over their common users
        sql_func.sum(sql_func.col("one.rating") * sql_func.col("two.rating")).alias("inner_product"),
        # number of users who rated both movies
        sql_func.count("userId").alias("watched_both"),
    )
    .select(
        sql_func.col("one.movieId").alias("movieId1"),
        sql_func.col("two.movieId").alias("movieId2"),
        "watched_both",
        "inner_product",
    )
    .write
    .parquet("half_cooccurrences.parquet", mode="overwrite")
)

Wall time: 27.7 s


In [9]:
# Per-movie aggregates: the sum of squared ratings and the number of ratings
# the movie received. Cached for reuse.
popularities = (
    ratings.groupBy("movieId")
    .agg(
        sql_func.sum(sql_func.pow(sql_func.col("rating"), 2)).alias("sum_of_squares"),
        sql_func.count("userId").alias("watched_one"),
    )
    .cache()
)

In [10]:
# Read the pairwise movie statistics back from the parquet output.
half_cooccurrences = spark.read.parquet("half_cooccurrences.parquet")

In [11]:
# Aggregates needed for the distance computation: attach each movie's own
# statistics (from `popularities`) to both sides of every movie pair and
# store the result as parquet.
# Original author's note: this takes about 15 minutes to compute.
(
    half_cooccurrences
    .join(
        popularities.alias("pop1"),
        on=sql_func.col("movieId1") == sql_func.col("pop1.movieId"),
    )
    .join(
        popularities.alias("pop2"),
        on=sql_func.col("movieId2") == sql_func.col("pop2.movieId"),
    )
    .select(
        "movieId1",
        "movieId2",
        sql_func.col("pop1.watched_one").alias("watched1"),
        sql_func.col("pop2.watched_one").alias("watched2"),
        sql_func.col("pop1.sum_of_squares").alias("sum_of_squares1"),
        sql_func.col("pop2.sum_of_squares").alias("sum_of_squares2"),
        "inner_product",
        "watched_both",
    )
    .write
    .parquet("pre_distance_matrix.parquet", mode="overwrite")
)

In [12]:
# Read the per-pair distance inputs back from the parquet output.
pre_distance_matrix = spark.read.parquet("pre_distance_matrix.parquet")

In [13]:
# Movie metadata, reduced to the movieId and title columns and cached.
# Original author's note: if less memory is available, a small sample can be
# taken here instead of all the data; even with fraction=.01 the qualitative
# picture does not change.
movies = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(os.path.join(DATA_DIR, 'movies.csv'))
    .select('movieId', 'title')
    .cache()
)

In [19]:
# Id of the movie whose nearest neighbours are looked up (the recorded output
# for this id lists mostly Harry Potter films).
movieId = 4896
(
    pre_distance_matrix
    # Each pair is stored once, so the chosen movie may sit on either side.
    .filter("movieId1 == {} OR movieId2 == {}".format(movieId, movieId))
    .selectExpr(
        # First expression: the id of the *other* movie in the pair.
        """
        CASE
            WHEN movieId1 == {}
            THEN movieId2
            ELSE movieId1
        END movieId
        """.format(movieId),
        # Second expression: the active distance. It is the negated similarity,
        # so an ascending sort puts the most similar movies first. The
        # alternative formulas are left commented out inside the SQL text.
        """
        -- sum_of_squares1 + sum_of_squares2 - 2 * inner_product AS distance -- Евклидово расстояние
        - inner_product / (sum_of_squares1 + sum_of_squares2 - inner_product) AS distance -- Танимото
        -- - inner_product / SQRT(sum_of_squares1) / SQRT(sum_of_squares2) AS distance -- Косинус
        
        """,
    )
    .sort("distance")
    .limit(10)
    .join(movies, on="movieId")
    # Sorted again after the join (presumably because the join is not relied
    # on to keep the previous ordering).
    .sort("distance")
    .toPandas()
)

Unnamed: 0,movieId,distance,title
0,5816,-0.636898,Harry Potter and the Chamber of Secrets (2002)
1,8368,-0.568846,Harry Potter and the Prisoner of Azkaban (2004)
2,40815,-0.553783,Harry Potter and the Goblet of Fire (2005)
3,69844,-0.422393,Harry Potter and the Half-Blood Prince (2009)
4,4306,-0.419741,Shrek (2001)
5,5349,-0.407751,Spider-Man (2002)
6,4886,-0.406374,"Monsters, Inc. (2001)"
7,6539,-0.399835,Pirates of the Caribbean: The Curse of the Bla...
8,54001,-0.399075,Harry Potter and the Order of the Phoenix (2007)
9,5218,-0.390914,Ice Age (2002)
