In [1]:
import os
# Imports, then create the Spark session
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import findspark
findspark.init()
from pyspark.sql import SparkSession
import pyspark.sql.functions as sql_func
from pyspark.sql.types import FloatType
from scipy.spatial.distance import euclidean

# Local Spark session that uses every available core on this machine.
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    # Allow self-joins whose column references would otherwise be rejected as
    # ambiguous; this notebook joins tf_idf with itself further down.
    .config("spark.sql.analyzer.failAmbiguousSelfJoin", "False")
    # Driver heap size for the local JVM.
    .config("spark.driver.memory", "12G")
    .getOrCreate()
)

In [6]:
# Directory that holds the dataset files read by this notebook.
# Defaults to the original local path; set the MOVIELENS_DATA_DIR environment
# variable to point somewhere else without editing the notebook.
DATA_DIR = os.environ.get('MOVIELENS_DATA_DIR', 'D:/Datasets/ml-latest/')

In [7]:
# Read the movie features from tf_idf.parquet and mark the DataFrame as cached;
# the self cross join later in the notebook references it twice.
tf_idf = (
    spark.read
    .parquet(os.path.join(DATA_DIR, 'tf_idf.parquet'))
    .cache()
)

In [8]:
# Define the distance function used to compare two movies' feature vectors.
def _euclidean_distance(x1, x2):
    """Return the Euclidean distance between two Spark ML vectors.

    Args:
        x1: first vector; must expose ``toArray()`` (presumably a
            pyspark.ml.linalg vector from the ``tf_idf`` column -- confirm).
        x2: second vector, same kind as ``x1``.

    Returns:
        The distance as a built-in ``float``, or ``None`` when either input
        is null so that the resulting Spark column is null instead of the
        task failing with an AttributeError.
    """
    if x1 is None or x2 is None:
        return None
    # scipy hands back a NumPy scalar; convert it to a plain Python float so
    # the value matches the declared FloatType when it is sent back to Spark.
    # Some other distance functions may need plain lists instead of arrays.
    return float(euclidean(x1.toArray(), x2.toArray()))


# Spark UDF wrapper; the result column is single-precision (FloatType).
distance = sql_func.udf(_euclidean_distance, returnType=FloatType())

In [9]:
# Describe the full pairwise distance table (lazy: nothing is computed until
# an action runs). Every movie ("one") is paired with every movie ("two").
distance_matrix = tf_idf.alias("one").crossJoin(tf_idf.alias("two")).select(
    sql_func.col("one.movieId"),
    sql_func.col("one.title"),
    sql_func.col("two.movieId"),
    sql_func.col("two.title"),
    # Distance between the two movies' tf_idf vectors.
    distance(
        sql_func.col("one.tf_idf"),
        sql_func.col("two.tf_idf"),
    ).alias("distance"),
)

In [11]:
# Ten closest movies to Harry Potter and the Sorcerer's Stone (movieId 4896),
# smallest distance first. The query movie itself is part of the cross join,
# so it comes back as the first row with distance 0.
(
    distance_matrix
    .filter(sql_func.col("one.movieId") == sql_func.lit(4896))
    .sort("distance")
    .select("two.movieId", "two.title", "distance")
    .limit(10)
    .toPandas()
)

Unnamed: 0,movieId,title,distance
0,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,0.0
1,5816,Harry Potter and the Chamber of Secrets (2002),304.82251
2,40815,Harry Potter and the Goblet of Fire (2005),332.110016
3,8368,Harry Potter and the Prisoner of Azkaban (2004),408.846954
4,54001,Harry Potter and the Order of the Phoenix (2007),442.431244
5,69844,Harry Potter and the Half-Blood Prince (2009),450.669281
6,88125,Harry Potter and the Deathly Hallows: Part 2 (...,485.726501
7,41566,"Chronicles of Narnia: The Lion, the Witch and ...",489.401215
8,81834,Harry Potter and the Deathly Hallows: Part 1 (...,505.649261
9,2193,Willow (1988),537.205444
