In [1]:
## Para Colab
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark= SparkSession \
       .builder \
       .appName("Content Based Recommendation") \
       .getOrCreate()

spark

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,172 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,501 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 https:/

In [2]:
from pyspark.sql.functions import expr, col, udf, desc, collect_list
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, FloatType
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import Window
from pyspark.sql.functions import row_number

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RecommenderSystem").getOrCreate()

In [4]:
df = spark.read.csv("/content/sample-user-filtered-2023.csv", header=True, inferSchema=True)

In [5]:
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [6]:
df_final = df

total_users = df.select("user_id").distinct().count()
total_animes = df.select("anime_id").distinct().count()

In [7]:
indexer_user = StringIndexer(inputCol='user_id', outputCol='user_index').setHandleInvalid("keep")
indexer_anime = StringIndexer(inputCol='anime_id', outputCol='anime_index').setHandleInvalid("keep")

user_indexer_model = indexer_user.fit(df_final)
anime_indexer_model = indexer_anime.fit(df_final)

df_final = user_indexer_model.transform(df_final)
df_final = anime_indexer_model.transform(df_final)

df_final = df_final.withColumn('user_index', df_final['user_index'].cast('integer'))
df_final = df_final.withColumn('anime_index', df_final['anime_index'].cast('integer'))

In [8]:
percent_users_to_mask = 0.95
percent_animes_to_mask = 0.9

user_cutoff = int(total_users * (1 - percent_users_to_mask))
anime_cutoff = int(total_animes * (1 - percent_animes_to_mask))
print(user_cutoff, total_users)
print(anime_cutoff, total_animes)

3823 76467
49 500


In [9]:
train_data = df_final.filter(~((col("user_index") > user_cutoff) & (col("anime_index") > anime_cutoff)))
test_data = df_final.filter((col("user_index") > user_cutoff) & (col("anime_index") > anime_cutoff))
print(train_data.count())
print(test_data.count())

2421115
4317774


In [10]:
final_als = ALS(
    userCol='user_index',
    itemCol='anime_index',
    ratingCol='rating',
    coldStartStrategy='drop',
    nonnegative=True,
    rank=20,
    maxIter=20,
    regParam=0.15
)

best_model = final_als.fit(train_data)

In [None]:
!pip install recommenders

In [18]:
#dfs_pred = best_model.transform(test_data).drop('ratings')

In [20]:
# Get the cross join of all user-item pairs and score them.
users = train_data.select('user_index').distinct()
items = train_data.select('anime_index').distinct()
user_item = users.crossJoin(items)
dfs_pred = best_model.transform(user_item)

# Remove seen items.
dfs_pred_exclude_train = dfs_pred.alias("pred").join(
    train_data.alias("train"),
    (dfs_pred['user_index'] == train_data['user_index']) & (dfs_pred['anime_index'] == train_data['anime_index']),
    how='outer'
)

dfs_pred_final = dfs_pred_exclude_train.filter(dfs_pred_exclude_train["train.Rating"].isNull()) \
    .select('pred.' + 'user_index', 'pred.' + 'anime_index', 'pred.' + "prediction")

dfs_pred_final.show()

+----------+-----------+----------+
|user_index|anime_index|prediction|
+----------+-----------+----------+
|         1|        133| 5.7830753|
|         2|        299|  4.244053|
|         3|        305| 7.8052225|
|         4|        385|  7.593605|
|         5|         16|  8.881258|
|         5|        305|  8.095629|
|         6|        198|  6.871107|
|         6|        264|  7.537586|
|         6|        349| 7.1090765|
|         7|        151|  8.815727|
|         8|        274| 7.4829044|
|         9|        399|  6.422617|
|        10|        240|  8.590752|
|        10|        371| 6.8143687|
|        10|        394|  6.893742|
|        10|        428|  5.255093|
|        10|        486|  7.945748|
|        11|        329|  7.172482|
|        11|        387| 6.8482666|
|        12|        372| 7.5034337|
+----------+-----------+----------+
only showing top 20 rows



In [17]:
from recommenders.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation

In [22]:
evaluations = SparkRankingEvaluation(
    test_data,
    dfs_pred_final,
    col_user='user_index',
    col_item='anime_index',
    col_rating='rating',
    col_prediction='prediction',
    k=10
)

print(
    "Precision@k = {}".format(evaluations.precision_at_k()),
    "Recall@k = {}".format(evaluations.recall_at_k()),
    "NDCG@k = {}".format(evaluations.ndcg_at_k()),
    "Mean average precision = {}".format(evaluations.map_at_k()),
    sep="\n"
)



Precision@k = 0.1908351832098491
Recall@k = 0.038377746147481864
NDCG@k = 0.19729355192825335
Mean average precision = 0.10333281639625423
