In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.rdd import RDD

In [2]:
def init_spark():
    spark = SparkSession \
        .builder \
        .config("spark.driver.memory", "15g") \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark
spark = init_spark()

In [3]:
df = spark.read.text("D:/Courses/big data/project/Anime/rating.csv").rdd
from itertools import islice
df= df.mapPartitionsWithIndex(
    lambda idx, it: islice(it, 1, None) if idx == 0 else it 
)

In [4]:
data = df.map(lambda x: x[0].split(','))

In [5]:
ratingsRDD = data.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2])))

In [10]:
rating = ratingsRDD.map(lambda x: x[x["rating"] > 0])

In [21]:
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.0889547574742813


In [22]:
userRecs = model.recommendForAllUsers(10)

In [26]:
userRecs[userRecs['userId'] == 3].collect()

[Row(userId=3, recommendations=[Row(movieId=6868, rating=19.59992027282715), Row(movieId=10132, rating=16.633174896240234), Row(movieId=22455, rating=15.965030670166016), Row(movieId=22479, rating=15.965030670166016), Row(movieId=22445, rating=15.965030670166016), Row(movieId=22477, rating=15.965030670166016), Row(movieId=17985, rating=15.965030670166016), Row(movieId=18587, rating=15.375167846679688), Row(movieId=18463, rating=15.075610160827637), Row(movieId=3624, rating=14.97232437133789)])]

In [30]:
userRecs.where(userRecs.userId == 3).select("recommendations.movieId", "recommendations.rating").collect()

[Row(movieId=[6868, 10132, 22455, 22479, 22445, 22477, 17985, 18587, 18463, 3624], rating=[19.59992027282715, 16.633174896240234, 15.965030670166016, 15.965030670166016, 15.965030670166016, 15.965030670166016, 15.965030670166016, 15.375167846679688, 15.075610160827637, 14.97232437133789])]

In [31]:
movieId=[6868, 10132, 22455, 22479, 22445, 22477, 17985, 18587, 18463, 3624]

In [33]:
import pandas as pd
df2 = pd.read_csv("D:/Courses/big data/project/Anime/anime.csv")

In [38]:
for anime in movieId:
    values = df2[['anime_id','name', 'genre', 'type']].loc[df2['anime_id']==anime]
    print(str(values['anime_id']) + "," + str(values['name']) + "," + str(values['genre']) + "," + str(values['type']) + "\n")
    #print(str(values['anime_id']) + "," + str(values['name']) + "\n")

10713    6868
Name: anime_id, dtype: int64,10713    Wansa-kun
Name: name, dtype: object,10713    Comedy
Name: genre, dtype: object,10713    TV
Name: type, dtype: object

5686    10132
Name: anime_id, dtype: int64,5686    Jin Sheng Yuan
Name: name, dtype: object,5686    Music
Name: genre, dtype: object,5686    Music
Name: type, dtype: object

8800    22455
Name: anime_id, dtype: int64,8800    Hello Kitty no Circus ga Yatte Kita
Name: name, dtype: object,8800    Fantasy, Kids
Name: genre, dtype: object,8800    OVA
Name: type, dtype: object

8810    22479
Name: anime_id, dtype: int64,8810    Hello Kitty no Mahou no Ringo
Name: name, dtype: object,8810    Fantasy, Kids
Name: genre, dtype: object,8810    OVA
Name: type, dtype: object

8832    22445
Name: anime_id, dtype: int64,8832    Hello Kitty no Yappari Mama ga Suki
Name: name, dtype: object,8832    Fantasy, Kids
Name: genre, dtype: object,8832    OVA
Name: type, dtype: object

8831    22477
Name: anime_id, dtype: int64,8831    Hello Ki