In [27]:
import pyspark
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import *
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [28]:
spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

In [41]:
df=spark.read.csv("data/completed.csv", header=True)
df=df.select(df['username'].cast('int'),df['anime_id'].cast('int'),df['score'].cast('float'))
df=df.na.drop()
df.printSchema()

root
 |-- username: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- score: float (nullable = true)



In [42]:
df.show()


+--------+--------+-----+
|username|anime_id|score|
+--------+--------+-----+
|       1|       1| 60.0|
|       1|      30| 85.0|
|       1|      32| 85.0|
|       1|      79| 60.0|
|       1|     226| 35.0|
|       1|     227| 60.0|
|       1|     339| 60.0|
|       1|     356| 60.0|
|       1|     433| 35.0|
|       1|     759| 85.0|
|       1|     770| 85.0|
|       1|     889| 60.0|
|       1|    1195| 60.0|
|       1|    1535| 60.0|
|       1|    1575| 60.0|
|       1|    1689| 85.0|
|       1|    1943| 85.0|
|       1|    2001| 60.0|
|       1|    2167| 35.0|
|       1|    2236| 60.0|
+--------+--------+-----+
only showing top 20 rows



In [43]:
df.describe().show()

+-------+-----------------+-----------------+------------------+
|summary|         username|         anime_id|             score|
+-------+-----------------+-----------------+------------------+
|  count|          6294391|          6294391|           6294391|
|   mean|33073.78466590334|9977.269728556743|  56.6542702860372|
| stddev|28598.69684457577|8720.462189791226|35.259854390094766|
|    min|                1|                1|               0.0|
|    max|            96981|            99217|             255.0|
+-------+-----------------+-----------------+------------------+



In [44]:
als_model = ALS(
    itemCol='anime_id',
    userCol='username',
    ratingCol='score',
    nonnegative=True,
    regParam=0.1,
    rank=10
)

In [45]:
recommender = als_model.fit(df)

In [46]:
test_input = df.select(df['username'],df['anime_id'])

In [51]:
predictions = recommender.transform(test_input)
predictions = predictions.join(df, (predictions.username==df.username) & (predictions.anime_id==df.anime_id))
predictions.show()

+--------+--------+----------+--------+--------+-----+
|username|anime_id|prediction|username|anime_id|score|
+--------+--------+----------+--------+--------+-----+
|       3|   10020| 43.870415|       3|   10020| 87.0|
|       4|     457| 83.419914|       4|     457|100.0|
|       5|    3298| 80.556885|       5|    3298| 80.0|
|       6|     117| 79.384995|       6|     117| 90.0|
|       7|    3229|  64.27373|       7|    3229| 70.0|
|       7|   10448|  53.21048|       7|   10448| 60.0|
|       7|   17074|   88.6501|       7|   17074| 99.0|
|       8|    9062| 82.154755|       8|    9062| 90.0|
|       8|    9690| 67.959625|       8|    9690| 80.0|
|       8|   10012|  86.81378|       8|   10012| 70.0|
|       9|    6937| 30.523195|       9|    6937| 50.0|
|      11|     889|  75.30633|      11|     889| 80.0|
|      11|    1142|  83.54273|      11|    1142|100.0|
|      13|     462| 60.253754|      13|     462| 60.0|
|      13|    1254| 77.298515|      13|    1254| 50.0|
|      13|