In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession.builder.appName("Big Data Recommendation Systems").getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x7fe6ad850750>


In [2]:
# Load CSV
df = spark.read.csv("/root/Lecture/BIGDATA/datasets/ratings.csv", header=True, inferSchema=True)

df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [3]:
# Count dataset Amazon Fine Food Reviews
df.count()

264505

In [4]:
# Show dataset Amazon Fine Food Reviews
df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
| 12882|      1|   4.0|1147195252|
| 12882|     32|   3.5|1147195307|
| 12882|     47|   5.0|1147195343|
| 12882|     50|   5.0|1147185499|
| 12882|    110|   4.5|1147195239|
| 12882|    150|   3.5|1147195267|
| 12882|    158|   2.0|1147185180|
| 12882|    165|   4.0|1147195325|
| 12882|    260|   4.0|1147195260|
| 12882|    296|   5.0|1147195153|
| 12882|    318|   5.0|1147195162|
| 12882|    356|   5.0|1147185487|
| 12882|    364|   3.5|1147195899|
| 12882|    380|   2.5|1147195276|
| 12882|    457|   4.0|1147195271|
| 12882|    480|   3.5|1147185483|
| 12882|    515|   3.5|1147185231|
| 12882|    527|   4.0|1147195296|
| 12882|    552|   2.5|1147185190|
| 12882|    588|   3.0|1147195313|
+------+-------+------+----------+
only showing top 20 rows



In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

(training, test) = df.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [6]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.827770749489


In [7]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [8]:
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  7880|[[1177, 6.147299]...|
| 27484|[[378, 5.817801],...|
|104508|[[3185, 5.1842403...|
| 34239|[[351, 13.639605]...|
|122710|[[82, 7.5867014],...|
|  8932|[[1300, 6.5434594...|
| 76164|[[928, 3.9772882]...|
| 68037|[[1914, 5.2823014...|
| 47573|[[766, 8.722683],...|
| 21058|[[334, 5.2173743]...|
| 48838|[[2859, 6.8364472...|
| 54989|[[971, 5.923529],...|
|108159|[[1238, 4.6789556...|
| 24200|[[1949, 5.2000747...|
| 91973|[[766, 5.9134126]...|
| 34494|[[1007, 4.6637163...|
| 26480|[[2859, 4.795166]...|
|128111|[[3185, 5.111826]...|
|  6937|[[378, 10.5386095...|
|109429|[[2093, 5.940619]...|
+------+--------------------+
only showing top 20 rows



In [9]:
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[111036, 5.15001...|
|   6620|[[83479, 6.344374...|
|  54190|[[109970, 7.36779...|
|    471|[[27735, 5.889134...|
|   1591|[[109734, 7.21393...|
|   1342|[[68836, 8.893948...|
|   2122|[[35823, 12.38314...|
|   2142|[[35823, 7.708952...|
|  44022|[[34239, 6.562566...|
|   1645|[[95290, 6.434094...|
|   3175|[[27735, 6.521919...|
|   2366|[[27735, 8.370095...|
|   3997|[[89274, 7.726284...|
|   1088|[[50019, 7.030873...|
|   1238|[[43649, 11.01845...|
|   8638|[[43649, 6.760278...|
|   1959|[[95290, 9.444898...|
|   4519|[[8335, 7.0141745...|
|    540|[[34239, 6.646393...|
|   2580|[[8527, 5.7034883...|
+-------+--------------------+
only showing top 20 rows

