In [6]:
import boto3 
import numpy as np
import pandas as pd
from io import StringIO, BytesIO

In [7]:
# S3 bucket that holds the notebook's input data.
bucket_name = "zyyaphet"

# boto3 handles: the low-level client and the higher-level resource interface.
s3 = boto3.client("s3")
s3_resource = boto3.resource("s3")

In [8]:
# Fetch the ratings CSV from S3, parse it from memory with pandas, and
# discard the 'timestamp' column.
obj = s3.get_object(Bucket=bucket_name, Key="Input/ratings_small.csv")
df = pd.read_csv(BytesIO(obj["Body"].read())).drop(columns=["timestamp"])

In [14]:
# Number of non-null values in each column.
df.count()

userId     100836
movieId    100836
rating     100836
dtype: int64

In [11]:
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *

# Get (or create) the Spark session for this notebook.
spark = SparkSession.builder.appName('ALS').getOrCreate()

# Explicit schema for the (userId, movieId, rating) triples; every field is
# declared nullable.
schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", DoubleType(), True),
])

# Convert the pandas DataFrame to a Spark DataFrame and preview the first rows.
ratings = spark.createDataFrame(df, schema=schema)
ratings.show(10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
+------+-------+------+
only showing top 10 rows



In [16]:
#ratings

In [13]:
# ALS modelling: fit on an 80/20 random split of `ratings` and report the
# RMSE on the held-out 20%.

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# One fixed seed for both the train/test split and the ALS factor
# initialisation, so the reported RMSE is repeatable across runs (given the
# same input data and partitioning). Without it every run produces a
# different split and a different model.
SEED = 42

(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# coldStartStrategy="drop" discards test rows whose prediction is NaN
# (user or movie absent from the training split) so the RMSE is not NaN.
# NOTE(review): maxIter / regParam / rank appear hand-picked; no tuning is
# visible in this notebook.
als = ALS(
    maxIter=5,
    regParam=0.01,
    rank=20,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=SEED,
)
model = als.fit(training)

# Score the held-out ratings and compute the root-mean-square error.
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))


Root-mean-square error = 1.2004018682708308


In [17]:
# Generate recommendations

# Top 10 movies for every user, and top 10 users for every movie.
userRecs = model.recommendForAllUsers(numItems=10)
movieRecs = model.recommendForAllItems(numUsers=10)

# Subset variants (first 3 distinct users / movies), kept for reference:
#users = ratings.select(als.getUserCol()).distinct().limit(3)
#userSubsetRecs = model.recommendForUserSubset(users, 10)
#movies = ratings.select(als.getItemCol()).distinct().limit(3)
#movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [20]:
# Generate the top 10 movie recommendations for each user (first 10 users shown)
userRecs.show(n=10)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[1805, 8.739668]...|
|   463|[[1464, 8.258455]...|
|   496|[[8533, 7.6467705...|
|   148|[[59369, 7.308037...|
|   540|[[5903, 6.655172]...|
|   392|[[417, 7.903699],...|
|   243|[[44974, 9.210981...|
|    31|[[1958, 8.117658]...|
|   516|[[49932, 8.466272...|
|   580|[[27611, 5.886969...|
|   251|[[1147, 6.530244]...|
|   451|[[1186, 8.037044]...|
|    85|[[3040, 8.356973]...|
|   137|[[905, 5.406294],...|
|    65|[[7371, 5.844014]...|
|   458|[[3477, 7.183012]...|
|   481|[[5080, 7.522802]...|
|    53|[[2843, 6.7843556...|
|   255|[[122906, 8.93016...|
|   588|[[8973, 7.2354574...|
+------+--------------------+
only showing top 20 rows



In [21]:
# Generate the top 10 user recommendations for each movie (first 10 movies shown)
movieRecs.show(n=10)


+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[77, 7.0766826],...|
|   4900|[[264, 6.061315],...|
|   5300|[[164, 5.9314775]...|
|   6620|[[158, 8.308047],...|
|   7340|[[302, 5.0520425]...|
|  32460|[[536, 8.207065],...|
|  54190|[[35, 7.330521], ...|
|    471|[[55, 8.853011], ...|
|   1591|[[344, 7.281192],...|
| 140541|[[270, 6.0058136]...|
+-------+--------------------+
only showing top 10 rows

