Dependency Imports

In [1]:

from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType


Constants: modify as and when required!

In [2]:
# Input locations: one Parquet part file per dataset, given relative to the
# working directory. Each literal is split after the folder only to keep the
# lines short; adjacent string literals are joined at compile time.
business_file = (
    "./Small_Datasets/small_business/"
    "part-00000-ee490ef4-cc94-4c10-a519-93e1a97ae759-c000.snappy.parquet"
)
reviews_file = (
    "./Small_Datasets/small_reviews/"
    "part-00000-68ec6b88-1bc3-48fb-b20e-d37d1431365f-c000.snappy.parquet"
)
users_file = (
    "./Small_Datasets/small_users/"
    "part-00000-aaf232b2-4eae-4625-a454-215dd40c60f4-c000.snappy.parquet"
)

# Name given to the Spark application when the session is built.
app_name = "Collabrative filtering for restaurant recommendation"

In [3]:
def init_spark():
    """Build (or reuse) the SparkSession used throughout this notebook.

    The session is named after the module-level ``app_name`` constant.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    session_builder = SparkSession.builder.appName(app_name)
    # NOTE(review): this key/value looks like a placeholder setting rather
    # than a real Spark option -- confirm whether it is still needed.
    session_builder = session_builder.config("spark.some.config.option", "nothing")
    return session_builder.getOrCreate()


In [4]:
spark = init_spark()

# Read the three Parquet inputs in a fixed order (business, reviews, users)
# and unpack them into their respective DataFrames.
df_business, df_reviews, df_users = (
    spark.read.parquet(parquet_path)
    for parquet_path in (business_file, reviews_file, users_file)
)

In [5]:
# One StringIndexer per string ID column; each writes its numeric index to a
# new "<column>_index" column, which the ALS cell later uses as user/item ids.
indexer = [
    StringIndexer(inputCol=id_column, outputCol=f"{id_column}_index")
    for id_column in ("business_id", "user_id")
]
pipeline = Pipeline(stages=indexer)

# Fit the indexers on the full reviews table and append the index columns.
transformed = (
    pipeline
    .fit(df_reviews)
    .transform(df_reviews)
)

# Last expression of the cell: shows the schema of the original and indexed IDs.
transformed.select("business_id", "user_id", "business_id_index", "user_id_index")

DataFrame[business_id: string, user_id: string, business_id_index: double, user_id_index: double]

In [6]:
# 80/20 train/test split of the indexed reviews.
# The seed is pinned so that Restart & Run All yields the same split (and a
# comparable RMSE) on every run; without it Spark picks a fresh random seed.
# NOTE(review): repeatability also assumes the input partitioning is stable
# between runs -- confirm if the data source changes.
(training, test) = transformed.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Explicit-feedback ALS on the indexed user / business IDs with "stars" as
# the rating column.
als = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="user_id_index",
    itemCol="business_id_index",
    ratingCol="stars",
    # Drop rows whose prediction is NaN (users/items unseen during training)
    # so that downstream metrics are not turned into NaN.
    coldStartStrategy="drop",
    nonnegative=True,
    # Pin the seed so the trained factors are repeatable between kernel
    # restarts instead of depending on a per-session default.
    seed=42,
)

model = als.fit(training)

In [8]:
# Score the held-out split with the trained model.
predictions = model.transform(test)

# Root-mean-square error between the true "stars" and the predicted rating.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="stars",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE={rmse}")

RMSE=1.4316221402744207
