Dependency Imports

In [1]:

from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType


Constants : Modify as and when required!

In [2]:
# file paths
business_file="./Small_Datasets/small_business/part-00000-ee490ef4-cc94-4c10-a519-93e1a97ae759-c000.snappy.parquet"
reviews_file="./Small_Datasets/small_reviews/part-00000-68ec6b88-1bc3-48fb-b20e-d37d1431365f-c000.snappy.parquet"
users_file="./Small_Datasets/small_users/part-00000-aaf232b2-4eae-4625-a454-215dd40c60f4-c000.snappy.parquet"
app_name="Collabrative filtering for restaurant recommendation"

In [3]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName(app_name) \
        .config("spark.some.config.option", "nothing") \
        .getOrCreate()
    return spark


In [4]:
spark=init_spark()
df_business = spark.read.parquet(business_file)
df_reviews=spark.read.parquet(reviews_file)
df_users=spark.read.parquet(users_file)
df_business.columns

['business_id',
 'name',
 'address',
 'city',
 'state',
 'postal_code',
 'latitude',
 'longitude',
 'stars',
 'review_count',
 'categories']

In [5]:
df_business = df_business.select("business_id","name", "stars", 
                                 "review_count", "address", "city", "state", "postal_code", "longitude", 
                                 "categories", "latitude").withColumnRenamed("stars", "stars_restaurant")
df_reviews = df_reviews.join(df_business, on='business_id', how='inner')
df_reviews.columns

['business_id',
 'user_id',
 'review_id',
 'stars',
 'text',
 'name',
 'stars_restaurant',
 'review_count',
 'address',
 'city',
 'state',
 'postal_code',
 'longitude',
 'categories',
 'latitude']

In [6]:
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in ['business_id', 'user_id']]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_reviews).transform(df_reviews)
transformed.select(['business_id', 'user_id','business_id_index', 'user_id_index','stars','categories'])
transformed.columns

['business_id',
 'user_id',
 'review_id',
 'stars',
 'text',
 'name',
 'stars_restaurant',
 'review_count',
 'address',
 'city',
 'state',
 'postal_code',
 'longitude',
 'categories',
 'latitude',
 'business_id_index',
 'user_id_index']

In [7]:
(training, test) = transformed.randomSplit([0.8, 0.2])

In [8]:
als=ALS(maxIter=5,
        regParam=0.09,
        rank=20,
        userCol="user_id_index",
        itemCol="business_id_index",
        ratingCol="stars",
        coldStartStrategy="drop",
        nonnegative=True)

model=als.fit(training)

In [9]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.4416752540571711


In [10]:
als = ALS(maxIter=20,regParam=0.09,rank=20,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training)

In [11]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.3336007591519718


In [12]:
als = ALS(maxIter=20,regParam=0.09,rank=25,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training)

In [13]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.3277165186374995


# Visual Recommendation

In [15]:
test = model.recommendForAllUsers(20).filter(col('user_id_index')==30).select("recommendations").collect()
topRestaurants = []
for item in test[0][0]:        
    topRestaurants.append(item.business_id_index)
    
schema = StructType([StructField("business_id_index",IntegerType(),True)])
restaurants = spark.createDataFrame(topRestaurants,IntegerType()).toDF("business_id_index")


transformed\
.select(['business_id', 'user_id', 'stars', 'categories'])\
.filter(col('user_id_index')==30)\
.show()

restaurants\
.join(transformed, on = 'business_id_index', how = 'inner')\
.select(['business_id', 'stars', 'categories', 'name'])\
.drop_duplicates(subset=['name'])\
.show()

+--------------------+--------------------+-----+--------------------+
|         business_id|             user_id|stars|          categories|
+--------------------+--------------------+-----+--------------------+
|ubLn_FrFygzcbhXTD...|0gZ8E5tBWTEtGEZDu...|  5.0|Restaurants, Cafe...|
|zrnP9HqoF-RI9jqoW...|0gZ8E5tBWTEtGEZDu...|  5.0|Coffee & Tea, Res...|
|yU3p0tEBtGuZLTcsB...|0gZ8E5tBWTEtGEZDu...|  5.0|Restaurants, Brea...|
|tmWp4Rtr_cm7nCh2u...|0gZ8E5tBWTEtGEZDu...|  5.0|Breakfast & Brunc...|
|vaURDGRo19cMB3Fy9...|0gZ8E5tBWTEtGEZDu...|  5.0|Restaurants, Coff...|
|58APdML-PG_OD4El2...|0gZ8E5tBWTEtGEZDu...|  2.0|Delicatessen, Foo...|
|ZL9sk7Imc42BNRhcj...|0gZ8E5tBWTEtGEZDu...|  4.0|Chinese, Restaura...|
|lpnuObNKbkH8usnUS...|0gZ8E5tBWTEtGEZDu...|  4.0|Cafes, Coffee & T...|
|q91nljuSDFl0IYRey...|0gZ8E5tBWTEtGEZDu...|  3.0|Vegan, Vegetarian...|
|8cPMjIwTnrVm3F2wL...|0gZ8E5tBWTEtGEZDu...|  4.0|Vegan, Cafes, Veg...|
|xlMgaPOpd_99SFTuO...|0gZ8E5tBWTEtGEZDu...|  4.0|Game Meat, Americ...|
|vYbpo