# Dependency Imports

In [1]:

from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession ,Row
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType


# Constants : Modify as and when required!

In [2]:
# file paths
business_file="./Small_Datasets/small_business/part-00000-ee490ef4-cc94-4c10-a519-93e1a97ae759-c000.snappy.parquet"
reviews_file="./Small_Datasets/small_reviews/part-00000-68ec6b88-1bc3-48fb-b20e-d37d1431365f-c000.snappy.parquet"
users_file="./Small_Datasets/small_users/part-00000-aaf232b2-4eae-4625-a454-215dd40c60f4-c000.snappy.parquet"
app_name="Collabrative filtering for restaurant recommendation"

# Initialize spark session

In [3]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName(app_name) \
        .config("spark.some.config.option", "nothing") \
        .getOrCreate()
    return spark


# Load Dataset in Apache Spark

In [4]:
spark=init_spark()
business_df = spark.read.parquet(business_file)
reviews_df=spark.read.parquet(reviews_file)
users_df=spark.read.parquet(users_file)

## Selecting rows and columns

In our project, we are only concerned with a subset of columns from the dataset, specifically those that are relevant to our goal of recommending restaurants in Montreal. Therefore, we extract the necessary features from the business_df table, including the id, name, stars, category. 
Similarly, we filter the reviews_df=spark.read.parquet(reviews_file)
 table to include only reviews for the selected restaurants by performing an inner join with business_df.

In [5]:
business_df = business_df.select("business_id","name", "stars", 
                                 "review_count", "address", "city", "state", "postal_code", "longitude", 
                                 "categories", "latitude").withColumnRenamed("stars", "stars_restaurant")
reviews_df = reviews_df.join(business_df, on='business_id', how='inner')

## Convert String to index 
Prior to initiating the modeling process, it is essential to transform all the relevant columns to integer type for compatibility with the ALS model from pyspark. The columns requiring conversion are the business_id and user_id. We accomplish this by leveraging the StringIndexer function, which we imported from pyspark.ml.feature.

In [6]:
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in ['business_id', 'user_id']]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(reviews_df).transform(reviews_df)
transformed.select(['business_id', 'user_id','business_id_index', 'user_id_index','stars','categories'])

DataFrame[business_id: string, user_id: string, business_id_index: double, user_id_index: double, stars: double, categories: string]

## Spliting the dataset into training and testing subsets 

In [7]:
(training, test) = transformed.randomSplit([0.7, 0.3])

## Create ALS model
The Apache Spark library provides various parameters for the ALS (Alternating Least Squares) algorithm, including:

- numBlocks*: the number of blocks used to partition users and items, allowing for parallelized computation (default value: 10).
- rank: the number of latent factors used in the model (default value: 10).
- maxIter: the maximum number of iterations to run (default value: 10).
- regParam: the regularization parameter used in ALS (default value: 1.0).
- implicitPrefs: a boolean value that indicates whether to use the explicit feedback ALS variant or the one adapted for implicit feedback data (default value: false, which means using explicit feedback).
- alpha: a parameter that applies to the implicit feedback variant of ALS, determining the baseline confidence in preference observations (default value: 1.0).
- nonnegative: a boolean value that specifies whether to use nonnegative constraints for least squares (default value: false).

In [8]:
als=ALS(maxIter=5,
        regParam=0.09,
        rank=20,
        userCol="user_id_index",
        itemCol="business_id_index",
        ratingCol="stars",
        coldStartStrategy="drop",
        nonnegative=True)

model=als.fit(training)

## Evaluate RMSE

In [9]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.451705532394602


## Fine tuning the model

In [10]:
als = ALS(maxIter=20,regParam=0.09,rank=20,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training)

## Re-evaluating the RMSE

In [11]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.338326439415606


In [12]:
als = ALS(maxIter=20,regParam=0.09,rank=25,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training)

In [13]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.3332946011859435


# Visual Recommendation

In [14]:
test = model.recommendForAllUsers(20).filter(col('user_id_index')==30).select("recommendations").collect()
topRestaurants = []
for item in test[0][0]:        
    topRestaurants.append(item.business_id_index)
    
schema = StructType([StructField("business_id_index",IntegerType(),True)])
restaurants = spark.createDataFrame(topRestaurants,IntegerType()).toDF("business_id_index")


transformed\
.select(['business_id', 'user_id', 'stars', 'categories'])\
.filter(col('user_id_index')==30)\
.show()

restaurants\
.join(transformed, on = 'business_id_index', how = 'inner')\
.select(['business_id', 'stars', 'categories', 'name'])\
.drop_duplicates(subset=['name'])\
.show()

predictions.select(['user_id', 'business_id', 'stars', 'prediction']).show()


+--------------------+--------------------+-----+--------------------+
|         business_id|             user_id|stars|          categories|
+--------------------+--------------------+-----+--------------------+
|ubLn_FrFygzcbhXTD...|0gZ8E5tBWTEtGEZDu...|  5.0|Restaurants, Cafe...|
|zrnP9HqoF-RI9jqoW...|0gZ8E5tBWTEtGEZDu...|  5.0|Coffee & Tea, Res...|
|yU3p0tEBtGuZLTcsB...|0gZ8E5tBWTEtGEZDu...|  5.0|Restaurants, Brea...|
|tmWp4Rtr_cm7nCh2u...|0gZ8E5tBWTEtGEZDu...|  5.0|Breakfast & Brunc...|
|vaURDGRo19cMB3Fy9...|0gZ8E5tBWTEtGEZDu...|  5.0|Restaurants, Coff...|
|58APdML-PG_OD4El2...|0gZ8E5tBWTEtGEZDu...|  2.0|Delicatessen, Foo...|
|ZL9sk7Imc42BNRhcj...|0gZ8E5tBWTEtGEZDu...|  4.0|Chinese, Restaura...|
|lpnuObNKbkH8usnUS...|0gZ8E5tBWTEtGEZDu...|  4.0|Cafes, Coffee & T...|
|q91nljuSDFl0IYRey...|0gZ8E5tBWTEtGEZDu...|  3.0|Vegan, Vegetarian...|
|8cPMjIwTnrVm3F2wL...|0gZ8E5tBWTEtGEZDu...|  4.0|Vegan, Cafes, Veg...|
|xlMgaPOpd_99SFTuO...|0gZ8E5tBWTEtGEZDu...|  4.0|Game Meat, Americ...|
|vYbpo