# Import Libraries

In [1]:

from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType
from pyspark.sql.functions import stddev,mean,avg,lit

# Constants : Modify as and when required!

In [2]:
# file paths
business_file="Sample_Datasets/montreal_business/part-00000-b5f251d0-79e6-47a8-a405-042eb7b7894e-c000.snappy.parquet"
reviews_file="Sample_Datasets/montreal_reviews/part-00000-f0e4463e-0ac9-402e-b995-734cbefc958e-c000.snappy.parquet"
users_file="Sample_Datasets/montreal_users/part-00000-a7d49d78-89a7-478f-a577-0efe02dca047-c000.snappy.parquet"


# Initialize spark session

In [3]:
app_name="Collaborative filtering for restaurant recommendation"

def init_spark():
    spark = SparkSession \
        .builder \
        .appName(app_name) \
        .getOrCreate()
    return spark


# Load Dataset in Apache Spark

In [4]:
spark=init_spark()
business_df = spark.read.parquet(business_file)
reviews_df=spark.read.parquet(reviews_file)
users_df=spark.read.parquet(users_file)

## Selecting required features

In our project, we are only concerned with a subset of columns from the dataset, specifically those that are relevant to our goal of recommending restaurants in Montreal. Therefore, we extract the necessary features from the business_df table, including the id, name, stars, category. 
Similarly, we filter the reviews_df table to include only reviews for the selected restaurants by performing an inner join with business_df.

In [5]:
business_df = business_df.select("business_id","name", "stars", 
                                 "review_count", "address", "city", "state", "postal_code", "longitude", 
                                 "categories", "latitude").withColumnRenamed("stars", "stars_restaurant")
reviews_df = reviews_df.join(business_df, on='business_id', how='inner')

## Preparing Data for ALS: Convert String to index
Prior to initiating the modeling process, it is essential to transform all the relevant columns to integer type for compatibility with the ALS model from pyspark. The columns requiring conversion are the business_id and user_id. We accomplish this by leveraging the StringIndexer function, which we imported from pyspark.ml.feature.

In [6]:
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in ['business_id', 'user_id']]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(reviews_df).transform(reviews_df)
transformed.select(['business_id', 'user_id','business_id_index', 'user_id_index','stars','categories'])

DataFrame[business_id: string, user_id: string, business_id_index: double, user_id_index: double, stars: double, categories: string]

## Spliting the dataset into training and testing subsets 

Setting a seed value as 3 to make randomsplit output deterministic

In [7]:
(training, test) = transformed.randomSplit([0.8, 0.2],3)

In [8]:
# Compute the global mean
global_mean = training.agg(avg('stars')).collect()[0][0]
print("Global Average:",str(global_mean))


Global Average: 3.8922836512555232


# Prediction using global average for comparison

In [9]:
#Evaluate the root mean squared error if we use the global average as a prediction for all rating, Our model should perform better than this

test_avg = test.withColumn('prediction',lit(global_mean))
    
evaluator = RegressionEvaluator(metricName="rmse", labelCol="stars",predictionCol="prediction")
rmse = evaluator.evaluate(test_avg)

print("Root-mean-square error = " + str(rmse))


Root-mean-square error = 1.2260849010325352


## Basic ALS model
The Apache Spark library provides various parameters for the ALS (Alternating Least Squares) algorithm, including:

- rank: the number of latent factors used in the model (default value: 10).
- maxIter: the maximum number of iterations to run (default value: 10).
- regParam: the regularization parameter used in ALS (default value: 1.0).
- implicitPrefs: a boolean value that indicates whether to use the explicit feedback ALS variant or the one adapted for implicit feedback data (default value: false, which means using explicit feedback).
- alpha: a parameter that applies to the implicit feedback variant of ALS, determining the baseline confidence in preference observations (default value: 1.0).
- nonnegative: a boolean value that specifies whether to use nonnegative constraints for least squares (default value: false).

In [10]:

#ALS model with default values
als=ALS(userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
als.setSeed(2)

ALS_9718f70bc14f

In [11]:
#train the default model
model=als.fit(training)

In [12]:
# predict the stars for test set 
predictions=model.transform(test)

In [13]:
#evaludate default model
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.3534108396501865


## Tuning Hyper Parameters

In [14]:
ranks=[5,30,50,100]
for rank in ranks:
    als = ALS(rank=rank,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
    als.setSeed(2)
    model = als.fit(training)
    evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
    predictions=model.transform(test)
    rmse=evaluator.evaluate(predictions)
    print("RMSE with latent factor "+str(rank) +" is="+str(rmse))
    

RMSE with latent factor 5 is=1.3524617852548462
RMSE with latent factor 30 is=1.3238220621391399
RMSE with latent factor 50 is=1.3182668564134457
RMSE with latent factor 100 is=1.3092697928682182


Final model 

In [15]:
als = ALS(maxIter=20,regParam=0.4,rank=100,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
als.setSeed(2)
model = als.fit(training)
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
print("Given ratings vs Predicted ratings")
predictions.select(['user_id', 'business_id', 'stars', 'prediction']).show()


RMSE=1.2362356753952617
Given ratings vs Predicted ratings
+--------------------+--------------------+-----+----------+
|             user_id|         business_id|stars|prediction|
+--------------------+--------------------+-----+----------+
|FdMNvBDX--D_MWuu5...|-1xuC540Nycht_iWF...|  5.0| 4.1148996|
|FdMNvBDX--D_MWuu5...|1uPQTz5XZSWr0ti7k...|  4.0|  3.945095|
|FdMNvBDX--D_MWuu5...|8dNuvBvySG2FxFdEw...|  3.0| 2.8763595|
|FdMNvBDX--D_MWuu5...|Dfbjw4uixn84JIF3G...|  4.0| 4.2694216|
|FdMNvBDX--D_MWuu5...|EjS8cFHBK_rvgXqfE...|  4.0|  4.601696|
|FdMNvBDX--D_MWuu5...|O1P4x0RgJWb6Waetk...|  4.0|  4.119071|
|FdMNvBDX--D_MWuu5...|Ok9vikKusbG_Ue9pJ...|  4.0| 3.8579445|
|FdMNvBDX--D_MWuu5...|QUxiY29dWzLhKgaOe...|  3.0| 4.1098213|
|FdMNvBDX--D_MWuu5...|Tv70ZOXji7yCfsoe-...|  5.0| 4.4358563|
|FdMNvBDX--D_MWuu5...|U5XCBcTtiNyasdT3A...|  5.0|  4.101814|
|FdMNvBDX--D_MWuu5...|YbnqifugnZUrye6Nz...|  4.0|  4.672397|
|FdMNvBDX--D_MWuu5...|dKRuSDoP7r_Qn48uS...|  5.0| 4.1576047|
|FdMNvBDX--D_MWuu5...|ey9c

In [16]:
std_dev = predictions.select(stddev('prediction')).collect()[0][0]
print('Standard deviation of predictions:', std_dev)

std_dev_stars = predictions.select(stddev('stars')).collect()[0][0]
print('Standard deviation of stars:', std_dev_stars)

mean_prediction = predictions.select(mean('prediction')).collect()[0][0]
print('Mean of predictions:', mean_prediction)

mean_stars = predictions.select(mean('stars')).collect()[0][0]
print('Mean of stars:', mean_stars)

print('Standard Deviation ratio ', std_dev/std_dev_stars)

Standard deviation of predictions: 0.7501112244849435
Standard deviation of stars: 1.1406615072644828
Mean of predictions: 3.3752867230158388
Mean of stars: 3.8865761405816848
Standard Deviation ratio  0.6576107107215785


ALS Model With Bias

In [21]:
# Compute user mean and item mean on the training set
user_mean = training.groupBy('user_id_index').agg(avg('stars').alias('user_mean'))
item_mean = training.groupBy('business_id_index').agg(avg('stars').alias('item_mean'))
#remove bias from training set
interactions = training.join(user_mean, 'user_id_index').join(item_mean, 'business_id_index')
interactions = interactions.withColumn('user_item_interaction', col('stars') - col('user_mean') - col('item_mean') + global_mean)
interactions.select('stars','user_mean','item_mean','user_item_interaction').show(10)

+-----+------------------+-----------------+---------------------+
|stars|         user_mean|        item_mean|user_item_interaction|
+-----+------------------+-----------------+---------------------+
|  5.0|3.9722222222222223| 3.58974358974359|    1.330317839289711|
|  4.0|               4.0|2.769230769230769|    1.123052882024754|
|  1.0|               3.0|3.513888888888889|   -1.621605237633366|
|  1.0|3.7777777777777777|           2.1875|  -1.0729941265222545|
|  5.0| 4.095238095238095| 4.08641975308642|   0.7106258029310082|
|  1.0|               1.0|           2.1875|   1.7047836512555232|
|  3.0|  4.12962962962963|4.083333333333333|  -1.3206793117074396|
|  5.0|  4.12962962962963|4.659574468085107|  0.10307955354078668|
|  5.0| 4.166666666666667| 4.51063829787234|    0.214978686716516|
|  5.0|               3.0| 3.58974358974359|   2.3025400615119334|
+-----+------------------+-----------------+---------------------+
only showing top 10 rows



In [23]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='stars', predictionCol='user_item_interaction_recomputed')

ranks=[5,10,30,50,100]
for rank in ranks:
    als = ALS(rank=rank,userCol="user_id_index",itemCol="business_id_index",ratingCol="user_item_interaction",coldStartStrategy="drop",nonnegative=True)
    als.setSeed(2)
    model = als.fit(interactions)
    predictions=model.transform(test).withColumnRenamed('prediction','predicted_rating')
    predictions=  predictions.join(user_mean, 'user_id_index').join(item_mean, 'business_id_index')
    predictions = predictions.withColumn('user_item_interaction_recomputed', col('predicted_rating') + col('user_mean') + col('item_mean') - global_mean)
    rmse=evaluator.evaluate(predictions)
    print("RMSE with latent factor "+str(rank) +" is="+str(rmse))

RMSE with latent factor 5 is=1.1861574341304963
RMSE with latent factor 10 is=1.18501689628477
RMSE with latent factor 30 is=1.1821426821686394
RMSE with latent factor 50 is=1.1818901204888033
RMSE with latent factor 100 is=1.180330526154049


# Find Best Recommendations

In [24]:
def get_recommendations():
    """
    Returns top recommendations for a user.

    Returns
    -------
    :py:class:`pyspark.sql.DataFrame`
    a DataFrame of (itemCol, recommendations), where recommendations are
    stored as an array of ('name','business_id', 'stars', 'categories') Rows.
    """
    test = model.recommendForAllUsers(10).filter(col('user_id_index')==30).select("recommendations").take(10)
    topRestaurants = []
    for item in test[0][0]:        
        topRestaurants.append(item.business_id_index)
    
    schema = StructType([StructField("business_id_index",IntegerType(),True)])
    restaurants = spark.createDataFrame(topRestaurants,IntegerType()).toDF("business_id_index")
    return restaurants

In [25]:
def display_transformed_list():
    transformed\
    .select(['name', 'user_id', 'stars', 'categories'])\
    .filter(col('user_id_index')==30)\
    .show()

In [26]:
def display_top10_recommendations(restaurants):
    """
    Displays the top 10 restaurant recommendations.
    """
    restaurants\
    .join(transformed, on = 'business_id_index', how = 'inner')\
    .select([ 'name','business_id', 'stars', 'categories'])\
    .drop_duplicates(subset=['name'])\
    .show(10)

## Display the Top Recommendations

In [27]:


top10_recommendations = get_recommendations()
display_top10_recommendations(top10_recommendations)


+--------------------+--------------------+-----+--------------------+
|                name|         business_id|stars|          categories|
+--------------------+--------------------+-----+--------------------+
|         Burger King|ECUSa_fHQVLvG5Dg2...|  2.0|Fast Food, Restau...|
|Centre des Mets C...|OsqehU809Bxa_clNa...|  1.0|Chinese, Restaurants|
|           Délicieux|7ScWdpnwZyP6hJXxG...|  5.0|Breakfast & Brunc...|
|          L'Oeufrier|-2R--HQiUyvN4qld5...|  3.0|Breakfast & Brunc...|
|             Mandy's|LtyoPfxpvcF_9e9wM...|  5.0|Restaurants, Vega...|
|Pains Farcis Tianjin|len7Tn8eoi1KhUXbv...|  1.0|Specialty Food, R...|
|Restaurant Xi'An ...|HVd718KlG6VdY5oh5...|  1.0|Restaurants, Chinese|
|       Resto Chillax|ubLn_FrFygzcbhXTD...|  5.0|Restaurants, Cafe...|
|             Reva Zy|nspY01_h-1uiGFjF1...|  1.0|Sandwiches, Break...|
|Tiki Ming Restaurant|RpLfGOE7Hjru2Tc3z...|  1.0|Fast Food, Chines...|
+--------------------+--------------------+-----+--------------------+

