# Import Libraries

In [10]:

from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType,StructField,IntegerType


# Constants : Modify as and when required!

In [11]:
# file paths
business_file="Sample_Datasets/montreal_business/part-00000-b5f251d0-79e6-47a8-a405-042eb7b7894e-c000.snappy.parquet"
reviews_file="Sample_Datasets/montreal_reviews/part-00000-f0e4463e-0ac9-402e-b995-734cbefc958e-c000.snappy.parquet"
users_file="Sample_Datasets/montreal_users/part-00000-a7d49d78-89a7-478f-a577-0efe02dca047-c000.snappy.parquet"


# Initialize spark session

In [12]:
app_name="Collaborative filtering for restaurant recommendation"

def init_spark():
    spark = SparkSession \
        .builder \
        .appName(app_name) \
        .getOrCreate()
    return spark


# Load Dataset in Apache Spark

In [13]:
spark=init_spark()
business_df = spark.read.parquet(business_file)
reviews_df=spark.read.parquet(reviews_file)
users_df=spark.read.parquet(users_file)

## Selecting required features

In our project, we are only concerned with a subset of columns from the dataset, specifically those that are relevant to our goal of recommending restaurants in Montreal. Therefore, we extract the necessary features from the business_df table, including the id, name, stars, category. 
Similarly, we filter the reviews_df table to include only reviews for the selected restaurants by performing an inner join with business_df.

In [14]:
business_df = business_df.select("business_id","name", "stars", 
                                 "review_count", "address", "city", "state", "postal_code", "longitude", 
                                 "categories", "latitude").withColumnRenamed("stars", "stars_restaurant")
reviews_df = reviews_df.join(business_df, on='business_id', how='inner')

## Preparing Data for ALS: Convert String to index
Prior to initiating the modeling process, it is essential to transform all the relevant columns to integer type for compatibility with the ALS model from pyspark. The columns requiring conversion are the business_id and user_id. We accomplish this by leveraging the StringIndexer function, which we imported from pyspark.ml.feature.

In [15]:
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in ['business_id', 'user_id']]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(reviews_df).transform(reviews_df)
transformed.select(['business_id', 'user_id','business_id_index', 'user_id_index','stars','categories'])

DataFrame[business_id: string, user_id: string, business_id_index: double, user_id_index: double, stars: double, categories: string]

## Spliting the dataset into training and testing subsets 

Setting a seed value as 3 to make randomsplit output deterministic

In [None]:
(training, test) = transformed.randomSplit([0.8, 0.2],3)

## Create ALS model
The Apache Spark library provides various parameters for the ALS (Alternating Least Squares) algorithm, including:

- rank: the number of latent factors used in the model (default value: 10).
- maxIter: the maximum number of iterations to run (default value: 10).
- regParam: the regularization parameter used in ALS (default value: 1.0).
- implicitPrefs: a boolean value that indicates whether to use the explicit feedback ALS variant or the one adapted for implicit feedback data (default value: false, which means using explicit feedback).
- alpha: a parameter that applies to the implicit feedback variant of ALS, determining the baseline confidence in preference observations (default value: 1.0).
- nonnegative: a boolean value that specifies whether to use nonnegative constraints for least squares (default value: false).

In [17]:
als=ALS(maxIter=5,
        regParam=0.09,
        rank=20,
        userCol="user_id_index",
        itemCol="business_id_index",
        ratingCol="stars",
        coldStartStrategy="drop",
        nonnegative=True)

model=als.fit(training)

## Evaluate RMSE

In [18]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

RMSE=1.4344524168573067


## Tuning Hyper Parameters

In [19]:
values=[0.08,0.09,0.1,0.15,0.2,0.25,0.3,0.4,0.45,0.5,0.5,0.6,0.7,0.8,0.9]
for rm in values:
    als = ALS(maxIter=20,regParam=rm,rank=25,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
    model = als.fit(training)
    evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
    predictions=model.transform(test)
    rmse=evaluator.evaluate(predictions)
    print("RMSE="+str(rmse))
    print(model.userFactors.count())
    print(model.itemFactors.count())
    

RMSE=1.3256622625564687
42263
4169
RMSE=1.3197219683901034
42263
4169
RMSE=1.3139086154158246
42263
4169
RMSE=1.284438624757947
42263
4169
RMSE=1.2623386478171916
42263
4169
RMSE=1.2474116071619326
42263
4169
RMSE=1.238762445624405
42263
4169
RMSE=1.2356426249395458
42263
4169
RMSE=1.2395899586518075
42263
4169
RMSE=1.2469344735450594
42263
4169
RMSE=1.2469344735450616
42263
4169
RMSE=1.2721805658805418
42263
4169
RMSE=1.3102064196211636
42263
4169
RMSE=1.3574268518445995
42263
4169
RMSE=1.4111391176146966
42263
4169


In [20]:
# we are uisng 0.4 as the regularizatin parameter as it yields lowest RMSE

In [21]:
als = ALS(maxIter=20,regParam=0.4,rank=25,userCol="user_id_index",itemCol="business_id_index",ratingCol="stars",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training)
evaluator=RegressionEvaluator(metricName="rmse",labelCol="stars",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))

print("Given ratings vs Predicted ratings")
predictions.select(['user_id', 'business_id', 'stars', 'prediction']).show()


RMSE=1.2356426249395458
Given ratings vs Predicted ratings
+--------------------+--------------------+-----+----------+
|             user_id|         business_id|stars|prediction|
+--------------------+--------------------+-----+----------+
|bIDIIDbZUo1p5Vl8w...|0JGMKaKJGVuDus5Wc...|  5.0|  4.263014|
|6KHdW3MjBZJZ4BkzJ...|0W4lkclzZThpx3V65...|  4.0| 3.7163045|
|75Xtdm65_xnFXbL3v...|0W4lkclzZThpx3V65...|  5.0| 4.3275266|
|C7o1LcGjQis0ICvsQ...|0W4lkclzZThpx3V65...|  4.0| 3.5761085|
|w01hd3eejoOkE6kQ9...|17jcc6RYsSgf3NgWE...|  5.0|  4.356521|
|yX0g0cSodyqpIzfa6...|1iKMxBsTsPSoEzYUf...|  5.0| 2.3909633|
|2av488ePvb-Z4qgeN...|25m6rM6hFw2CGADUj...|  4.0| 3.7119172|
|XZeFJAOFwmkw9rCeu...|2gUbgbdJ7IFSbicBX...|  4.0| 4.1132197|
|1IpC-cW9MxaM9Tvhj...|2wFSVah7gU9ImNuBH...|  5.0| 4.3055344|
|XP9aSaVhu_q1OCUtu...|3DGtOhtb6jNdRs9T9...|  4.0|  3.911713|
|lh_0BHrCSx87geLzQ...|3jKUbhGSjFTv5jZ0w...|  5.0|  4.284819|
|mJ6yNAm5x5-lyW7mk...|3p0yosq4IE5E2B91P...|  5.0| 3.2425134|
|2av488ePvb-Z4qgeN...|46Ld

# Find Best Recommendations

In [22]:
def get_recommendations():
    """
    Returns top recommendations for a user.

    Returns
    -------
    :py:class:`pyspark.sql.DataFrame`
    a DataFrame of (itemCol, recommendations), where recommendations are
    stored as an array of ('name','business_id', 'stars', 'categories') Rows.
    """
    test = model.recommendForAllUsers(10).filter(col('user_id_index')==30).select("recommendations").take(10)
    topRestaurants = []
    for item in test[0][0]:        
        topRestaurants.append(item.business_id_index)
    
    schema = StructType([StructField("business_id_index",IntegerType(),True)])
    restaurants = spark.createDataFrame(topRestaurants,IntegerType()).toDF("business_id_index")
    return restaurants

In [23]:
def display_transformed_list():
    transformed\
    .select(['name', 'user_id', 'stars', 'categories'])\
    .filter(col('user_id_index')==30)\
    .show()

In [24]:
def display_top10_recommendations(restaurants):
    """
    Displays the top 10 restaurant recommendations.
    """
    restaurants\
    .join(transformed, on = 'business_id_index', how = 'inner')\
    .select([ 'name','business_id', 'stars', 'categories'])\
    .drop_duplicates(subset=['name'])\
    .show(10)

## Display the Top Recommendations

In [25]:


top10_recommendations = get_recommendations()
display_top10_recommendations(top10_recommendations)


+-------------------+--------------------+-----+--------------------+
|               name|         business_id|stars|          categories|
+-------------------+--------------------+-----+--------------------+
|               180g|OpMGaD7tsUZzt19mf...|  5.0|Books, Mags, Musi...|
|        Café Nomade|b--VluQ4oOo9Zqlaz...|  5.0|Restaurants, Food...|
|           Dispensa|HwxlCM7TY73C1kcQK...|  5.0|Food, Restaurants...|
|  Il Miglio Express|L_ikuudEVAgQNKMcr...|  5.0|Italian, Restaura...|
|La Fromagerie Hamel|IxthHWpZgpdt72uOU...|  5.0|Cheese Shops, Res...|
|         Melchorita|pRvq-3aYrzzbgEuOw...|  5.0|Latin American, P...|
|            Mercado|Y1tXlYIwg26AdewIl...|  5.0|Latin American, B...|
|            Parasol|ZwWiPJYA-hk6jtBo9...|  5.0|Wine Bars, Restau...|
|            Salerno|n_LSGXDDApdFS9EC2...|  5.0|Restaurants, Pizz...|
|         Sushi Shop|79cDBLMfxp_PLMret...|  4.0|Restaurants, Sush...|
+-------------------+--------------------+-----+--------------------+

