# Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline, PipelineModel
import numpy as np
from pyspark.sql.functions import desc
from pyspark.sql.types import *

In [2]:
# Obtain the Spark session for this notebook and keep a handle on its SparkContext,
# which later cells use for RDD-level operations.
spark = (
    SparkSession.builder
    .appName("RestaurantRecommendation")
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Load the sampled data

In [3]:
# Read the three sampled Montreal parquet datasets (businesses, reviews, users).
business_df, reviews_df, users_df = [
    spark.read.parquet(dataset_path)
    for dataset_path in (
        'Sample_Datasets/montreal_business',
        'Sample_Datasets/montreal_reviews',
        'Sample_Datasets/montreal_users',
    )
]

                                                                                

# Process Reviews
1. Group reviews of restaurants
2. Generate a TF-IDF vector from the reviews.

### Step 1 : Group reviews of restaurants

In [4]:
# Only the restaurant id and the review text are needed to build one document per restaurant.
reviews_text = reviews_df.select('business_id', 'text')

# Concatenate every review text that shares a business_id, separated by a single space.
reviews_by_business = (
    reviews_text.rdd
    .map(lambda row: (row[0], row[1]))
    .reduceByKey(lambda left, right: left + " " + right)
)
reviews_by_business_df = reviews_by_business.toDF(['business_id', 'text'])
reviews_by_business_df.show(3)

                                                                                

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|m8LwM6D7UkgZxCj7u...|It is very small ...|
|Xt6OhDqC8J69wTkMN...|Nice new spot to ...|
|luPvNx4XSxaM7pka8...|I'm writing this ...|
+--------------------+--------------------+
only showing top 3 rows



### Step 2: Convert reviews to TF-IDF feature vector

In [5]:
# Tokenize the grouped review text, remove stop words, and build the TF-IDF matrix.
# The pattern is a raw string so that `\w` is passed to the regex engine verbatim instead of
# being parsed by Python as an (invalid) string escape. With gaps=False the pattern matches
# the tokens themselves rather than the separators between them.
tokenize_sentence = RegexTokenizer(pattern=r'\w+', inputCol='text', outputCol='tokens',
                                   toLowercase=True, gaps=False)
remove_stopwords  = StopWordsRemover(inputCol='tokens', outputCol='nostopwords')
# Vocabulary capped at 1000 terms to limit the number of features and reduce overfitting
countVectorizer   = CountVectorizer(inputCol='nostopwords', outputCol='tf', vocabSize=1000)
tfiDF             = IDF(inputCol='tf', outputCol='tfidf_vec')
pipeline          = Pipeline(stages=[tokenize_sentence, remove_stopwords, countVectorizer, tfiDF])

# Fit on the per-restaurant documents and persist the fitted model (overwriting any previous copy).
reviews_pipeline = pipeline.fit(reviews_by_business_df)
reviews_pipeline.write().overwrite().save('reviews_pipeline')

                                                                                

In [6]:
# Reload the fitted pipeline from the 'reviews_pipeline' path written by the previous cell.
reviews_pipeline = PipelineModel.load('reviews_pipeline')
# Apply the pipeline to the per-restaurant documents; the schema printed below shows the
# added columns (tokens, nostopwords, tf, tfidf_vec).
transformed_reviews_by_business = reviews_pipeline.transform(reviews_by_business_df)
transformed_reviews_by_business.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf_vec: vector (nullable = true)



# Calculate cosine similarity between reviews of restaurants

In [7]:
def cosineSimilarity(vector1, vector2):
    '''
    Calculate the cosine similarity between two feature vectors using the formula

        (A . B) / (sqrt(A . A) * sqrt(B . B))

    Parameters:
        vector1, vector2: equal-length vectors accepted by np.dot.

    Returns:
        The similarity as a Python float. If either vector has zero norm the
        similarity is undefined (0/0); 0.0 is returned in that case instead of NaN.
    '''

    dot_product = np.dot(vector1, vector2)
    product_of_norms = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))

    # Guard against division by zero: a NaN score would otherwise be ranked above every
    # real score when the caller sorts by similarity in descending order.
    if product_of_norms == 0:
        return 0.0

    return float(dot_product / product_of_norms)

In [8]:
def getUserRecommendationsTop10(user_restaurants, all_restaurants_tfidf):
    '''
    Recommend up to 10 restaurants for a user based on cosine similarity.

    For every restaurant id in `user_restaurants`, the 5 most similar other restaurants
    are selected; the per-restaurant candidates are merged, each candidate keeps its
    highest similarity, and the 10 best candidates are returned.

    Parameters:
        user_restaurants: iterable of business ids the user liked.
        all_restaurants_tfidf: RDD of (business_id, tfidf_vector) pairs.

    Returns:
        DataFrame with columns `business_id` (string) and `cosine_similarity` (double),
        at most 10 rows, ordered by descending similarity.
    '''
    # Local import so the function does not depend on a notebook-level alias.
    from pyspark.sql import functions as F

    # Schema for the output DF. Similarities are floating point, so the score column
    # must be a double (an integer column would misdescribe the data).
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("cosine_similarity", DoubleType(), True),
    ])

    result_restaurants = spark.createDataFrame([], schema)

    for rest_id in user_restaurants:

        # TF-IDF vector of the restaurant rated by the user. take(1) returns an empty list
        # (instead of raising, as first() does) when the id has no TF-IDF row.
        matching_features = all_restaurants_tfidf \
            .filter(lambda x, target_id=rest_id: x[0] == target_id) \
            .map(lambda x: x[1]) \
            .take(1)
        if not matching_features:
            continue
        user_rated_restaurant_features = matching_features[0]

        # Cosine similarity of every other restaurant against the rated one.
        # NOTE(review): only rest_id itself is excluded, so another restaurant from
        # user_restaurants can still show up as a recommendation - confirm this is intended.
        similar_restaurants_rdd = all_restaurants_tfidf \
            .filter(lambda x, target_id=rest_id: x[0] != target_id) \
            .map(lambda x, target_vec=user_rated_restaurant_features:
                 (x[0], cosineSimilarity(x[1], target_vec)))

        similar_restaurants_df = similar_restaurants_rdd.toDF(schema) \
            .orderBy(desc('cosine_similarity')).limit(5)
        result_restaurants = result_restaurants.union(similar_restaurants_df)

    # A restaurant can be a candidate for several rated restaurants; keep its best score
    # (dropDuplicates would keep an arbitrary one) before taking the overall top 10.
    result_restaurants_top10 = result_restaurants \
        .groupBy('business_id') \
        .agg(F.max('cosine_similarity').alias('cosine_similarity')) \
        .orderBy(desc('cosine_similarity')) \
        .limit(10)

    return result_restaurants_top10

In [9]:
def getRestaurantDetails(sim_rest):
    '''
    Attach business details to a DataFrame of recommended restaurant ids.

    `sim_rest` is inner-joined with `business_df` on `business_id`; the result keeps the id
    and cosine similarity from `sim_rest` plus name, categories, stars, review_count,
    latitude and longitude from `business_df`.
    '''
    joined = sim_rest.join(business_df, on='business_id', how='inner')

    business_fields = ('name', 'categories', 'stars', 'review_count', 'latitude', 'longitude')
    selected_columns = [sim_rest.business_id, sim_rest.cosine_similarity]
    selected_columns += [business_df[field] for field in business_fields]

    return joined.select(*selected_columns)

# Find Best Recommendations

### 1. Best Restaurants based on past reviews of the user

In [10]:
# Target user for this run (fixed id).
# To pick a random user instead, uncomment the two lines below; this additionally
# requires `from pyspark.sql.functions import rand`.
# usr_id = reviews_df.select('user_id').orderBy(rand()).limit(1).collect()
# target_user = [val.user_id for val in usr_id][0]
target_user = 'KVehpTNCERwMGAx-h9MR-A'

# Create the user profile from the restaurants the user rated above 3 stars
user_reviews = reviews_df.filter((reviews_df.user_id == target_user) & (reviews_df.stars > 3.0)) \
                         .select(reviews_df.business_id).distinct()

# collect() is safe here: it only retrieves the restaurants reviewed by a single user
user_restaurants = [val.business_id for val in user_reviews.collect()]
user_past_restaurants = user_reviews.join(business_df, on='business_id', how='inner')

print(f'\nRestaurants reviewed by target user: {target_user}')
user_past_restaurants.select('business_id', 'name', 'categories', 'stars').show()

# (business_id => TF-IDF vector of the restaurant's reviews)
all_restaurant_tfidf = transformed_reviews_by_business.select('business_id', 'tfidf_vec') \
                                                      .rdd.map(lambda x: (x[0], x[1]))

# Fetch recommendations for target user
recommended_restaurants = getUserRecommendationsTop10(user_restaurants, all_restaurant_tfidf)

# Get details about the recommended restaurants
recommended_restaurants_details = getRestaurantDetails(recommended_restaurants)

print(f'\nRecommendations for User: {target_user}')
# Sort while the similarity column is still present, then drop the columns not shown.
recommended_restaurants_details.orderBy(desc('cosine_similarity')) \
                               .drop('cosine_similarity', 'latitude', 'longitude') \
                               .show(10)


                                                                                


Restaurants reviewed by target user: KVehpTNCERwMGAx-h9MR-A
+--------------------+-------------------+--------------------+-----+
|         business_id|               name|          categories|stars|
+--------------------+-------------------+--------------------+-----+
|qBRlwvEuAeTBh_yQL...|Bières et Compagnie|Brazilian, Restau...|  3.5|
|Wr0zpZnYtLxlRcP60...|          Pica Pica|Spanish, Tapas Ba...|  4.5|
+--------------------+-------------------+--------------------+-----+



                                                                                


Recommendations for User: KVehpTNCERwMGAx-h9MR-A


                                                                                

+--------------------+-------------------+--------------------+-----+------------+
|         business_id|               name|          categories|stars|review_count|
+--------------------+-------------------+--------------------+-----+------------+
|HSrbuWbiBnkujnfCJ...|     Vices et Versa|Poutineries, Rest...|  4.0|         139|
|8NHV3DBvq9Q88Ztzd...|      Le Saint-Bock|Restaurants, Bras...|  4.0|         197|
|B0YUHoSSVpq4a8Uh2...|Brasserie Harricana|Breakfast & Brunc...|  4.0|          68|
|lHazo7qKEcSc62PMU...|    L'Amère à Boire|Tapas Bars, Resta...|  4.0|          67|
|JgKpVoemZ-Khkfgif...|     The Bier Markt|Food, Beer, Wine ...|  3.5|         221|
|TFYkVf814tT6gDUBo...|              Tapeo|Restaurants, Tapa...|  4.5|          69|
|xNpD5ObRmK0q87f6Q...|           Tapas 24|Spanish, Tapas/Sm...|  3.5|         102|
|zWVSNGmW2O1iN7PEV...|        Taza Flores|Tex-Mex, Restaura...|  4.5|          40|
|sYCN1nTvy-gIUXf_H...|          Toro Toro|Restaurants, Tapa...|  3.5|          34|
|10R

### 2. Best Restaurants Recommendations based on a Keyword

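A user may want recommendations based on a specific keyword, such as a particular type of restaurant or a specific food item. In that case, the keyword is converted to a TF-IDF vector with the same pipeline and compared against every restaurant's review vector.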


In [11]:
def keyWordsRecommendation(keyword, all_restaurant_tfidf):
    '''
    Recommend the 10 restaurants whose review TF-IDF vectors are most similar to a keyword.

    Parameters:
        keyword: free-text keyword (e.g. a cuisine or a dish).
        all_restaurant_tfidf: RDD of (business_id, tfidf_vector) pairs.

    Returns:
        DataFrame with columns `business_id` and `cosine_similarity`, at most 10 rows,
        ordered by descending similarity. The DataFrame is empty when none of the keyword's
        tokens is part of the fitted vocabulary, because no similarity can be computed then.
    '''

    keyword_df = sc.parallelize([(0, keyword)]).toDF(['business_id', 'text'])

    # Create the TF-IDF vector of the keyword with the same fitted pipeline as the reviews
    keyword_df = reviews_pipeline.transform(keyword_df)
    # Only one row is collected here, so it is safe
    keyword_tfidf = keyword_df.select('tfidf_vec').collect()[0][0]

    # An all-zero keyword vector has zero norm, which makes every cosine similarity
    # undefined; return an empty result rather than an arbitrary top 10.
    if keyword_tfidf.numNonzeros() == 0:
        empty_schema = StructType([
            StructField('business_id', StringType(), True),
            StructField('cosine_similarity', DoubleType(), True),
        ])
        return spark.createDataFrame([], empty_schema)

    # Cosine similarity of every restaurant against the keyword vector
    similar_restaurants_rdd = all_restaurant_tfidf.map(lambda x: (x[0], cosineSimilarity(x[1], keyword_tfidf)))

    similar_restaurants = similar_restaurants_rdd.toDF(['business_id', 'cosine_similarity']) \
                                                 .orderBy(desc('cosine_similarity')).limit(10)

    return similar_restaurants


In [12]:
keyword = 'Indian'

print(f'\nRestaurants similar to keyword - {keyword}')
# Ids and similarity scores of the 10 restaurants closest to the keyword
similar_restaurant_scores = keyWordsRecommendation(keyword, all_restaurant_tfidf)

# Fetch details of similar restaurants; the selected restaurants are displayed by star rating.
# Only columns that exist in the details DataFrame are dropped.
similar_restaurants = getRestaurantDetails(similar_restaurant_scores)
similar_restaurants.drop('cosine_similarity', 'latitude', 'longitude').orderBy(desc('stars')).show()


Restaurants similar to keyword - Indian


                                                                                

+--------------------+-----------------+--------------------+-----+------------+
|         business_id|             name|          categories|stars|review_count|
+--------------------+-----------------+--------------------+-----+------------+
|6wZIRMkNY3XVF3rSo...| Le Super Qualité| Restaurants, Indian|  4.5|          28|
|B3ucstFotOHce9VhS...|Restaurant Gandhi| Indian, Restaurants|  4.0|         119|
|rwLm3556i3xw2n3ND...|          Thanjai|Vegan, Restaurant...|  4.0|         120|
|vVW4aVlo0bxPrgEKH...|    Notre Endroit| Restaurants, Indian|  4.0|          26|
|Ra6dcmjOIqDdBXrem...|          Singh's| Indian, Restaurants|  4.0|          90|
|82fdk8YOZ67hl_t66...|           Le Taj|Middle Eastern, I...|  4.0|         253|
|A_05gZ2QwayLb0YeQ...|  Chennai Express| Restaurants, Indian|  4.0|          53|
|gA4bnK91phT5FTzG4...|  Restaurant Devi|Indian, Restauran...|  3.5|          86|
|5HAv0eKeid0MIEAr-...|        Taj Mahal| Restaurants, Indian|  3.0|          22|
|5KsvP10z9InBcI6hh...|  Buff