# Import Libraries

In [75]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline, PipelineModel
import numpy as np
from pyspark.sql.functions import desc
from pyspark.sql.types import *
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator

In [76]:
spark = SparkSession \
        .builder \
        .appName("Content Based restaurant recommendation") \
        .getOrCreate()
sc = spark.sparkContext

# Load the sampled data

In [77]:
business_df = spark.read.parquet('Sample_Datasets/montreal_business')
reviews_df = spark.read.parquet('Sample_Datasets/montreal_reviews')
users_df = spark.read.parquet('Sample_Datasets/montreal_users')

# Process Reviews
1. Group reviews of restaurants
2. Generate a Word2Vec vector from the reviews.

### Step 1 : Group reviews of restaurants

In [78]:
reviews_df_train, reviews_df_test = reviews_df.randomSplit([0.8, 0.2], seed=3)
reviews_text = reviews_df_train.select('business_id', 'text')

# Group reviews by restaurant ID
reviews_by_business = reviews_text.rdd.map(lambda x: (x[0], x[1])).reduceByKey(lambda x, y: x + " " + y)
reviews_by_business_df = reviews_by_business.toDF(['business_id', 'text'])
reviews_by_business_df.show(3)

                                                                                

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|-1xuC540Nycht_iWF...|I was so hungry f...|
|-4TVAE4KcLWVSXO2r...|These guys won't ...|
|-ldO91cdwbzIHN5hD...|Not a fan of this...|
+--------------------+--------------------+
only showing top 3 rows



### Step 2: Convert reviews to Word2Vec feature vector

In [79]:
# Remove the stop words from text, and create the tf idf matrix
tokenize_sentence        = RegexTokenizer(pattern = '\w+', inputCol = 'text', outputCol = 'tokens', toLowercase=True, gaps = False)
remove_stopwords = StopWordsRemover(inputCol = 'tokens', outputCol = 'nostopwords')
# 1000 words to limit number of features and reduce overfitting
countVectorizer  = CountVectorizer(inputCol='nostopwords', outputCol='tf', vocabSize=1000)
tfiDF            = IDF(inputCol='tf', outputCol='tfidf_vec')
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwords', outputCol = 'word_2_vec', seed=123)
pipeline         = Pipeline(stages=[tokenize_sentence, remove_stopwords, countVectorizer, tfiDF, word2Vec])

reviews_pipeline = pipeline.fit(reviews_by_business_df)
reviews_pipeline.write().overwrite().save('reviews_pipeline')

                                                                                

In [80]:
reviews_pipeline = PipelineModel.load('reviews_pipeline')
transformed_reviews_by_business = reviews_pipeline.transform(reviews_by_business_df)
transformed_reviews_by_business.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf_vec: vector (nullable = true)
 |-- word_2_vec: vector (nullable = true)



# Calculate cosine similarity between reviews of restaurants

In [81]:
def cosineSimilarity(vector1, vector2):
    '''
    This function calculates the cosine similarity between the restaurant feature vector and target restaurant feature vector
    using formula (A.B) / (sqrt((A**2)  * sqrt((B**2)))
    '''

    dot_product = np.dot(vector1, vector2)
    product_of_norms = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))

    return float(dot_product/product_of_norms)

In [82]:
def getUserRecommendations(user_restaurants, all_restaurants_word2vec) :
    '''
    This function uses cosine similarity to recommend restaurants based for the user
    '''

    # schema for output DF
    schema = StructType([
                            StructField("business_id", StringType(), True)
                            ,StructField("cosine_similarity", IntegerType(), True)
                            ,StructField("prediction", DoubleType(), True)
                        ])

    result_restaurants = spark.createDataFrame([], schema)

    for rest_id in user_restaurants:

        # Get the feature values i.e. review texts for the restaurants rated by the user
        user_rated_restaurant_features = all_restaurants_word2vec.filter(lambda x: x[0] == rest_id).map(lambda x: x[1]).first()

        # Calculate cosine similarity
        similar_restaurants_rdd = all_restaurants_word2vec.filter(lambda x: x[0] != rest_id).map(lambda x: (x[0], cosineSimilarity(x[1], user_rated_restaurant_features)))

        similar_restaurants_df = similar_restaurants_rdd.toDF(schema = ['business_id', 'cosine_similarity']).orderBy(desc('cosine_similarity'))

        # Scale cosine similarity to ratings 1-5
        similar_restaurants_df = similar_restaurants_df.withColumn("prediction", round((similar_restaurants_df['cosine_similarity'] - 0) / (1 - 0) * (5 - 1) + 1, 2))

        result_restaurants = result_restaurants.union(similar_restaurants_df)

    result_restaurants = result_restaurants.dropDuplicates(['business_id']).orderBy(desc('cosine_similarity'))

    return result_restaurants

In [83]:
def getRestaurantDetails(sim_rest):
    '''
    Function to get the recommended restaurant details based on ids.
    '''

    restaurant_details = sim_rest.join(business_df, on='business_id', how = 'inner') \
                                 .select(sim_rest.business_id, \
                                       sim_rest.cosine_similarity, business_df.name, \
                                       business_df.categories, business_df.stars, business_df.review_count,
                                       business_df.latitude, business_df.longitude, sim_rest.prediction).filter(business_df.stars >= float(3.0))

    return restaurant_details

# Find Best Recommendations

### 1. Best Restaurants based on past reviews of the user

In [84]:
from pyspark.sql.functions import rand
# Selecting a random user
# usr_id = reviews_df.select('user_id').orderBy(rand()).limit(1).collect()
# target_user = [val.user_id for val in usr_id][0]
target_user = 'bds7VxgVm0e6Pu5RuVV-wg'

#Create user profile from the reviews the user has given
user_reviews = reviews_df_train.filter( (reviews_df_train.user_id == target_user) & (reviews_df_train.stars >= float(3.0)) )\
                        .select(reviews_df_train.business_id).distinct()

#Here we use collect only to retrieve user reviewed restaurants, so it is safe
user_restaurants = [val.business_id for val in user_reviews.collect()]
user_past_restaurants = user_reviews.join(business_df, on='business_id', how = 'inner')

print(f'\nRestaurants reviewed by target user: {target_user}')
user_past_restaurants.select('business_id', 'name', 'categories', 'stars').show()

# (business_id => [reviews word vectors] )
all_restaurant_word2vec = transformed_reviews_by_business.select('business_id', 'word_2_vec') \
                                                    .rdd.map(lambda x: (x[0], x[1]))

#Fetch recommendations for target user
recommended_restaurants = getUserRecommendations(user_restaurants, all_restaurant_word2vec)

#Get details about the recommended restaurants
recommended_restaurants_details = getRestaurantDetails(recommended_restaurants)

print(f'\nRecommendations for User: {target_user}')
recommended_restaurants_details.drop('cosine_similarity','latitude', 'longitude') \
                           .orderBy(desc('cosine_similarity')).show(10)


                                                                                


Restaurants reviewed by target user: bds7VxgVm0e6Pu5RuVV-wg


                                                                                

+--------------------+--------------------+--------------------+-----+
|         business_id|                name|          categories|stars|
+--------------------+--------------------+--------------------+-----+
|-Levln3VcfeXS4iHt...|                Nozy|Japanese, Restaur...|  4.5|
|QrFuG9RN3UfUeE843...|            Ramen-Ya|Noodles, Restaura...|  3.5|
|Jbj4VGceJVKXOoN0z...|   Dessert Cafe Momo|Coffee & Tea, Res...|  5.0|
|s0GeAnaUENWb2xAn_...|          Café Frida|Cafes, Mexican, B...|  4.5|
|q0GL761jc3ia_nEDt...|         Le Poké Bar|Seafood, Food, Ha...|  3.5|
|3cSe8U3u-rscNruJ6...|           Uniburger|Restaurants, Burg...|  4.0|
|qje0dtfyYDRrdgpmT...|   Pho Bang New York|Vietnamese, Resta...|  3.5|
|j3QIwGvwZHj459YHJ...|         Hotto Doggu|Hot Dogs, Restaur...|  3.5|
|s2I_Ni76bjJNK9yG6...|Maison Christian ...|Food, Sandwiches,...|  4.5|
|FTJqVHCcnjkvM3Bva...|           Seoul BBQ|Restaurants, Kore...|  4.0|
|nyeIcJRPs8n199nW4...|          Super Taco|Mexican, Restaurants|  4.0|
|f0-qm

                                                                                


Recommendations for User: bds7VxgVm0e6Pu5RuVV-wg




+--------------------+--------------------+--------------------+-----+------------+----------+
|         business_id|                name|          categories|stars|review_count|prediction|
+--------------------+--------------------+--------------------+-----+------------+----------+
|gZvwCOaMhxFXXNvy1...|             Marusan|Japanese, Restaur...|  4.5|         181|      4.93|
|xidr6_d3fwKSb_XaB...|            Le Ninja|Japanese, Sushi B...|  4.0|          32|      4.92|
|Wh_-S_tJvma15bxjS...|Saiko Bistrot Iza...|Sushi Bars, Japan...|  4.0|          78|      4.88|
|G1ynW1gclKMClO4It...|               Chops|Food, Asian Fusio...|  4.0|          15|      4.88|
|J2GTY3N_nH7d4L7ng...|     Espadon Plateau|Fish & Chips, Med...|  4.0|          13|      4.87|
|ZsA8J49XHGd6nrNhb...|              Toroli|Restaurants, Japa...|  4.5|          34|      4.86|
|EEHMMiJUklGGiikqd...|               Milos|  Restaurants, Greek|  4.0|         146|      4.86|
|AE-IzUqIkYfJrUqx3...|        La Republika|Filipin

                                                                                

### 2. Best Restaurants Recommendations based on a Keyword

In [85]:
def keyWordsRecommendation(keyword, all_restaurant_word2vec):

    keyword_df = sc.parallelize([(0, keyword)]).toDF(['business_id', 'text'])

    #Create word vector from keyword
    keyword_df = reviews_pipeline.transform(keyword_df)
    #Here we collect only one row, so it is safe
    keyword_w2v = keyword_df.select('word_2_vec').collect()[0][0]

    # Fetch cosine similarity
    similar_restaurants_rdd = all_restaurant_word2vec.map(lambda x: (x[0], cosineSimilarity(x[1], keyword_w2v)))

    similar_restaurants = similar_restaurants_rdd.toDF(['business_id', 'cosine_similarity']) \
                                             .orderBy(desc('cosine_similarity')).limit(10)

    # Scale cosine similarity to ratings 1-5
    similar_restaurants = similar_restaurants.withColumn("prediction", round((similar_restaurants['cosine_similarity'] - 0) / (1 - 0) * (5 - 1) + 1, 2))

    return similar_restaurants


In [86]:
keyword = 'Spicy Indian Food'

print(f'\nRestaurants similar to keyword - {keyword}')
similar_restaurants = keyWordsRecommendation(keyword, all_restaurant_word2vec)

# Fetch details of similar restaurants
similar_restaurants = getRestaurantDetails(similar_restaurants)
similar_restaurants.select('business_id', 'name','categories', 'stars','review_count').orderBy(desc('stars')).show()


Restaurants similar to keyword - Spicy Indian Food


                                                                                

+--------------------+--------------------+--------------------+-----+------------+
|         business_id|                name|          categories|stars|review_count|
+--------------------+--------------------+--------------------+-----+------------+
|f7RXwcus_-EOsR0q5...|               Didar| Indian, Restaurants|  4.5|          38|
|zQwAPLeF1hSXLIHjy...|   Restaurant Makkah|Indian, Afghan, H...|  4.5|          17|
|amlwSiqE8WrghAHO7...|    Thai Imagination|Asian Fusion, Chi...|  4.0|          10|
|Aol8whdcT7BfnhUCn...|    Étoile des Indes|Restaurants, Midd...|  4.0|          24|
|A_05gZ2QwayLb0YeQ...|     Chennai Express| Restaurants, Indian|  4.0|          53|
|eAOWvwdmCKIgFYx1a...|          Bab Sharqi|Syrian, Caterers,...|  4.0|           5|
|2f6N22VYSwEndSq7o...|   Restaurant Ganges| Restaurants, Indian|  3.5|          25|
|dckVjwfxYLVjDH516...| Restaurant Salateen|Restaurants, Chin...|  3.5|           5|
|-YxR1MgbIe3k__YBd...|Thali Cuisine Ind...|Food, Indian, Res...|  3.5|      

# Evaluation

In [87]:
reviews_df_test = reviews_df_test.withColumnRenamed("stars", "actual_stars")
evaluation = recommended_restaurants_details.join(reviews_df_test.where((col('user_id') == target_user)),['business_id'],'inner')

In [88]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="actual_stars",predictionCol="prediction")
rmse = evaluator.evaluate(evaluation)
print("rmse:",rmse)



rmse: 1.1721241401831122


                                                                                

# TF-IDF vs Word2Vec

In [89]:
def CompareWord2VecAndTFIDF(rest_id_1, res2_id_2):

    evaluation_restaurants2 = business_df.filter((col("business_id") == rest_id_1) | (col("business_id") == res2_id_2)).select('name','categories')

    #Word2Vec
    all_restaurants_word2vec = transformed_reviews_by_business.select('business_id', 'word_2_vec') \
                                                        .rdd.map(lambda x: (x[0], x[1]))

    user_rated_restaurant_features = all_restaurants_word2vec.filter(lambda x: x[0] == rest_id_1).map(lambda x: x[1]).first()

    similar_restaurants_rdd = all_restaurants_word2vec.filter(lambda x: x[0] == res2_id_2).map(lambda x: (x[0], cosineSimilarity(x[1], user_rated_restaurant_features)))


    similar_restaurants_df = similar_restaurants_rdd.toDF(schema = ['business_id', 'cosine_similarity']).orderBy(desc('cosine_similarity'))

    word2Vec_value = similar_restaurants_df.first()['cosine_similarity']

    # TF-IDF
    all_restaurants_tfidf = transformed_reviews_by_business.select('business_id', 'tfidf_vec') \
                                                        .rdd.map(lambda x: (x[0], x[1]))

    user_rated_restaurant_features = all_restaurants_tfidf.filter(lambda x: x[0] == rest_id_1).map(lambda x: x[1]).first()

    similar_restaurants_rdd = all_restaurants_tfidf.filter(lambda x: x[0] == res2_id_2).map(lambda x: (x[0], cosineSimilarity(x[1], user_rated_restaurant_features)))


    similar_restaurants_df = similar_restaurants_rdd.toDF(schema = ['business_id', 'cosine_similarity']).orderBy(desc('cosine_similarity'))

    tfiDF_value = similar_restaurants_df.first()['cosine_similarity']

    evaluation_restaurants2.show(truncate=False)
    print(f"Cosine similarity using Word2Vec: {word2Vec_value}")
    print(f"Cosine similarity using TF_IDF: {tfiDF_value}")


In [90]:
CompareWord2VecAndTFIDF("mm2wLW24ESxNIEL2bjseaQ","-FKQjRx3yFlPIVWhBZNttg")

                                                                                

+---------------+--------------------------------------------------------------------------------+
|name           |categories                                                                      |
+---------------+--------------------------------------------------------------------------------+
|Poutine Laurier|Sandwiches, Delis, Breakfast & Brunch, Poutineries, Restaurants                 |
|Poutineville   |Poutineries, Canadian (New), Restaurants, American (New), American (Traditional)|
+---------------+--------------------------------------------------------------------------------+

Cosine similarity using Word2Vec: 0.4984336589246826
Cosine similarity using TF_IDF: 0.027225698073355113
