# Import Libraries

In [136]:
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import desc
from pyspark.sql.functions import col
from pyspark.sql.types import *

In [137]:
# Start (or reuse) the notebook's Spark session and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName("Content Based restaurant recommendation")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the sampled data

In [138]:
# Each sampled dataset lives in its own parquet directory; read them in a fixed order
# and unpack into the three DataFrames used by the rest of the notebook.
business_df, reviews_df, users_df = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        'Sample_Datasets/montreal_business',
        'Sample_Datasets/montreal_reviews',
        'Sample_Datasets/montreal_users',
    )
)

# Process Reviews
1. Group reviews of restaurants
2. Generate a Word2Vec vector from the reviews.

### Step 1: Group reviews of restaurants

In [139]:
# Show the raw review schema before splitting.
reviews_df.printSchema()

# 60/40 train/test split of the individual reviews; the fixed seed keeps it repeatable.
reviews_df_train, reviews_df_test = reviews_df.randomSplit([0.6, 0.4], seed=42)

reviews_text = reviews_df_train.select('business_id', 'text')

# Group reviews by restaurant ID: all training reviews of a restaurant are joined into
# one space-separated document. This stays in the DataFrame API (no Python RDD round
# trip), and collect_list skips null texts instead of failing on `None + " "`.
reviews_by_business_df = reviews_text \
    .groupBy('business_id') \
    .agg(F.concat_ws(' ', F.collect_list('text')).alias('text'))

# Lazy (business_id, text) RDD view, kept only so the original variable name still exists.
reviews_by_business = reviews_by_business_df.rdd.map(lambda row: (row[0], row[1]))

reviews_by_business_df.show(3)

root
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)



                                                                                

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|-1xuC540Nycht_iWF...|During my week-lo...|
|-4TVAE4KcLWVSXO2r...|While almost ever...|
|-ldO91cdwbzIHN5hD...|Went here a while...|
+--------------------+--------------------+
only showing top 3 rows



In [140]:
# Number of distinct restaurants with at least one review in the training split.
(
    reviews_df_train
    .select('business_id')
    .distinct()
    .count()
)

                                                                                

4131

In [141]:
# Number of distinct restaurants with at least one review in the test split.
(
    reviews_df_test
    .select('business_id')
    .distinct()
    .count()
)

                                                                                

3996

### Step 2: Convert reviews to Word2Vec feature vector

In [142]:
# Text pipeline: tokenize -> remove stop words -> TF-IDF features and a Word2Vec vector.
# The pattern is a raw string so that `\w` is not parsed as a (invalid) string escape;
# gaps=False makes the pattern match the tokens themselves rather than the separators.
tokenize_sentence = RegexTokenizer(pattern=r'\w+', inputCol='text', outputCol='tokens', toLowercase=True, gaps=False)
remove_stopwords = StopWordsRemover(inputCol='tokens', outputCol='nostopwords')
# 1000 words to limit number of features and reduce overfitting
countVectorizer = CountVectorizer(inputCol='nostopwords', outputCol='tf', vocabSize=1000)
tfiDF = IDF(inputCol='tf', outputCol='tfidf_vec')
# 100-dimensional Word2Vec vector per restaurant document; the seed keeps training repeatable.
# NOTE(review): the cells visible in this notebook only read 'word_2_vec' afterwards; the
# tf / tfidf_vec stages could be dropped if nothing else depends on them.
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol='nostopwords', outputCol='word_2_vec', seed=123)
pipeline = Pipeline(stages=[tokenize_sentence, remove_stopwords, countVectorizer, tfiDF, word2Vec])

# Fit on the per-restaurant review documents and persist the fitted model
# (any previous save at 'reviews_pipeline' is overwritten).
reviews_pipeline = pipeline.fit(reviews_by_business_df)
reviews_pipeline.write().overwrite().save('reviews_pipeline')

                                                                                

In [143]:
# Reload the fitted pipeline that the previous cell saved under 'reviews_pipeline',
# then run every stage over the per-restaurant review documents.
reviews_pipeline = PipelineModel.load('reviews_pipeline')
transformed_reviews_by_business = reviews_pipeline.transform(reviews_by_business_df)
# Print the resulting schema (the input columns plus the columns added by the stages).
transformed_reviews_by_business.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf_vec: vector (nullable = true)
 |-- word_2_vec: vector (nullable = true)



# Calculate cosine similarity between reviews of restaurants

In [144]:
def cosineSimilarity(vector1, vector2):
    '''
    Cosine similarity between two feature vectors, computed as
    (A . B) / (sqrt(A . A) * sqrt(B . B)).

    Parameters:
        vector1, vector2: equal-length numeric vectors (anything np.dot accepts).

    Returns:
        The similarity as a Python float. If either vector has zero norm the
        similarity is undefined (0 / 0); 0.0 is returned instead of nan so that
        downstream sorting and scaling stay well defined.
    '''

    dot_product = np.dot(vector1, vector2)
    product_of_norms = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))

    # An all-zero vector has no direction: report "no similarity" rather than dividing by zero.
    if product_of_norms == 0:
        return 0.0

    return float(dot_product / product_of_norms)

In [145]:
def getUserRecommendationsTop10(user_restaurants, all_restaurants_word2vec):
    '''
    Rank candidate restaurants by how similar their review vectors are to the
    restaurants the user has already rated.

    Parameters:
        user_restaurants: iterable of business_id strings rated by the user.
            Ids that have no entry in all_restaurants_word2vec are ignored.
        all_restaurants_word2vec: RDD of (business_id, feature vector) pairs.

    Returns:
        DataFrame with columns business_id (string), cosine_similarity (double)
        and prediction (double), one row per candidate restaurant, ordered by
        descending cosine_similarity. cosine_similarity is the highest similarity
        between the candidate and any of the user's restaurants; the user's own
        restaurants are never returned. The result is not truncated: the caller
        decides how many rows to keep (e.g. show(10)).
    '''

    # Schema of the scored candidates; the similarity is a float, hence a double column.
    scored_schema = StructType([
                            StructField("business_id", StringType(), True)
                            ,StructField("cosine_similarity", DoubleType(), True)
                        ])

    rated_ids = set(user_restaurants)

    # Feature vectors of the restaurants rated by the user. This is a small list
    # (one vector per rated restaurant), so collecting it to the driver is safe.
    rated_vectors = all_restaurants_word2vec \
        .filter(lambda x: x[0] in rated_ids) \
        .map(lambda x: x[1]) \
        .collect()

    if rated_vectors:
        # Single pass over the candidates: keep the best similarity to any rated
        # restaurant. reduceByKey guarantees one row per business_id even if the
        # input RDD contains the same id more than once.
        scored_rdd = all_restaurants_word2vec \
            .filter(lambda x: x[0] not in rated_ids) \
            .map(lambda x: (x[0], max(cosineSimilarity(x[1], rated) for rated in rated_vectors))) \
            .reduceByKey(max)
        scored_df = spark.createDataFrame(scored_rdd, scored_schema)
    else:
        # Nothing to compare against: return an empty, correctly typed result.
        scored_df = spark.createDataFrame([], scored_schema)

    # Scale cosine similarity to ratings 1-5 (linear map of [0, 1] onto [1, 5]).
    # NOTE: a negative similarity maps below 1; values are not clipped.
    sim_min, sim_max = 0, 1
    rating_min, rating_max = 1, 5
    scaled = (F.col('cosine_similarity') - sim_min) / (sim_max - sim_min) * (rating_max - rating_min) + rating_min

    # F.round is the Spark column function; the builtin round() cannot round a Column.
    result_restaurants_top10 = scored_df \
        .withColumn("prediction", F.round(scaled, 2)) \
        .orderBy(desc('cosine_similarity'))

    return result_restaurants_top10

In [146]:
def getRestaurantDetails(sim_rest):
    '''
    Attach business attributes to a DataFrame of recommended restaurants.

    sim_rest must provide the columns business_id, cosine_similarity and
    prediction. It is inner-joined with business_df on business_id, so
    recommendations without a matching business row are dropped.
    '''

    joined = sim_rest.join(business_df, on='business_id', how='inner')

    # Output columns, in display order.
    wanted_columns = [
        sim_rest['business_id'],
        sim_rest['cosine_similarity'],
        business_df['name'],
        business_df['categories'],
        business_df['stars'],
        business_df['review_count'],
        business_df['latitude'],
        business_df['longitude'],
        sim_rest['prediction'],
    ]

    return joined.select(*wanted_columns)

# Find Best Recommendations

### 1. Best Restaurants based on past reviews of the user

In [147]:
# Distinct reviewers in each split.
train_users = (
    reviews_df_train
    .select('user_id')
    .distinct()
)
test_users = (
    reviews_df_test
    .select('user_id')
    .distinct()
)

# Users that appear in both splits: only these can be profiled on the training
# reviews and then checked against their test reviews.
common_users = train_users.join(test_users, on='user_id', how='inner')
common_users.show(truncate=False)




                                                                                

+----------------------+
|user_id               |
+----------------------+
|EaBKe-8LB-NHuH7Us-QhGw|
|U9mfsF7Knvi4qkulS6k-Pg|
|VvBypfrIzspqwrbox_sUHA|
|M7vDDzoPNQDN2FdTcwCq4A|
|ucssLdHJmpo3lEyUtLi1Vg|
|6S4uxbPpb9pMk9bKMUVZng|
|eCy43X201JNHYKbPDN8xpA|
|s6qQj3otWtvBTv3wugeD5A|
|5cZEgqHwiN4XA2eOWWqnug|
|j78OeM6cLZ3OsFlJwwOgpA|
|4RjoIBBL450Z6EQenmWkSg|
|gurcmVDESVQ8X20xpw4btw|
|-7JSlmBJKUQwREG_yGuduQ|
|tCRK65LrfCEC1LzFS8LwvQ|
|IAP_k_esHa84vGKr8QerqQ|
|BxNBl3VrzLBAkwpJ3EUgXg|
|gGjQnTix1FeRPsOilTselA|
|xahq_WAFi6SYHFdwYVfVkg|
|B41w9LUYQHbkTjFRfpHEjA|
|N2HkEZn_EEtsy-KBg2NEmQ|
+----------------------+
only showing top 20 rows



In [155]:
from pyspark.sql.functions import rand

# To pick a random user instead of the fixed one below (this is what `rand` is imported for):
#   sampled = reviews_df.select('user_id').orderBy(rand()).limit(1).collect()
#   target_user = sampled[0].user_id

# Fixed target user, chosen because they have a lot of reviews.
target_user = '6S4uxbPpb9pMk9bKMUVZng'
# Other user ids tried during development (the trailing numbers are copied from the
# original notes; presumably the RMSE obtained for that user - TODO confirm):
#   'M7vDDzoPNQDN2FdTcwCq4A' 0.7
#   'bds7VxgVm0e6Pu5RuVV-wg' 1.6
#   'VvBypfrIzspqwrbox_sUHA'
#   'KVehpTNCERwMGAx-h9MR-A'

# Show the target user's reviews in the training split, then in the test split.
reviews_df_train.where(col('user_id') == target_user).show()
reviews_df_test.where(col('user_id') == target_user).show()
# User profile: the distinct restaurants the target user rated 3 stars or higher
# in the training split.
user_reviews = (
    reviews_df_train
    .where((col('user_id') == target_user) & (col('stars') >= 3.0))
    .select('business_id')
    .distinct()
)

# collect() is safe here: it only brings back the restaurants reviewed by one user.
user_restaurants = [row.business_id for row in user_reviews.collect()]
user_past_restaurants = user_reviews.join(business_df, on='business_id', how='inner')

print(f'\nRestaurants reviewed by target user: {target_user}')
user_past_restaurants.select('business_id', 'name', 'categories', 'stars').show()

# (business_id, Word2Vec vector) pairs for every restaurant with training reviews
all_restaurant_word2vec = transformed_reviews_by_business.select('business_id', 'word_2_vec') \
                                                    .rdd.map(lambda x: (x[0], x[1]))

# Fetch recommendations for target user
recommended_restaurants = getUserRecommendationsTop10(user_restaurants, all_restaurant_word2vec)

# Get details about the recommended restaurants
recommended_restaurants_details = getRestaurantDetails(recommended_restaurants)

print(f'\nRecommendations for User: {target_user}')
# The join above does not preserve row order, so sort again here. Sort while
# 'cosine_similarity' is still a column, and only then drop the columns that are
# not displayed (sorting by an already-dropped column is fragile).
recommended_restaurants_details.orderBy(desc('cosine_similarity')) \
                           .drop('cosine_similarity', 'latitude', 'longitude') \
                           .show(10)


                                                                                

+--------------------+--------------------+--------------------+-----+--------------------+
|         business_id|             user_id|           review_id|stars|                text|
+--------------------+--------------------+--------------------+-----+--------------------+
|mmt2tfx6HkMPVJs2E...|6S4uxbPpb9pMk9bKM...|oy2lszrw8jywT_z1-...|  4.0|Great food here. ...|
+--------------------+--------------------+--------------------+-----+--------------------+



                                                                                

+--------------------+--------------------+--------------------+------------+--------------------+
|         business_id|             user_id|           review_id|actual_stars|                text|
+--------------------+--------------------+--------------------+------------+--------------------+
|0W4lkclzZThpx3V65...|6S4uxbPpb9pMk9bKM...|KNqN9LOd8TNMx3A0Y...|         4.0|A solid sandwich!...|
|1uPQTz5XZSWr0ti7k...|6S4uxbPpb9pMk9bKM...|IqukAFt9bocPd3Gm8...|         4.0|We were staying n...|
|_YzwG_p4MXtZnb3ZE...|6S4uxbPpb9pMk9bKM...|EOW5y_tqd2qQnhvzA...|         4.0|We were staying n...|
|x88xL8tJEhOS1Qxlv...|6S4uxbPpb9pMk9bKM...|7ZgD_0diS4b72Y2n4...|         3.0|We were walking b...|
+--------------------+--------------------+--------------------+------------+--------------------+


Restaurants reviewed by target user: 6S4uxbPpb9pMk9bKMUVZng


                                                                                

+--------------------+---------+--------------------+-----+
|         business_id|     name|          categories|stars|
+--------------------+---------+--------------------+-----+
|mmt2tfx6HkMPVJs2E...|Keung Kee|Restaurants, Cant...|  3.5|
+--------------------+---------+--------------------+-----+



                                                                                


Recommendations for User: 6S4uxbPpb9pMk9bKMUVZng




+--------------------+--------------------+--------------------+-----+------------+----------+
|         business_id|                name|          categories|stars|review_count|prediction|
+--------------------+--------------------+--------------------+-----+------------+----------+
|aJRoS2yy8HWQAMWvk...|Les Saisons de Corée| Restaurants, Korean|  3.5|          24|      4.93|
|Izd-To33XTMnjM19X...|               Amigo|Restaurants, Chinese|  4.0|          45|      4.92|
|2ZEnhnWEascgZDMdf...|           Chez Chen|Chinese, Restaurants|  4.5|          31|      4.92|
|bte91zhh1rTqMvlI-...|     Bangkok Express|   Restaurants, Thai|  3.0|          31|      4.91|
|ajkx7NWnjraj_uz20...|Restaurant Cuisin...|Restaurants, Chinese|  3.0|          31|      4.91|
|IIVGlLVf1oggMOOtb...|   Soups et Nouilles|Chinese, Restaurants|  3.0|          13|      4.91|
|q64TlS17kNMIxb8B6...|           Ave Seoul|Korean, Barbeque,...|  4.0|          40|      4.91|
|t749-3jKKDyO8QH3V...|         Chao Phraya|   Thai

                                                                                

# Evaluation

In [156]:
# target_user = 'bds7VxgVm0e6Pu5RuVV'

# Rename the user's rating so it does not clash with the business-level 'stars'
# column carried by the recommendation details.
reviews_df_test = reviews_df_test.withColumnRenamed("stars", "actual_stars")

# Keep only recommended restaurants that the target user actually reviewed in the test split.
evaluation = recommended_restaurants_details.join(
    reviews_df_test.filter(col('user_id') == target_user),
    on='business_id',
    how='inner',
)
evaluation.show()







+--------------------+------------------+--------------+--------------------+-----+------------+----------+-----------+----------+--------------------+--------------------+------------+--------------------+
|         business_id| cosine_similarity|          name|          categories|stars|review_count|  latitude|  longitude|prediction|             user_id|           review_id|actual_stars|                text|
+--------------------+------------------+--------------+--------------------+-----+------------+----------+-----------+----------+--------------------+--------------------+------------+--------------------+
|0W4lkclzZThpx3V65...|0.8424378031905233|    Schwartz's|Sandwiches, Speci...|  4.0|        2667| 45.516353| -73.577642|      4.37|6S4uxbPpb9pMk9bKM...|KNqN9LOd8TNMx3A0Y...|         4.0|A solid sandwich!...|
|1uPQTz5XZSWr0ti7k...|0.9580701551010609|Mai Xiang Yuan|Chinese, Dumpling...|  4.0|         443|  45.50831| -73.560778|      4.83|6S4uxbPpb9pMk9bKM...|IqukAFt9bocPd3Gm8...|

                                                                                

In [150]:
# Inspect the column names and types of the joined evaluation DataFrame.
evaluation.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cosine_similarity: double (nullable = true)
 |-- name: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- prediction: double (nullable = true)
 |-- user_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- actual_stars: double (nullable = true)
 |-- text: string (nullable = true)



In [157]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error between the predicted rating and the rating the user
# actually gave, over the rows of the evaluation DataFrame.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="actual_stars",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(evaluation)
print("rmse:", rmse)



rmse: 0.9177690341256891


                                                                                