# Import Libraries

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline, PipelineModel
import numpy as np
from pyspark.sql.functions import desc
from pyspark.sql.types import *

from pyspark.ml.feature import Word2Vec

In [3]:
# Create (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD-based steps further down.
spark = (
    SparkSession.builder
    .appName("Content Based restaurant recommendation")
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Load the sampled data

In [4]:
# Read the three sampled Montreal datasets (businesses, reviews, users) from parquet.
business_df, reviews_df, users_df = (
    spark.read.parquet(dataset_path)
    for dataset_path in (
        'Sample_Datasets/montreal_business',
        'Sample_Datasets/montreal_reviews',
        'Sample_Datasets/montreal_users',
    )
)

                                                                                

# Process Reviews
1. Group reviews of restaurants
2. Generate a Word2Vec feature vector from the reviews.

### Step 1 : Group reviews of restaurants

In [5]:
# Keep only the columns needed to build one text document per restaurant.
reviews_text = reviews_df.select('business_id', 'text')

# Turn each row into a (business_id, text) pair, then concatenate all review
# texts that share a business_id, separated by a single space.
business_text_pairs = reviews_text.rdd.map(lambda row: (row[0], row[1]))
reviews_by_business = business_text_pairs.reduceByKey(
    lambda left_text, right_text: left_text + " " + right_text
)

# Back to a DataFrame so the ML pipeline can consume it.
reviews_by_business_df = reviews_by_business.toDF(['business_id', 'text'])
reviews_by_business_df.show(3)

                                                                                

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|m8LwM6D7UkgZxCj7u...|It is very small ...|
|Xt6OhDqC8J69wTkMN...|Nice new spot to ...|
|luPvNx4XSxaM7pka8...|I'm writing this ...|
+--------------------+--------------------+
only showing top 3 rows



### Step 2: Convert reviews to a Word2Vec feature vector

In [6]:
# NOTE: the pipeline in the next cell builds Word2Vec embeddings, not TF-IDF vectors.
# The TF-IDF definitions below are kept for reference only.
# Term Frequency (TF): how often a term appears in a document
# TF(t, d) = number of times t appears in d / number of terms in d
# Inverse Document Frequency (IDF): how informative a term is across all documents
# IDF(t) = log(total number of documents / number of documents containing the term t)
# TF-IDF = TF * IDF

In [7]:
# Tokenizing the grouped reviews.
# gaps=False makes the pattern describe the tokens themselves (runs of word
# characters) rather than the separators between them. The pattern is a raw
# string so that `\w` reaches the regex engine unchanged instead of being parsed
# as an (invalid) Python string escape, which emits a warning on recent Pythons.
tokenize_sentence = RegexTokenizer(pattern = r'\w+', inputCol = 'text', outputCol = 'tokens', toLowercase=True, gaps = False)

# Removing the stop words
remove_stopwords = StopWordsRemover(inputCol = 'tokens', outputCol = 'nostopwords')

# Creating the Word2Vec model: 100-dimensional vectors, ignoring tokens that
# occur fewer than 5 times in the corpus.
word2vec = Word2Vec(vectorSize=100, minCount=5, inputCol="nostopwords", outputCol="w2v")

# Building a pipeline for execution of above stages in order
pipeline         = Pipeline(stages=[tokenize_sentence, remove_stopwords, word2vec])

# Fit all the stages on the per-restaurant review documents
reviews_pipeline = pipeline.fit(reviews_by_business_df)

# Saving the fitted pipeline (overwrites any previous copy at this path)
reviews_pipeline.write().overwrite().save('reviews_pipeline')

                                                                                

In [8]:
# Reload the fitted pipeline that the previous cell saved under 'reviews_pipeline'
# and apply it to the per-restaurant review documents. The printed schema lists
# the columns of the transformed DataFrame.

reviews_pipeline = PipelineModel.load('reviews_pipeline')
transformed_reviews_by_business = reviews_pipeline.transform(reviews_by_business_df)
transformed_reviews_by_business.printSchema()

                                                                                

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- w2v: vector (nullable = true)



# Calculate cosine similarity between the Word2Vec review vectors, then recommend the top 10 restaurants to a particular user

In [9]:
def cosineSimilarity(vector1, vector2):
    '''
    Cosine similarity between two equally sized feature vectors.

    Computed as (A . B) / (||A|| * ||B||), where ||X|| = sqrt(X . X).

    Parameters
    ----------
    vector1, vector2 : array-like of numbers with the same length.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either vector has zero norm.
        Without that guard the division would be 0/0 and yield NaN.
    '''

    dot_product = np.dot(vector1, vector2)
    product_of_norms = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))

    # A zero vector has no direction, so treat it as "not similar" instead of
    # propagating NaN into later sorting steps.
    if product_of_norms == 0:
        return 0.0

    return float(dot_product/product_of_norms)

In [10]:
def getUserRecommendationsTop10(user_restaurants, all_restaurants_tfidf) :
    '''
    Recommend up to 10 restaurants similar to the ones a user has already reviewed.

    For each restaurant in `user_restaurants`, the 5 most cosine-similar
    restaurants are selected as candidates. Candidates are merged, every
    business keeps its best score, and the 10 highest-scoring businesses are
    returned. Restaurants the user has already reviewed are never candidates.

    Parameters
    ----------
    user_restaurants : iterable of business ids reviewed by the user.
    all_restaurants_tfidf : RDD of (business_id, feature_vector) pairs.

    Returns
    -------
    DataFrame with columns `business_id` (string) and `cosine_similarity`
    (double), at most 10 rows. Empty when no candidate could be scored.
    '''

    # Schema for the output DF. The similarity is a float, so the column must
    # be a double (it was previously declared as an integer).
    schema = StructType([
                            StructField("business_id", StringType(), True)
                            ,StructField("cosine_similarity", DoubleType(), True)
                        ])

    result_restaurants = spark.createDataFrame([], schema)

    user_restaurants = list(user_restaurants)
    already_reviewed = set(user_restaurants)

    for rest_id in user_restaurants:

        # Feature vector of the restaurant rated by the user. take(1) returns an
        # empty list (instead of raising like first()) when the id has no vector.
        matches = all_restaurants_tfidf \
            .filter(lambda x, wanted=rest_id: x[0] == wanted) \
            .map(lambda x: x[1]) \
            .take(1)
        if not matches:
            continue
        user_rated_restaurant_features = matches[0]

        # Score every restaurant the user has not reviewed yet. NaN scores are
        # dropped because Spark orders NaN above every other value in a
        # descending sort, which would push them to the top of the ranking.
        similar_restaurants_rdd = all_restaurants_tfidf \
            .filter(lambda x, seen=already_reviewed: x[0] not in seen) \
            .map(lambda x, target=user_rated_restaurant_features:
                 (x[0], float(cosineSimilarity(x[1], target)))) \
            .filter(lambda pair: not np.isnan(pair[1]))

        # An explicit schema avoids schema inference, which fails on an empty RDD.
        similar_restaurants_df = spark.createDataFrame(similar_restaurants_rdd, schema) \
            .orderBy(desc('cosine_similarity')).limit(5)
        result_restaurants = result_restaurants.union(similar_restaurants_df)

    # A business can be a candidate for several reviewed restaurants: keep its
    # best score rather than an arbitrary one, then rank.
    result_restaurants_top10 = result_restaurants \
        .groupBy('business_id') \
        .agg({'cosine_similarity': 'max'}) \
        .withColumnRenamed('max(cosine_similarity)', 'cosine_similarity') \
        .orderBy(desc('cosine_similarity')).limit(10)

    return result_restaurants_top10

In [11]:
def getRestaurantDetails(sim_rest):
    '''
    Attach business details to a DataFrame of recommended restaurant ids.

    `sim_rest` is inner-joined with `business_df` on `business_id`; the result
    keeps the id, the similarity score and a fixed set of business columns, and
    only businesses rated 3.0 stars or more are returned.
    '''

    joined = sim_rest.join(business_df, on='business_id', how='inner')

    detail_columns = [
        sim_rest.business_id,
        sim_rest.cosine_similarity,
        business_df.name,
        business_df.categories,
        business_df.stars,
        business_df.review_count,
        business_df.latitude,
        business_df.longitude,
    ]
    selected = joined.select(*detail_columns)

    restaurant_details = selected.filter(business_df.stars >= 3.0)
    return restaurant_details

# Find Best Recommendations

### 1. Best Restaurants based on past reviews of the user

In [22]:
from pyspark.sql.functions import rand

# Selecting a random user (one random row of the reviews table)
usr_id = reviews_df.select('user_id').orderBy(rand()).limit(1).collect()
target_user = usr_id[0].user_id
# target_user = 'KVehpTNCERwMGAx-h9MR-A'

# Create user profile from the reviews the user has given (3 stars or more only)
user_reviews = reviews_df.filter( (reviews_df.user_id == target_user) & (reviews_df.stars >= 3.0) )\
                        .select(reviews_df.business_id).distinct()

# Here we use collect only to retrieve user reviewed restaurants, so it is safe
user_restaurants = [val.business_id for val in user_reviews.collect()]
user_past_restaurants = user_reviews.join(business_df, on='business_id', how = 'inner')

print(f'\nRestaurants reviewed by target user: {target_user}')
user_past_restaurants.select('business_id', 'name', 'categories', 'stars').show()

# (business_id => review feature vector)
all_restaurant_tfidf = transformed_reviews_by_business.select('business_id', 'w2v') \
                                                    .rdd.map(lambda x: (x[0], x[1]))

# Fetch recommendations for target user
recommended_restaurants = getUserRecommendationsTop10(user_restaurants, all_restaurant_tfidf)

# Get details about the recommended restaurants
recommended_restaurants_details = getRestaurantDetails(recommended_restaurants)

print(f'\nRecommendations for User: {target_user}')
# Sort while the similarity column still exists, then hide the helper columns.
# Previously the column was dropped first and then used as the sort key.
recommended_restaurants_details.orderBy(desc('cosine_similarity')) \
                           .drop('cosine_similarity','latitude', 'longitude').show(10)



Restaurants reviewed by target user: ydLrbFJtw5zwil371TlnDA
+--------------------+-----------------+--------------------+-----+
|         business_id|             name|          categories|stars|
+--------------------+-----------------+--------------------+-----+
|ZS9v8ceJte-kbvdy1...|        Café Névé|Food, Coffee & Te...|  4.5|
|3lX1V-2qaoEqYgC4o...|Le Moineau Masqué|Cafes, Restaurant...|  4.0|
+--------------------+-----------------+--------------------+-----+



                                                                                


Recommendations for User: ydLrbFJtw5zwil371TlnDA


                                                                                

+--------------------+----------------+--------------------+-----+------------+
|         business_id|            name|          categories|stars|review_count|
+--------------------+----------------+--------------------+-----+------------+
|CESgogUugOYph4p12...|         Kitsuné|Restaurants, Coff...|  4.0|          64|
|nlssnslcQPqsTWBkg...|      Le Brûloir|Food, Coffee & Te...|  4.5|          20|
|Tv70ZOXji7yCfsoe-...|    Cafe Central|  Cafes, Restaurants|  4.5|          16|
|K1ala_vciG1UETtJE...|      Café Pista|Cafes, Food, Brea...|  4.5|          34|
|Bc13zfSbo1AJjxdjP...|Café Expressions|Coffee & Tea, Foo...|  4.0|          26|
|WClZEbe0Eth8POTWj...|       Café 8 Oz|Cafes, Food, Coff...|  4.5|          21|
|A_Gl8o6xlhugD0r8F...|    Kokkino Cafe|Food, Coffee & Te...|  4.0|           8|
|vw2KJ_5tlFIALaHfz...|      Kahwa Café|Cafes, Restaurant...|  4.5|          42|
|3oEZqnsMSGsSEoXI5...|   L'Or en Grain|Cafes, Coffee & T...|  4.5|          12|
+--------------------+----------------+-

### 2. Best Restaurants Recommendations based on a Keyword

A user may want recommendations based on a specific keyword, such as a particular type of restaurant or a specific food item.


In [13]:
def keyWordsRecommendation(keyword, all_restaurant_tfidf):
    '''
    Recommend up to 10 restaurants whose review vectors are closest to a keyword.

    The keyword is run through the fitted `reviews_pipeline` to obtain its
    vector, which is then compared to every restaurant vector by cosine
    similarity.

    Parameters
    ----------
    keyword : text to search for (a word or a short phrase).
    all_restaurant_tfidf : RDD of (business_id, feature_vector) pairs.

    Returns
    -------
    DataFrame with columns `business_id` (string) and `cosine_similarity`
    (double), at most 10 rows. Empty when the keyword maps to an all-zero vector.
    '''

    schema = StructType([
                            StructField("business_id", StringType(), True)
                            ,StructField("cosine_similarity", DoubleType(), True)
                        ])

    keyword_df = sc.parallelize([(0, keyword)]).toDF(['business_id', 'text'])

    # Create word vector from keyword
    keyword_df = reviews_pipeline.transform(keyword_df)
    # Here we collect only one row, so it is safe
    keyword_tfidf = keyword_df.select('w2v').collect()[0][0]

    # An all-zero keyword vector carries no direction, so every similarity would
    # be undefined and the ranking meaningless: return no recommendations.
    # NOTE(review): presumably this happens when no token of the keyword is in
    # the fitted vocabulary -- confirm against the Word2Vec model behaviour.
    if not np.any(keyword_tfidf.toArray()):
        return spark.createDataFrame([], schema)

    # Fetch cosine similarity; NaN scores are dropped because Spark orders NaN
    # above every other value in a descending sort.
    similar_restaurants_rdd = all_restaurant_tfidf \
        .map(lambda x: (x[0], float(cosineSimilarity(x[1], keyword_tfidf)))) \
        .filter(lambda pair: not np.isnan(pair[1]))

    # An explicit schema avoids schema inference, which fails on an empty RDD.
    similar_restaurants = spark.createDataFrame(similar_restaurants_rdd, schema) \
                                             .orderBy(desc('cosine_similarity')).limit(10)

    return similar_restaurants


In [29]:
keyword = 'poutine'

print(f'\nRestaurants similar to keyword - {keyword}')

# Rank restaurants against the keyword, then attach their business details.
similar_restaurants = getRestaurantDetails(
    keyWordsRecommendation(keyword, all_restaurant_tfidf)
)

# Hide the helper columns and list the best-rated matches first.
(
    similar_restaurants
    .drop('input_business_id', 'cosine_similarity', 'latitude', 'longitude')
    .orderBy(desc('stars'))
    .show()
)


Restaurants similar to keyword - poutine


                                                                                

+--------------------+--------------------+--------------------+-----+------------+
|         business_id|                name|          categories|stars|review_count|
+--------------------+--------------------+--------------------+-----+------------+
|-FKQjRx3yFlPIVWhB...|     Poutine Laurier|Sandwiches, Delis...|  4.5|           4|
|mm2wLW24ESxNIEL2b...|        Poutineville|Poutineries, Cana...|  4.5|         536|
|spesLRasSihLlG2J2...|       Restaurant AA|Restaurants, Hot ...|  4.0|          36|
|EbLsUk8uKpA_vsYxf...|Planète Poutine e...|Restaurants, Pout...|  4.0|          15|
|5T6kFKFycym_GkhgO...|         La Banquise|Fast Food, Diners...|  4.0|        1945|
|QEHnipQSRUlshVQPO...|           Dic Ann's|Burgers, Restaurants|  3.5|           3|
|DJjjze3lyq3XKwFMt...|Restaurant La Bel...|Fast Food, Restau...|  3.5|           6|
|09enlT7b86opzmgL-...|Restaurants D Laf...|Restaurants, Fast...|  3.5|          21|
|F-SXXhfcCZpPya6V2...|           Valentine|Burgers, Hot Dogs...|  3.0|      

In [15]:
def _embed_sentence(sentence):
    '''
    Run one sentence through the fitted review pipeline.

    Returns the transformed single-row DataFrame and the value of its `w2v` column.
    '''
    single_row_df = sc.parallelize([(0, sentence)]).toDF(['business_id', 'text'])
    embedded_df = reviews_pipeline.transform(single_row_df)
    return embedded_df, embedded_df.select('w2v').collect()[0][0]


sentence1 = 'Good Indian Food'
sentence2 = 'Good Mexican food'

# Create the word vector for each sentence and print it
sentence1_df, sentence1_tfidf = _embed_sentence(sentence1)
print(sentence1_tfidf)

sentence2_df, sentence2_tfidf = _embed_sentence(sentence2)
print(sentence2_tfidf)


[-0.25863617161909735,-0.26485103368759155,-0.1658378591140111,0.000828633705774943,0.1919932203988234,0.3378332257270813,0.03475867584347725,0.30242328842480976,-0.011711617931723595,-0.058146166537577905,0.08294862136244774,0.09100144915282726,-0.06132379546761513,0.07542818039655685,-0.0027947602793574333,0.030439416567484535,0.12304001643011966,-0.08529726167519887,-0.036115189393361405,0.04767097036043803,0.0834480500780046,0.17473609000444412,-0.2632087549815575,0.06288785859942436,0.3026687850554784,0.02739808956782023,-0.009677484631538391,-0.06734986106554666,0.03040852646032969,-0.09516599208776218,0.029463755587736763,-0.19014890491962433,0.026717002193133034,0.020970497901240982,0.17514940599600473,0.10146499560990681,-0.07324376034860809,0.22417822728554407,-0.008080226058761278,-0.24123933259397745,-0.07190421099464098,-0.11949200928211212,-0.004041691310703754,-0.16597258672118187,-0.028653038665652275,-0.12763248880704242,0.03158486261963844,0.12734217320879299,-0.04172