# Recomendation System - Yelp

In [32]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

## Loading Dataset

In [2]:
df = pd.read_parquet('./dataset/yelp_dataset.parquet')
df.head(10)

Unnamed: 0,id_review,stars,useful,funny,cool,text,food mentioned,sentiment
0,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,"[pizza, egg, beef, cake, coffee, tea, pancake,...",positive
1,1,4.0,12,3,7,this atmosphere is certainly geared towards am...,"[pork, shrimp, ribs, bbq, ramen, potato]",positive
2,2,1.0,3,0,0,food and service are key components to enjoy a...,"[rice, dessert]",negative
3,3,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,"[rice, bread, crab, cake, dessert, cocktail]",positive
4,4,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,[egg],positive
5,5,5.0,0,0,0,its quite a drive to sparks but there are only...,"[soup, pie, falafel]",positive
6,6,5.0,1,0,1,my husband and i visited this location for lun...,"[steak, salad, cheese, shrimp, tea, taco]",positive
7,7,4.0,7,0,1,finally stopped by here after getting tired of...,"[chicken, fish, salad, rice, bread, egg, pork,...",positive
8,8,1.0,1,1,0,i went here for lunch after an extremely cold ...,"[soup, sandwich, meat]",negative
9,9,5.0,0,0,0,called and ordered the chicken tikka pizza es...,"[pizza, chicken]",positive


In [3]:
df.shape

(10000, 8)

In [4]:
df_cluster = df.copy()

# Preprocessing

In [5]:
# Checking the number of NaN values in the 'food mentioned' column
nan_count = df_cluster['food mentioned'].isna().sum()
total_rows = len(df_cluster)
nan_percentage = (nan_count / total_rows) * 100

nan_count, nan_percentage


(663, 6.63)

In [6]:
df_cluster = df_cluster.dropna()

In [7]:
# Checking the number of NaN values in the 'food mentioned' column
nan_count = df_cluster['food mentioned'].isna().sum()
total_rows = len(df_cluster)
nan_percentage = (nan_count / total_rows) * 100

nan_count, nan_percentage

(0, 0.0)

In [8]:
df_cluster.shape

(9337, 8)

## Text Cleaning/ Text Preprocessing

In [9]:
df_cluster['text'][0]

'we landed at the reno airport for our honeymoon in lake tahoe and saw the advertisement for the hash house  despite being tired from the long day and the time difference my eyes lit up with the gigantic portion of food shown in the ad\n\ndid not even get the name of the place but said to my wife thats a place to go eat  we were shocked to see it was in our hotel harrahs where we spent the first night and then drive to tahoe the next morning\n\nafter scoring coffee next store at starbucks we headed off to breakfast at the hash house  \n\ni ordered the machaca and my wife had blackberry granola pancake just one   when the food arrived placed in front of us was enough food to feed the masses  the pancake was larger than the standard plate with a container of syrup pushed into the enormous pancake and the machaca was piled high with shredded beef eggs and a mound of mashed potatoes\n\ni managed to finish the eggs beef along with the tortillas but left behind some of the potato  on the oth

In [10]:
df_cluster['text'] = df['text'].str.lower().replace(r'^\w\s', '').replace(r'\n', '', regex=True)

In [11]:
df_cluster.head()

Unnamed: 0,id_review,stars,useful,funny,cool,text,food mentioned,sentiment
0,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,"[pizza, egg, beef, cake, coffee, tea, pancake,...",positive
1,1,4.0,12,3,7,this atmosphere is certainly geared towards am...,"[pork, shrimp, ribs, bbq, ramen, potato]",positive
2,2,1.0,3,0,0,food and service are key components to enjoy a...,"[rice, dessert]",negative
3,3,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,"[rice, bread, crab, cake, dessert, cocktail]",positive
4,4,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,[egg],positive


In [12]:
df_cluster['food mentioned'] = df_cluster['food mentioned'].astype('string')


In [13]:
# Parsing the 'food_mentioned' column (assuming it contains strings of lists)
df_cluster['food mentioned'] = df_cluster['food mentioned'].apply(lambda x: x.strip('[]').split(', '))

# Exploding the 'food_mentioned' column
df_food = df_cluster.explode('food mentioned')

# Show the first few rows of the exploded dataframe to confirm the operation
df_food.head()

Unnamed: 0,id_review,stars,useful,funny,cool,text,food mentioned,sentiment
0,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,'pizza' 'egg' 'beef' 'cake' 'coffee' 'tea' 'pa...,positive
1,1,4.0,12,3,7,this atmosphere is certainly geared towards am...,'pork' 'shrimp' 'ribs' 'bbq' 'ramen' 'potato',positive
2,2,1.0,3,0,0,food and service are key components to enjoy a...,'rice' 'dessert',negative
3,3,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,'rice' 'bread' 'crab' 'cake' 'dessert' 'cocktail',positive
4,4,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,'egg',positive


In [14]:
df_food.head()

Unnamed: 0,id_review,stars,useful,funny,cool,text,food mentioned,sentiment
0,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,'pizza' 'egg' 'beef' 'cake' 'coffee' 'tea' 'pa...,positive
1,1,4.0,12,3,7,this atmosphere is certainly geared towards am...,'pork' 'shrimp' 'ribs' 'bbq' 'ramen' 'potato',positive
2,2,1.0,3,0,0,food and service are key components to enjoy a...,'rice' 'dessert',negative
3,3,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,'rice' 'bread' 'crab' 'cake' 'dessert' 'cocktail',positive
4,4,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,'egg',positive


In [15]:
def split_food_and_duplicate_rows(row):
    # Splitting the string into a list of foods
    foods = row['food mentioned'].replace("'", "").split()
    return [{**row.to_dict(), 'food mentioned': food} for food in foods]

# Applying the function and creating a new DataFrame
new_rows = [item for _, row in df_food.iterrows() for item in split_food_and_duplicate_rows(row)]
new_df = pd.DataFrame(new_rows)

In [16]:
new_df

Unnamed: 0,id_review,stars,useful,funny,cool,text,food mentioned,sentiment
0,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,pizza,positive
1,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,egg,positive
2,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,beef,positive
3,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,cake,positive
4,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,coffee,positive
...,...,...,...,...,...,...,...,...
33371,9998,5.0,1,0,0,very good salads and amazing mustard the patio...,salad,positive
33372,9999,3.0,0,0,0,after hearing so many great things about mayna...,steak,positive
33373,9999,3.0,0,0,0,after hearing so many great things about mayna...,cheese,positive
33374,9999,3.0,0,0,0,after hearing so many great things about mayna...,wine,positive


## Word2Vec

In [17]:
df_word2vec = new_df.copy()

In [19]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from gensim.models import Word2Vec

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Apply re.sub to remove non-alphabetic characters
df_word2vec['text'] = df_word2vec['text'].apply(lambda x: re.sub("[^A-Za-z ]", "", x))

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization to the 'review_full' column
df_word2vec['text'] = df_word2vec['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))



In [21]:
# Tokenize the text into words
df_word2vec['tokenized_text'] =df_word2vec['text'].apply(word_tokenize)

In [22]:
custom_stop_words = [
    'a', 'about', 'above', 'again', 'against', 'all', 'also', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at',
    'b', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'can', 'couldn',
    'd', 'did', "didn't", 'drink', 'do', 'does', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'e', 'each',
    'few', 'for', 'from', 'further', 'g', 'h', 'had', "hadn't", 'has', "hasn't", 'have', 'haven', "haven't", 'having',
    'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't",
    'it', "it's", 'its', 'itself', 'j', 'just', 'k', 'l', 'last' 'lot', 'lots','list', 'll', 'm', 'ma', 'me', 'mightn', 'more', 'most',
    'mustn', "mustn't", 'my', 'myself', 'n', 'neg', 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
    'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'p', 'q', 'r', 're', 's', 'same', 'shan', "shan't", 'she',
    "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the',
    'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'u', 'under', 'until', 'up', 'v', 'very', 'w', 'was', 'wasn', 'we', 'were', 'weren', 'what',
    'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', 'would', "won't", 'wouldn', 't', 'y',
    'you',  "youre", "youve", 'your', 'yours', 'yourself', 'yourselves', 'z'
]

In [23]:
df_word2vec['tokenized_text'] = df_word2vec['tokenized_text'].apply(lambda x: [word for word in x if word.lower() not in custom_stop_words])

In [24]:
# Train Word2Vec model using the 'tokenized_text' column
model = Word2Vec(sentences=df_word2vec['tokenized_text'], min_count=2, workers=4)

In [25]:
def rec(target_word):
    similar_words = model.wv.most_similar(target_word, topn=5)
    print(f"Similar words to {target_word}:")
    for word, _ in similar_words:
        similar = print(f"{word}")
    return similar

In [31]:
rec('pizza')

Similar words to pizza:
pepperoni
crust
breadstick
calzone
margherita
