# Cafe Recommender

### Imports

In [17]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.neighbors import NearestNeighbors
import numpy as np
from textblob import TextBlob
from geopy.distance import great_circle


### Data Cleaning

Started by reading in the business dataset from the Yelp academic dataset, filtered for coffee and tea business, and dropped irrelevant columns. Then read in the reviews dataset while merging the data with the business data.

In [18]:
# Read in business data
business_json_path = '/Users/amelialei/yelp_dataset/yelp_academic_dataset_business.json'
bus_df = pd.read_json(business_json_path, lines = True)


# Filter dataset for coffee and tea establishments
bus_df = bus_df[bus_df['categories'].str.contains('Bubble Tea|Coffee and Tea', case = False, na = False)] 


# Include only businesses that are open
bus_df = bus_df[bus_df['is_open'] == 1]


# Obtain wifi information for each business and record as 'unknown' if there is no known wifi data
def get_wifi_info(attributes):
    if isinstance(attributes, dict): 
        info = attributes.get('WiFi', 0) 
        if info != 0:
            info = info.replace("u'", "").replace("'", "").strip()
            if info == 'free':
                info = 1
            else:
                info = 0
    else:
        info = 0
    return info
    
bus_df['free_wifi'] = bus_df['attributes'].apply(get_wifi_info)


# Drop irrelevant columns 
drop_cols = ['is_open', 'review_count', 'attributes', 'categories']
bus_df = bus_df.drop(drop_cols, axis =1)



In [19]:
## Read in user reviews data
review_json_path = '/Users/amelialei/yelp_dataset/yelp_academic_dataset_review.json'

size = 1000000
review = pd.read_json(review_json_path, lines=True, dtype = {'review_id': str, 'user_id': str, 'business_id': str, 'stars': int, 'date': str, 'text': str, 'useful': int, 'funny': int, 'cool': int}, chunksize = size)


# Split data into chunks because the file is too large to read all at once. 
# Merge the reviews data with the businesses dataframe to create one dataframe.
chunk_list = []
for review_chunk in review:
    review_chunk = review_chunk.drop(['review_id', 'useful', 'funny', 'cool'], axis=1)
    review_chunk = review_chunk.rename(columns={'stars': 'review_stars'})
    chunk_merged = pd.merge(bus_df, review_chunk, on='business_id', how='inner')
    chunk_list.append(chunk_merged)
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

### Sentiment Analysis
Performed a sentiment analysis on the reviews to achieve a more hollistic view on users' experiences with the cafe.

In [20]:
# Remove stop words
df['text'] = (df['text']
              .str.lower()
              .replace(r'\band\b|\bor\b|\bthe\b|\bis\b|\bto\b', '', regex=True)
              .replace(r'[^A-Za-z\s]', '', regex=True))

# Stem words
ps = PorterStemmer()
def tokenize_and_stem(review):
    tokens = word_tokenize(review)
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# Calculate the review's polarity score
cleaned_reviews = df['text'].apply(tokenize_and_stem)
df['text'] = cleaned_reviews
df['sentiment'] = df['text'].apply(lambda review: TextBlob(review).sentiment.polarity)




### Collaborative Filtering
Used a collaborative filtering recommendation system to recommend new cafes based on other users with similar preferences. 

In [42]:
# Normalize numerical ratings 
df['normalized_stars'] = (df['stars'] - df['stars'].min())/(df['stars'].max()-df['stars'].min())
df['normalized_review_stars'] = (df['review_stars'] - df['review_stars'].min())/(df['review_stars'].max()-df['review_stars'].min())

# Create a user-item matrix consisting of users and their ratings for each business
user_item_matrix = df.pivot_table(index='user_id', columns='business_id', values='review_stars')

# Fill in missing ratings with the average rating for each cafe
user_item_matrix = user_item_matrix.apply(lambda col: col.fillna(col.mean()), axis=0)

# Use the nearest neighbors algorithm along with cosine similarity to measure similarity between users' preferences
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_item_matrix.values)

# Find the most similar users
user_id = '79nOboO-4_yNuQQ21EAU1A'
distances, indices = knn_model.kneighbors([user_item_matrix.loc[user_id]])
similar_users = indices.flatten()[1:]
similar_users_ids = user_item_matrix.index[similar_users]

# Average the similar users' cafe ratings and sort them in descending order to find the top cafe to recommend
cafe_recs = user_item_matrix.loc[similar_users_ids].mean(axis=0)








business_id
-3-6BB10tIWNKGEF0Es2BA    4.206767
-9r8nAzWyRSLxBWt8uQOdA    3.035061
-OX0MJDPRHV0RCRvwYnvBQ    4.666667
-TgNuaotu3vobBRt5KyiVw    4.142857
-VHqHLVWOLb1BsP9H-h7Ig    3.900000
                            ...   
zLN6ix7txwpfwRKFi2MfCQ    4.200000
zS7ygYwgDMh5HSDzG3oShA    4.277477
zUA1hGE1NWGO66OTbKuZuQ    4.461538
zWfzgYe2nT9mHYm_vl1Ukw    3.937636
zun6IVJa7wYe3wAPqWnPGw    4.454023
Length: 362, dtype: float64

### Location-Based Filtering
Used locaton-based filtering to give more weight to cafes that are closer to the user

In [22]:
# Latitude and longitude of user's location
user_location = (37.7749, 122.4194)

# Calculate the distance between the user's location and the cafes
def get_distance(cafe_location):
    return great_circle(user_location, cafe_location).miles

# Normalize distances for calculations
bus_df['distance'] = bus_df.apply(lambda row: get_distance((row['latitude'], row['longitude'])), axis=1)
bus_df['distance'] = 1 - (bus_df['distance']/bus_df['distance'].max())




### Hybrid Filtering ###
Combined user similarity, sentiment, and location to create a more personlized recommendation system

In [63]:
sentiment_df = df.groupby('business_id')['sentiment'].mean().to_frame()
similar_user_recs_df = cafe_recs.to_frame()
similar_user_recs_df.columns = ['rec_cafe_rating']

# Merge dataframes to match corresponding business data
features_df = bus_df.merge(similar_user_recs_df, how='inner', left_on='business_id', right_index=True)
features_df = features_df.merge(sentiment_df, how='inner', left_on='business_id', right_index=True)

# Calculate new score using a weighted formula to get final recommendations
features_df['final_score'] = features_df['rec_cafe_rating']*0.6 + features_df['sentiment']*0.2 + features_df['distance']*0.1 + features_df['free_wifi']*0.1
features_df = features_df.sort_values(by='final_score', ascending=False)
top_5 = features_df.head(5)
top_5

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,hours,free_wifi,distance,rec_cafe_rating,sentiment,final_score
145048,MM-IvUB7qBMrBTDm_4MbCw,Q Tea Vietnamese Cafe,671 Jeffco Blvd,Arnold,MO,63010,38.450577,-90.365982,5.0,"{'Tuesday': '14:0-21:0', 'Wednesday': '11:0-21...",1,0.112509,5.0,0.453742,3.201999
66810,xt-GigN6sRFQXKCIyIOCYA,Cafe de Blaire,442 N 4th St,St. Louis,MO,63102,38.629466,-90.18688,5.0,"{'Monday': '7:0-18:0', 'Tuesday': '7:0-19:0', ...",1,0.113424,4.909091,0.384256,3.133648
60823,ToW6MFXu2C6N04FualtFxQ,Yaa’s Fruitea Cafe,1200 South St,Philadelphia,PA,19147,39.943179,-75.161939,5.0,"{'Tuesday': '11:0-19:0', 'Wednesday': '11:0-19...",1,0.088892,4.9,0.347193,3.118328
51350,RGtMrLcbsss7lbZsdVYrqQ,Ding Tea Tucson,2739 E Speedway Blvd,Tucson,AZ,85716,32.236409,-110.931314,5.0,"{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ...",1,0.147552,4.851852,0.343786,3.094624
61686,EvwnRnlgQxjhlhhdK3iS_A,Dream House Cafe,"1331 Barataria Blvd, Ste C",Marrero,LA,70072,29.887263,-90.09891,5.0,"{'Wednesday': '11:0-20:30', 'Thursday': '11:0-...",1,0.041869,4.882353,0.245387,3.082676
