# Cafe Recommender

### Imports

In [78]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.neighbors import NearestNeighbors
import numpy as np
from textblob import TextBlob
from geopy.distance import great_circle


### Data Cleaning

Started by reading in the business dataset from the Yelp academic dataset, filtered for coffee and tea business, and dropped irrelevant columns. Then read in the reviews dataset while merging the data with the business data.

In [79]:
# Read in business data
business_json_path = '/Users/amelialei/yelp_dataset/yelp_academic_dataset_business.json'
bus_df = pd.read_json(business_json_path, lines = True)


# Filter dataset for coffee and tea establishments
bus_df = bus_df[bus_df['categories'].str.contains('Bubble Tea|Coffee and Tea', case = False, na = False)] 


# Include only businesses that are open
bus_df = bus_df[bus_df['is_open'] == 1]


# Obtain wifi information for each business and record as 'unknown' if there is no known wifi data
def get_wifi_info(attributes):
    if isinstance(attributes, dict): 
        info = attributes.get('WiFi', 0) 
        if info != 0:
            info = info.replace("u'", "").replace("'", "").strip()
            if info == 'free':
                info = 1
            else:
                info = 0
    else:
        info = 0
    return info
    
bus_df['free_wifi'] = bus_df['attributes'].apply(get_wifi_info)


# Drop irrelevant columns 
drop_cols = ['is_open', 'review_count', 'attributes', 'categories']
bus_df = bus_df.drop(drop_cols, axis =1)



In [80]:
## Read in user reviews data
review_json_path = '/Users/amelialei/yelp_dataset/yelp_academic_dataset_review.json'

size = 1000000
review = pd.read_json(review_json_path, lines=True, dtype = {'review_id': str, 'user_id': str, 'business_id': str, 'stars': int, 'date': str, 'text': str, 'useful': int, 'funny': int, 'cool': int}, chunksize = size)


# Split data into chunks because the file is too large to read all at once. 
# Merge the reviews data with the businesses dataframe to create one dataframe.
chunk_list = []
for review_chunk in review:
    review_chunk = review_chunk.drop(['review_id', 'useful', 'funny', 'cool'], axis=1)
    review_chunk = review_chunk.rename(columns={'stars': 'review_stars'})
    chunk_merged = pd.merge(bus_df, review_chunk, on='business_id', how='inner')
    chunk_list.append(chunk_merged)
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

### Sentiment Analysis
Performed a sentiment analysis on the reviews to achieve a more hollistic view on users' experiences with the cafe.

In [None]:
# Remove stop words
df['text'] = (df['text']
              .str.lower()
              .replace(r'\band\b|\bor\b|\bthe\b|\bis\b|\bto\b', '', regex=True)
              .replace(r'[^A-Za-z\s]', '', regex=True))

# Stem words
ps = PorterStemmer()
def tokenize_and_stem(review):
    tokens = word_tokenize(review)
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# Calculate the review's polarity score
cleaned_reviews = df['text'].apply(tokenize_and_stem)
df['text'] = cleaned_reviews
df['sentiment'] = df['text'].apply(lambda review: TextBlob(review).sentiment.polarity)

# Calculate the new review rating, taking into account it's sentiment
df['review_stars'] = df['review_stars']*0.8 + df['sentiment']*0.2


### Collaborative Filtering
Used a collaborative filtering recommendation system to recommend new cafes based on other users with similar preferences. 

In [None]:
# Normalize numerical ratings 
df['normalized_stars'] = (df['stars'] - df['stars'].min())/(df['stars'].max()-df['stars'].min())
df['normalized_review_stars'] = (df['review_stars'] - df['review_stars'].min())/(df['review_stars'].max()-df['review_stars'].min())

# Create a user-item matrix consisting of users and their ratings for each business
user_item_matrix = df.pivot_table(index='user_id', columns='business_id', values='review_stars')

# Fill in missing ratings with the average rating for each cafe
user_item_matrix = user_item_matrix.apply(lambda col: col.fillna(col.mean()), axis=0)

# Use the nearest neighbors algorithm along with cosine similarity to measure similarity between users' preferences
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_item_matrix.values)

# Find the most similar users
user_id = '79nOboO-4_yNuQQ21EAU1A'
distances, indices = knn_model.kneighbors([user_item_matrix.loc[user_id]])
similar_users = indices.flatten()[1:]
similar_users_ids = user_item_matrix.index[similar_users]

# Average the similar users' cafe ratings and sort them in descending order to find the top cafe to recommend
cafe_recs = user_item_matrix.loc[similar_users_ids].mean(axis=0).sort_values(ascending=False)

rec_cafe_df = bus_df[bus_df['business_id'].isin(cafe_recs.index)]






business_id
MM-IvUB7qBMrBTDm_4MbCw    4.090748
hemQ0_nE8du-ednYNYMvLw    4.056280
YTxOgn-vquErIXTA97Lrlw    4.049093
xt-GigN6sRFQXKCIyIOCYA    4.004124
BxLqgAycWWeq9wFbPh58pg    3.999373
                            ...   
Lkt2-GnwnnfexcYa_z9ezA    2.040707
7BZo_arIR_qvyyVMRBxQDg    1.992055
Q4NmqOOn2ZhzgxtcTJXh3g    1.943351
56YIYqSVmJWwOW5o1KxEYQ    1.868098
MFjH1KLyQVZuv6Q81xgcXg    1.842221
Length: 362, dtype: float64

### Location-Based Filtering

In [None]:
# Latitude and longitude of user's location
user_location = (37.7749, 122.4194)

# Calculate the distance between the user's location and the cafes
def get_distance(cafe_location):
    return great_circle(user_location, cafe_location).miles

bus_df['distance'] = bus_df.apply(lambda row: get_distance((row['latitude'], row['longitude'])), axis=1)
weighted_distance = 1 - (bus_df['distance']/bus_df['distance'].max())

NameError: name 'great_circle' is not defined

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
555,bdfZdB2MTXlT6-RBjSIpQg,Pho Bistro,903 Embarcadero Del Norte,Isla Vista,CA,93117,34.412934,-119.855531,3.0,184,1,"{'RestaurantsDelivery': 'True', 'BikeParking':...","Food, Restaurants, Chinese, Bubble Tea, Vietna...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
1447,i9eMSNpOA8BfHGZrM-9sZg,Kung Fu Tea,1469 Brace Rd,Cherry Hill,NJ,08034,39.889260,-75.015852,4.5,10,1,"{'WiFi': 'u'free'', 'RestaurantsTakeOut': 'True'}","Bubble Tea, Juice Bars & Smoothies, Food, Coff...","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'..."
1874,-3-6BB10tIWNKGEF0Es2BA,The 81 Hong Kong Cafe,"625 E Wetmore Rd, Ste 109",Tucson,AZ,85705,32.288540,-110.963144,4.0,133,1,"{'Alcohol': 'u'none'', 'DogsAllowed': 'False',...","Coffee & Tea, Asian Fusion, Food, Bubble Tea, ...","{'Monday': '0:0-0:0', 'Wednesday': '10:0-21:0'..."
2926,Zm3X8i9GsYaxdWHlI-WQwg,Eleven Cafē,"33 N Stone Ave, Ste 150",Tucson,AZ,85701,32.222606,-110.972193,5.0,21,1,"{'BusinessParking': '{'garage': True, 'street'...","Coffee & Tea, Restaurants, Food, Cafes, Bubble...","{'Monday': '7:0-14:0', 'Tuesday': '7:0-14:0', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147334,OOv7OvZlExF2Z3Q569RtEw,Poke Island,7093 66th St N,Pinellas Park,FL,33781,27.836506,-82.728235,5.0,140,1,"{'Ambience': '{'touristy': False, 'hipster': F...","Poke, Bubble Tea, Food, Sushi Bars, Restaurants","{'Monday': '0:0-0:0', 'Tuesday': '11:0-20:0', ..."
148099,Ub_6YsWqb90lfT6k1kRbwA,TKK Fried Chicken,4500 City Ave,Philadelphia,PA,19131,40.003342,-75.221926,4.0,12,1,"{'RestaurantsDelivery': 'False', 'HasTV': 'Tru...","Bubble Tea, Taiwanese, Chicken Shop, Chicken W...",
149093,lsmkYSSE5ycTNGLc69h_xw,Dumplings2Go,2300 E Lincoln Hwy,Langhorne,PA,19047,40.183638,-74.881048,3.5,15,1,"{'RestaurantsTableService': 'False', 'Restaura...","Chinese, Fast Food, Restaurants, Food, Bubble ...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-19:0', ..."
149269,-h5xCtWvNIcIvRlYM2SWdA,Gong Cha Whyte ave,10342 81 Avenue Northwest,Edmonton,AB,T6E 4E4,53.517186,-113.497197,4.0,16,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...","Food, Bubble Tea",
