# Cafe Recommender

### Imports

In [12]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.neighbors import NearestNeighbors
import numpy as np
from textblob import TextBlob




### Data Cleaning

We start by reading in the business data from the Yelp academic dataset, filtering for open coffee and tea businesses, and dropping irrelevant columns. We then read in the reviews dataset in chunks, merging each chunk with the business data.

In [2]:
# Read in business data (the file holds one JSON object per line)
business_json_path = '/Users/amelialei/yelp_dataset/yelp_academic_dataset_business.json'
bus_df = pd.read_json(business_json_path, lines = True)


# Filter dataset for coffee and tea establishments.
# Yelp's category label is spelled "Coffee & Tea", so match that spelling in
# addition to the original "Coffee and Tea"; otherwise only bubble tea shops
# would pass the filter. Rows without categories are excluded via na=False.
bus_df = bus_df[bus_df['categories'].str.contains('Bubble Tea|Coffee and Tea|Coffee & Tea', case = False, na = False)]


# Include only businesses that are open
bus_df = bus_df[bus_df['is_open'] == 1]


# Obtain wifi information for each business and record as 'unknown' if there is no known wifi data
def get_wifi_info(attributes):
    """Return a business's WiFi setting as a plain label.

    Parameters
    ----------
    attributes : dict or any
        The business's ``attributes`` value. Anything that is not a dict
        (e.g. ``None`` or NaN for businesses without attributes) is treated
        as having no WiFi information.

    Returns
    -------
    str
        The ``'WiFi'`` entry with the ``u'`` prefix, quote characters and
        surrounding whitespace removed (e.g. ``"u'free'"`` -> ``'free'``),
        or ``'unknown'`` when there is no usable WiFi value.
    """
    if not isinstance(attributes, dict):
        return 'unknown'

    info = attributes.get('WiFi')
    # A missing key or a non-string value (e.g. None) carries no WiFi data;
    # calling .replace on it would raise AttributeError.
    if not isinstance(info, str):
        return 'unknown'

    info = info.replace("u'", "").replace("'", "").strip()
    # An empty value or the literal string 'None' also means "no known data",
    # so fold them into 'unknown' instead of creating extra categories.
    if info in ('', 'None'):
        return 'unknown'
    return info
    
# Derive a 'wifi' label for every business from its attributes
bus_df['wifi'] = bus_df['attributes'].apply(get_wifi_info)


# Remove columns that are no longer needed once the filters and the wifi
# label have been applied
drop_cols = ['is_open', 'review_count', 'attributes', 'categories']
bus_df = bus_df.drop(columns=drop_cols)



In [8]:
## Read in user reviews data
review_json_path = '/Users/amelialei/yelp_dataset/yelp_academic_dataset_review.json'

# Number of review rows parsed per chunk; the file is too large to read all at once.
size = 1000000
review_dtypes = {'review_id': str, 'user_id': str, 'business_id': str, 'stars': int, 'date': str, 'text': str, 'useful': int, 'funny': int, 'cool': int}


# Stream the reviews chunk by chunk and merge each chunk with the businesses
# dataframe. The inner join keeps only reviews of the businesses in bus_df.
# Opening the reader in a `with` block closes the underlying file when the
# loop finishes (or fails), instead of leaving the handle open.
chunk_list = []
with pd.read_json(review_json_path, lines=True, dtype=review_dtypes, chunksize=size) as review:
    for review_chunk in review:
        review_chunk = review_chunk.drop(columns=['review_id', 'useful', 'funny', 'cool'])
        # Rename so the review rating does not collide with the business 'stars' column
        review_chunk = review_chunk.rename(columns={'stars': 'review_stars'})
        chunk_merged = pd.merge(bus_df, review_chunk, on='business_id', how='inner')
        chunk_list.append(chunk_merged)
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

### Collaborative Filtering
A collaborative filtering recommendation system uses data on other users with similar preferences to recommend new cafes.

In [None]:
# Normalize numerical ratings to the [0, 1] range (min-max scaling)
df['normalized_stars'] = (df['stars'] - df['stars'].min())/(df['stars'].max()-df['stars'].min())
df['normalized_review_stars'] = (df['review_stars'] - df['review_stars'].min())/(df['review_stars'].max()-df['review_stars'].min())

# One hot encode categorical columns
df = pd.get_dummies(df, columns=['wifi'], drop_first=True)


# Create a user-item matrix consisting of users and their ratings for each business
user_item_matrix = df.pivot_table(index='user_id', columns='business_id', values='review_stars')

# The user to generate recommendations for
user_id = '79nOboO-4_yNuQQ21EAU1A'

# Record which cafes this user actually reviewed BEFORE imputing missing
# ratings, so those cafes can be left out of the recommendations later
already_rated = user_item_matrix.loc[user_id].notna()

# Fill in missing ratings with the average rating for each cafe
# (fillna with a Series fills each column with the value at its label)
user_item_matrix = user_item_matrix.fillna(user_item_matrix.mean(axis=0))

# Use the nearest neighbors algorithm along with cosine similarity to measure similarity between users' preferences
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_item_matrix.values)

# Get the 6 most similar users to the given user (the user plus 5 others),
# capped at the number of users so small datasets do not raise an error
n_neighbors = min(6, len(user_item_matrix))
distances, indices = knn_model.kneighbors(user_item_matrix.loc[[user_id]].values, n_neighbors=n_neighbors)

# Exclude the given user to find the top 5 most similar users. Filter by id
# rather than dropping the first hit: a user with an identical rating vector
# ties at distance 0 and may be returned ahead of the given user.
neighbor_positions = indices.flatten()
is_other_user = user_item_matrix.index[neighbor_positions] != user_id
similar_users = neighbor_positions[is_other_user][:5]
similar_users_ids = user_item_matrix.index[similar_users]

# Average the similar users' cafe ratings, keep only cafes the given user has
# not reviewed yet, and sort in descending order to find the top cafes to recommend
cafe_recs = (user_item_matrix.loc[similar_users_ids]
             .mean(axis=0)
             .loc[~already_rated]
             .sort_values(ascending=False)
             .head())
print(cafe_recs)



In [13]:
# Normalize the review text in three passes: lowercase everything, blank out
# a handful of common stop words, then delete every character that is not an
# ASCII letter or whitespace
lowercased_text = df['text'].str.lower()
without_stop_words = lowercased_text.replace(r'\band\b|\bor\b|\bthe\b|\bis\b|\bto\b', '', regex=True)
df['text'] = without_stop_words.replace(r'[^A-Za-z\s]', '', regex=True)

# Shared stemmer instance, created once and reused for every review
ps = PorterStemmer()


def tokenize_and_stem(review):
    """Split a review into tokens, stem each token, and rejoin them.

    Parameters
    ----------
    review : str
        The review text to process.

    Returns
    -------
    str
        The Porter-stemmed tokens joined by single spaces.
    """
    return ' '.join(ps.stem(word) for word in word_tokenize(review))

# Score sentiment on the cleaned but not yet stemmed text. TextBlob's default
# analyzer looks words up in a lexicon, and Porter stems (such as "amaz" for
# "amazing") are generally not in it, so scoring after stemming would discard
# most of the polarity signal.
df['sentiment'] = df['text'].apply(lambda review: TextBlob(review).sentiment.polarity)

# Replace each review with its tokenized, stemmed form
cleaned_reviews = df['text'].apply(tokenize_and_stem)
df['text'] = cleaned_reviews