In [20]:
# import custom functions and required libraries
import pandas as pd
import numpy as np
import functions
from surprise import SVD, Dataset, accuracy, BaselineOnly, Reader, KNNWithMeans, KNNBasic, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from collections import defaultdict
from surprise.model_selection import KFold
# import pyspark as spark
# from pyspark.sql import SparkSession, Row
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.ml.recommendation import ALS
# spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()
# from pyspark.ml.feature import StringIndexer
# from pyspark.ml import Pipeline
# from pyspark.sql.functions import col

In [72]:
beers = pd.read_csv('beers-breweries-and-beer-reviews/beers.csv')
breweries = pd.read_csv('beers-breweries-and-beer-reviews/breweries.csv')
reviews = pd.read_csv('beers-breweries-and-beer-reviews/reviews.csv')

In [4]:
# preprocess reviews and beer csvs and output a DF ready to be used by collaborative filter
df_with_mins = functions.preprocess_reviews(reviews,beers)

In [5]:
df_with_mins.shape

(1948703, 23)

# Enter PySpark

This did not prove to be a better performer than the Surprise SVD. Feel free to uncomment the PySpark imports in the first cell if you'd like to run through these cells.

In [None]:
df_spark = df_with_mins[['username', 'id', 'score']]

In [None]:
# Import only the type classes the schema needs instead of `import *`, so it is clear
# where each name comes from and nothing else in the namespace gets shadowed.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the (username, id, score) ratings frame.
# The third StructField argument (True) marks every field as nullable.
mySchema = StructType([
    StructField('username', StringType(), True),
    StructField('id', IntegerType(), True),
    StructField('score', FloatType(), True),
])

In [None]:
df_spark = spark.createDataFrame(df_spark, schema=mySchema)

In [None]:
df_spark.show(10)

In [None]:
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index")\
          for column in list(set(df_spark.columns)-set(['id','score']))]

In [None]:
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_spark).transform(df_spark)

In [None]:
transformed.show(1)

In [None]:
(training,test) = transformed.randomSplit([0.8, 0.2])

In [None]:
als=ALS(maxIter=5,regParam=0.09,rank=25,\
        userCol="username_index",itemCol="id",\
        ratingCol="score",coldStartStrategy="drop",
        nonnegative=True)

model=als.fit(training)

In [None]:
evaluator=RegressionEvaluator(metricName="rmse",
                              labelCol="score",
                              predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

In [None]:
# Keep the recommendations DataFrame itself: `.show()` only prints rows and returns None,
# so chaining it into the assignment would leave `user_recs` bound to None.
user_recs = model.recommendForAllUsers(20)
user_recs.show(10)

In [None]:
df_spark

In [None]:
transformed.filter(transformed.username_index == 1580).collect()

In [None]:
df_with_mins.loc[(df_with_mins.id == 78820) &
                 (df_with_mins.username == 'hoppytobehere')]

# SURPRISE!

The Surprise SVD proved to be great to work with; however, the resulting recommender did not adequately recommend a variety of beers/breweries.

In [16]:
# Set the rating scale to 1-5, and create the user/item matrix
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_with_mins[['username', 'id', 'score']], reader)

In [17]:
# cross validate data with a normal predictor to get a baseline
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([0.82007073, 0.82050574]),
 'test_mae': array([0.64395869, 0.64446125]),
 'fit_time': (1.650062084197998, 1.865022897720337),
 'test_time': (14.152489185333252, 12.945470809936523)}

In [18]:
# test set is made of 20% of the ratings.
trainset, testset = train_test_split(data, test_size=.2)

# Using Surprise's SVD model
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.4189


0.41885171629375584

In [None]:
# the below grid search took approx. 35 minutes on my local machine

In [None]:
# Grid Search with SVD
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.01, 0.02, 0.05]}
# n_jobs = -1 means all CPUs are used
gs_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
gs_svd.fit(data)

In [None]:
gs_svd.best_score

In [None]:
gs_svd.best_params

In [23]:
# Perform KFold Cross Validation with best params from grid search, then output 
# evaluation metrics
kf = KFold(n_splits=5)
algo = SVD(n_factors= 20, reg_all= 0.02)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = functions.precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

0.8497791118294428
0.6053896940054159
0.8446046612265048
0.6087237132677664
0.8491118222716928
0.6063283369361788
0.8487067631953434
0.6082032354045026
0.8480427708042743
0.6056180260920596


In [None]:
## initial precision is ~84% at k=5 with a rating threshold of 4
## initial recall is 60.3%

In [None]:
# Precision/recall at k=3 with a rating threshold of 4, printed once per fold.
# `precision_recall_at_k` lives in the imported `functions` module, so it has to be
# called through that module (the bare name is never imported into the notebook).
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = functions.precision_recall_at_k(predictions, k=3, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

In [None]:
# Precision/recall at k=1 with a rating threshold of 4.2, printed once per fold.
# `precision_recall_at_k` lives in the imported `functions` module, so it has to be
# called through that module (the bare name is never imported into the notebook).
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = functions.precision_recall_at_k(predictions, k=1, threshold=4.2)

    # Precision and recall can then be averaged over all users
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

In [None]:
# Precision/recall at k=2 with a rating threshold of 4.2, printed once per fold.
# `precision_recall_at_k` lives in the imported `functions` module, so it has to be
# called through that module (the bare name is never imported into the notebook).
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = functions.precision_recall_at_k(predictions, k=2, threshold=4.2)

    # Precision and recall can then be averaged over all users
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

In [None]:
# How do precision and recall hold up when predicting many beers (k=200, threshold 4.2)?
# `precision_recall_at_k` lives in the imported `functions` module, so it has to be
# called through that module (the bare name is never imported into the notebook).
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = functions.precision_recall_at_k(predictions, k=200, threshold=4.2)

    # Precision and recall can then be averaged over all users
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

In [25]:
# How is A/P when predicting many beers?
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = functions.precision_recall_at_k(predictions, k=400, threshold=4.2)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

0.8411080671117886
0.569058884423432
0.8407760575004992
0.5750244368260659
0.8399386974562533
0.5742876049736397
0.8424135710252793
0.5751437131537722
0.8444531962150195
0.574099796311424


Looking at all users instead of the 2STD subgroup. I did not expect this to improve scores, but thought I should check.

In [26]:
# the below did not have a significant effect on recall or precision

# reader = Reader(rating_scale=(1, 5))
# data2 = Dataset.load_from_df(df_all_users[['username', 'id', 'score']], reader)
# trainset, testset = train_test_split(data2, test_size=.2)

# # Using Surprise's SVD model
# algo = SVD()

# # Train the algorithm on the trainset, and predict ratings for the testset
# algo.fit(trainset)
# predictions = algo.test(testset)

# # Then compute RMSE
# accuracy.rmse(predictions)

### KNN

In [None]:
# this hurts the computer
knn_means = KNNWithMeans(k=10, sim_options={'name':'pearson', 'user_based':True})
cv_knn_means = cross_validate(knn_means, data, n_jobs=-1)

## There are other models too: https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

## Let's make some functions to spit out location based results

In [27]:
svd_df = df_with_mins[['username', 'id', 'score']].copy()

In [29]:
# instantiate SVD algorithm, train/test split and fit train data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_with_mins[['username', 'id', 'score']], reader)
trainset, testset = train_test_split(data, test_size=.2)
algo = SVD(n_factors= 20, reg_all= 0.02)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x159ab7400>

In [30]:
# testing a prediction for an existing user
uid = str('GratefulBeerGuy')
iid = 265678

In [31]:
pred = algo.predict(uid, iid, verbose = True)

user: GratefulBeerGuy item: 265678     r_ui = None   est = 4.05   {'was_impossible': False}


In [38]:
svd_df.head()

Unnamed: 0,username,id,score
0,GratefulBeerGuy,125646,4.58
1,GratefulBeerGuy,47678,3.69
2,GratefulBeerGuy,71930,4.37
3,GratefulBeerGuy,326798,3.99
4,GratefulBeerGuy,48824,4.51


In [45]:
# how to input new ratings
# NOTE(review): beer 9353 is rated twice below (5 and 2) - confirm which score is intended.
user_rating = [{'username': 'tester-mctestyface', 'id': 55245, 'score': 5},
               {'username': 'tester-mctestyface', 'id': 237806, 'score': 3},
               {'username': 'tester-mctestyface', 'id': 1062, 'score': 4},
               {'username': 'tester-mctestyface', 'id': 9353, 'score': 5},
               {'username': 'tester-mctestyface', 'id': 9353, 'score': 2},
               {'username': 'tester-mctestyface', 'id': 1286, 'score': 1.5},
              ]

## add the new ratings to the original ratings DataFrame
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat produces
# the same frame (columns aligned by name, fresh 0..n-1 index) on old and new versions.
new_ratings_df = pd.concat([svd_df, pd.DataFrame(user_rating)], ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df, reader)

In [46]:
predictions = algo.test(testset)

In [48]:
predictions[-1]

Prediction(uid='comfortablynumb1', iid=1901, r_ui=3.58, est=3.8079624162638273, details={'was_impossible': False})

In [49]:
# this makes a dict for all users and ranks their predictions. I believe it is only on
# test data

user_est_true = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    user_est_true[uid].append((est, iid, true_r))


for uid, user_ratings in user_est_true.items():

    # Sort user ratings by estimated value
    user_ratings.sort(key=lambda x: x[0], reverse=True)

In [55]:
# this predicts ratings for all beers for a single user, these are random users to test
# predictions
egads = functions.user_recs(algo, df_with_mins, 'EgadBananas')
macca = functions.user_recs(algo, df_with_mins, 'Macca')
gbg = functions.user_recs(algo, df_with_mins, 'GratefulBeerGuy')
orioles = functions.user_recs(algo, df_with_mins, 'oriolesfan4')
w = functions.user_recs(algo, df_with_mins, 'CJDUBYA')

In [79]:
beers_lookup = beers[['id','brewery_id', 'name']]
beers_lookup = beers_lookup.rename(columns={'id':'beer_id'})
breweries = breweries.rename(columns={'id':'brewery_id', 'name':'brewery_name'})
breweries_lookup = breweries[['brewery_id', 'city', 'state', 'country', 'brewery_name']]
beer_breweries_lookup = pd.merge(beers_lookup, breweries_lookup, on='brewery_id')

In [74]:
beers_dict = beers_lookup.set_index('beer_id').to_dict()

In [80]:
# dict with beer_id as key, and the value is a dict with brewery_id, city, state, country 
# as the keys to that dictionary
beer_breweries_lookup = beer_breweries_lookup.set_index('beer_id').to_dict(orient='index')

In [81]:
functions.output_brewery(functions.location_filter(gbg, beer_breweries_lookup,\
                                                   'WA','Seattle', 4))

('Fremont Brewing Company', 149554, 'Coffee Cinnamon B-Bomb')
('Holy Mountain Brewing Company', 216398, 'Midnight Still')
('Elysian Brewing Company', 192252, 'The Fix')
("Reuben's Brews", 113560, "Blimey That's Bitter!")


In [82]:
functions.output_brewery(functions.location_filter(macca, beer_breweries_lookup,\
                                                   'WA','Seattle', 4))

('Fremont Brewing Company', 116702, 'The Rusty Nail')
('Holy Mountain Brewing Company', 216398, 'Midnight Still')
('Elysian Brewing Company', 192252, 'The Fix')
("Reuben's Brews", 113560, "Blimey That's Bitter!")


In [83]:
functions.output_brewery(functions.location_filter(egads, beer_breweries_lookup,\
                                                 'WA','Seattle', 4))

('Fremont Brewing Company', 149554, 'Coffee Cinnamon B-Bomb')
('Holy Mountain Brewing Company', 216398, 'Midnight Still')
('Elysian Brewing Company', 33394, 'The Great Pumpkin')
("Reuben's Brews", 113560, "Blimey That's Bitter!")


In [84]:
functions.output_brewery(functions.location_filter(orioles, beer_breweries_lookup,\
                                                   'WA','Seattle', 4))

('Fremont Brewing Company', 116702, 'The Rusty Nail')
('Holy Mountain Brewing Company', 216398, 'Midnight Still')
('Elysian Brewing Company', 192252, 'The Fix')
("Reuben's Brews", 113560, "Blimey That's Bitter!")


In [None]:
# this searches through ranked_beers to see how far into the list the beer appears
# this was done to test if the return of the state/city search function was returning
# a ranked list or not
# NOTE(review): `ranked_beers` is not assigned in any cell visible in this notebook -
# it has to be defined before this cell can run on a fresh kernel.
for position, ranked_beer in enumerate(ranked_beers):
    # the first element of each entry is compared against the beer id being searched for
    if ranked_beer[0] == 155828:
        print(ranked_beer)
        print(position)

I need to see how many beers are in a given location

In [None]:
brew_city = breweries[['brewery_id', 'city']]
brew_city = brew_city.set_index('brewery_id')

In [None]:
beer_explore = pd.merge(beers, brew_city, on='brewery_id')

In [None]:
LA_beer = beer_explore.loc[beer_explore.city == 'Los Angeles']

In [None]:
LA_beer = LA_beer.drop(['style', 'availability', 'abv', 'notes'], axis =1)

In [None]:
LA_reviews = pd.merge(LA_beer, df_with_mins, on = 'id')

In [None]:
LA_reviews.id.value_counts()

# CONTENT BASED -- NLP

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
df_with_mins.columns

In [None]:
# copies the reviews df, and then joins all text reviews for each individual beer together
df_joined = df_with_mins.copy()
# Join with a single space: with an empty separator the last word of one review and the
# first word of the next would fuse into one bogus token for the vectorizer.
df_joined['joined_text'] = df_joined.groupby('id')['text'].\
                               transform(lambda reviews: ' '.join(reviews))

In [None]:
# removes duplicate beers, and subsets to just beer id, joined_text and rating info
# to be cleaned and then joined to beers df
df_joined_sub = df_joined[['id', 'joined_text', 'avg_score', 'no_of_ratings']].drop_duplicates(\
                                                                        subset='id')

In [None]:
# removes non-breaking spaces (\xa0) from the joined review text
# Uses the pandas string accessor with a literal (non-regex) match, so the cell does not
# depend on the `re` module, which no visible cell of this notebook imports.
df_joined_sub['joined_text'] = df_joined_sub['joined_text'].str.replace('\xa0', '', regex=False)

In [None]:
# looks like it worked!
df_joined_sub.joined_text[0][:400]

In [None]:
count_vect = CountVectorizer(stop_words='english')
counts = count_vect.fit_transform(df_joined_sub.joined_text)

In [None]:
count_vect.vocabulary_

In [None]:
cos_sim = cosine_similarity(counts, counts)

In [None]:
# Lookup table used by the recommendation helpers: position i holds the index label of
# df_joined_sub's i-th row.
# NOTE(review): these are df_joined_sub's row labels, not values from its 'id' column -
# confirm that the "beer_id" passed to the helpers is meant to be a row label.
indices = pd.Series(df_joined_sub.index)

In [None]:
indices[indices == 4].index[0]

In [None]:
def recommendations(beer_id, cos_sim=None, n=10):
    """
    Return the names of the beers most similar to the given beer.

    beer_id: value looked up in the module-level ``indices`` Series; the position of
             its first match selects the row of the similarity matrix
    cos_sim: square similarity matrix indexable as ``cos_sim[row]``; when omitted the
             module-level ``cos_sim`` is used, resolved at call time
    n: maximum number of beers to return (default 10)

    Returns a list of up to ``n`` names, taken by position from ``beers_text.name``,
    ordered from most to least similar. The queried beer itself is never included.
    Raises IndexError when ``beer_id`` does not occur in ``indices``.
    """
    if cos_sim is None:
        # Resolved here rather than in the signature so that defining this function
        # does not require the similarity matrix to exist yet.
        try:
            cos_sim = globals()['cos_sim']
        except KeyError:
            raise NameError("name 'cos_sim' is not defined") from None

    # position of the beer inside the similarity matrix
    matches = indices[indices == beer_id]
    if matches.empty:
        raise IndexError("beer_id {!r} not found in indices".format(beer_id))
    idx = matches.index[0]

    # Drop the beer's own position explicitly instead of assuming it sorts first
    # (ties or a zero self-similarity would otherwise let it leak into the results).
    scores = pd.Series(cos_sim[idx]).drop(idx)
    top_indexes = scores.nlargest(n).index

    # build the positional name list once, not once per recommended beer
    beer_names = list(beers_text.name)
    return [beer_names[i] for i in top_indexes]

def tfidf_recs(beer_id, cos_sim=None, n=20):
    """
    Return the queried beer's name together with the names of its most similar beers.

    beer_id: value looked up in the module-level ``indices`` Series; the position of
             its first match selects the row of the similarity matrix
    cos_sim: square similarity matrix indexable as ``cos_sim[row]``; when omitted the
             module-level ``tfidf_cos`` is used, resolved at call time
    n: maximum number of similar beers to return (default 20)

    Returns a tuple ``(name, recommended)``: ``name`` is the entry of
    ``beers_text.name`` at the beer's matrix position and ``recommended`` is a list of
    up to ``n`` names ordered from most to least similar, excluding the beer itself.
    Raises IndexError when ``beer_id`` does not occur in ``indices``.
    """
    if cos_sim is None:
        # Resolved here rather than in the signature so that defining this function
        # does not require `tfidf_cos` to exist yet.
        cos_sim = tfidf_cos

    # position of the beer inside the similarity matrix
    matches = indices[indices == beer_id]
    if matches.empty:
        raise IndexError("beer_id {!r} not found in indices".format(beer_id))
    idx = matches.index[0]

    # Drop the beer's own position explicitly instead of assuming it sorts first.
    scores = pd.Series(cos_sim[idx]).drop(idx)
    top_indexes = scores.nlargest(n).index

    # build the positional name list once, not once per recommended beer
    beer_names = list(beers_text.name)
    recommended_beers = [beer_names[i] for i in top_indexes]

    # The recommended names are positional, so the queried beer's own name has to be
    # read by position as well (not by using beer_id as an index label).
    return beers_text.name.iloc[idx], recommended_beers

In [None]:
recommendations(2)

In [None]:
# Attach the columns of `beers` to each joined-text row (merged on 'id').
# The recommendation helpers read `beers_text.name`, so this assignment has to be
# executed before they are called.
beers_text = pd.merge(df_joined_sub, beers, on='id')

In [1]:
## Below is code for 2.0 recommender from Wednesday that might not have saved in JN

# how to input new ratings
# NOTE(review): beer 237806 appears twice in this list (both times scored 1) - confirm
# the duplicate is intended.
user_rating = [{'username': 'tester-mctestyface', 'id': 55245, 'score': 1},
               {'username': 'tester-mctestyface', 'id': 237806, 'score': 1},
               {'username': 'tester-mctestyface', 'id': 1062, 'score': 1},
               {'username': 'tester-mctestyface', 'id': 116702, 'score': 1},
               {'username': 'tester-mctestyface', 'id': 140119, 'score': 1},
               {'username': 'tester-mctestyface', 'id': 143753, 'score': 1.5},
               {'username': 'tester-mctestyface', 'id': 265678, 'score': 1.5},
               {'username': 'tester-mctestyface', 'id': 237806, 'score': 1},
              ]

## add the new ratings to the original ratings DataFrame
reader = Reader(rating_scale=(1, 5))
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat produces
# the same frame (columns aligned by name, fresh 0..n-1 index) on old and new versions.
new_ratings_df = pd.concat([svd_df, pd.DataFrame(user_rating)], ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df, reader)
# retrain on every rating, including the new user's, so the model can predict for them
trainset = new_data.build_full_trainset()
model = SVD(n_factors=20, reg_all=0.02)
model.fit(trainset)

# make a set of beers the user did not rate
# NOTE(review): the helper functions called below are defined further down in this same
# cell, and `lookup_dict` is not assigned in any visible cell. On a fresh kernel these
# calls raise NameError unless the definitions have been executed beforehand.
svd_pred_set = get_user_pred_set(user_rating, new_ratings_df)
located_beers = svd_location_filter(svd_pred_set, lookup_dict, 'WA', 'Seattle', 500)

# predict a score for each located beer, then keep the best beer of the top 2 breweries
located_preds = pred_for_user_location(located_beers, 'tester-mctestyface', model)
return_top_breweries(located_preds, located_beers, 2)

def get_user_pred_set(user_rating_list, rating_df):
    """returns a set of beer id's (as strings) to be predicted. excludes beers the user rated
    user_rating_list: is a list of dictionaries produced when the user provides
                      initial ratings; only each dictionary's 'id' entry is read
    rating_df: is a df of all ratings subseted to the columns needed for SVD; only
               its 'id' column is read"""

    # A set gives constant-time membership, and de-duplicating the id column first
    # means each distinct beer is examined once rather than once per rating row.
    rated_ids = {rating['id'] for rating in user_rating_list}
    unrated_ids = set(rating_df['id']) - rated_ids
    return {str(beer_id) for beer_id in unrated_ids}

def svd_location_filter(user_pred_list, lookup_dict, state, city, n):
    """
    takes in the beer ids from get_user_pred_set and filters them down to only the
    location provided, returns a dictionary with the beer id as key, and a
    (beer_name, brewery_name) tuple as value

    user_pred_list: iterable of beer ids; each id must be a key of lookup_dict
    lookup_dict: maps a beer id to a dict that has 'state', 'city', 'name' and
                 'brewery_name' entries
    state, city: location a beer's entry must match exactly to be kept
    n: maximum number of beers to return; iteration stops once n have been found

    Every matching beer is kept, including several from the same brewery. The dict is
    keyed by beer id, so it cannot be used to detect a brewery that was already added.
    Raises KeyError when a beer id is missing from lookup_dict.
    """
    located_beer = {}
    if n <= 0:
        return located_beer

    for beer in user_pred_list:
        beer_info = lookup_dict[beer]
        if beer_info['state'] == state and beer_info['city'] == city:
            located_beer[beer] = (beer_info['name'], beer_info['brewery_name'])
            # stop right away instead of walking the rest of the candidates for nothing
            if len(located_beer) >= n:
                break
    return located_beer

def sort_score(val):
    """Sort key for predictions: picks out the element at index 1 of ``val``,
    which is where the estimated score is stored."""
    estimated_score = val[1]
    return estimated_score

def pred_for_user_location(to_predict_list, username, model):
    """Predict a score for every beer id in ``to_predict_list`` for one user.

    to_predict_list: iterable of beer ids (already filtered by location); each id is
                     converted with int() before it is handed to the model
    username: user id passed to ``model.predict``
    model: trained model exposing ``predict(uid, iid, verbose=...)``

    Returns a list of tuples built from elements 1 and 3 of each prediction, sorted
    with ``sort_score`` (i.e. by the second tuple element) from highest to lowest."""
    raw_predictions = (
        model.predict(username, int(beer_id), verbose = False)
        for beer_id in to_predict_list
    )
    scored = [(result[1], result[3]) for result in raw_predictions]
    return sorted(scored, key = sort_score, reverse = True)

def return_top_breweries(top_beers, svd_loc_filter_output, n):
    """Pick the first n distinct breweries from a ranked list of beers.

    top_beers: iterable of sequences whose first element is a beer id, in ranked
               order (best first)
    svd_loc_filter_output: dict keyed by str(beer id) whose values hold the beer name
                           at index 0 and the brewery name at index 1
    n: maximum number of breweries to return

    Returns a dict mapping brewery_name -> (brewery_name, beer_name), where beer_name
    is the first beer of that brewery encountered in ``top_beers``.
    Raises KeyError when an examined beer id is missing from svd_loc_filter_output."""
    top_breweries = {}
    if n <= 0:
        return top_breweries

    for beer in top_beers:
        located = svd_loc_filter_output[str(beer[0])]
        beer_name, brewery_name = located[0], located[1]
        # a brewery is represented only by its highest-ranked beer
        if brewery_name in top_breweries:
            continue
        top_breweries[brewery_name] = (brewery_name, beer_name)
        # stop as soon as n breweries are collected rather than scanning the rest
        if len(top_breweries) >= n:
            break
    return top_breweries