In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import clean
import process
import train_test_split
from collections import defaultdict
from numpy.random import choice

## Read in and clean data

In [2]:
reviews = pd.read_parquet('raw-data.pq')
reviews.review_time = pd.to_datetime(reviews.review_time,unit = 's')

In [3]:
cleaned_df = clean.merge_similar_name_breweries(reviews)
cleaned_df = clean.merge_brewery_ids(cleaned_df)
cleaned_df = clean.remove_dup_beer_rows(cleaned_df)
cleaned_df = clean.remove_null_rows(cleaned_df)
cleaned_df = clean.remove_duplicate_reviews(cleaned_df)

In [4]:
def get_beers(df):
    """ Input is a data frame of reviews and output is a data frame of unique beers
    """
    return df[['beer_id','brewery_id','beer_style']].drop_duplicates()

def assign_popularity(df):
    """ Calculates and adds a popularity column to the data frame of reviews
    """
    if 'popularity' in list(df):
        # if we've already added the popularity column, then do nothing
        return df
    mean_rating = df.groupby('beer_id')['review_overall'].mean()
    # normalization puts value counts and ratings on the same scale
    max_revs = df.value_counts('beer_id').max()
    normalized_counts = df.value_counts('beer_id')/max_revs*5
    popularity = mean_rating/2 + normalized_counts/2
    popularity.name = 'popularity'
    
    return df.join(popularity, on='beer_id')

def best_beers_by_style(df):
    """ returns a dictionary whose keys are beer styles and whose values are an array of beers of 
    of that beer style, sorted by popularity in ascending order.
    """
    d = dict()
    grouped = df[['beer_style','beer_id','popularity']].drop_duplicates().groupby('beer_style')
    for beer_style in grouped.groups:
        df = grouped.get_group(beer_style).sort_values('popularity')
        d[beer_style] = df.beer_id.values
    return d

def best_beers_by_brewery(df):
    """ returns a dictionary whose keys are brewery ids and whose values are an array of beers of 
    of that brewery, sorted by popularity in ascending order.
    """
    d = dict()
    grouped = df[['brewery_id','beer_id','popularity']].drop_duplicates().groupby('brewery_id')
    for brewery_id in grouped.groups:
        df = grouped.get_group(brewery_id).sort_values('popularity')
        d[brewery_id] = df.beer_id.values
    return d

def split_to_dict(split):
    """ Takes in a 2d array with a column of users and a column of beers they like and
    converts it to a dictionary where the keys are users and the entries are the lists of beers
    they like.
    """
    split = list(split)
    d = defaultdict(list)
    for review in split:
        d[review[0]].append(review[1])
    return d

In [5]:
cleaned_df['reviewer_id'] = cleaned_df["review_profilename"].astype('category').cat.codes
cleaned_df['beer_id'] = cleaned_df["beer_beerid"].astype("category").cat.codes

cleaned_df = cleaned_df.drop(['review_aroma','review_appearance','review_time','review_palate',
                        'review_taste','beer_abv'], axis=1)
cleaned_df = assign_popularity(cleaned_df)
cleaned_df.popularity.max()
cleaned_df.head()

Unnamed: 0,brewery_id,brewery_name,review_overall,review_profilename,beer_style,beer_name,beer_beerid,reviewer_id,beer_id,popularity
0,10325,Vecchio Birraio,1.5,stcules,Hefeweizen,Sausa Weizen,47986,30546,39207,0.75078
1,10325,Vecchio Birraio,3.0,stcules,English Strong Ale,Red Moon,48213,30546,39409,1.50078
2,10325,Vecchio Birraio,3.0,stcules,Foreign / Export Stout,Black Horse Black Beer,48215,30546,39411,1.50078
3,10325,Vecchio Birraio,3.0,stcules,German Pilsener,Sausa Pils,47969,30546,39192,1.50078
4,1075,Caldera Brewing Company,4.0,johnmichaelsen,American Double / Imperial IPA,Cauldron DIPA,64883,22994,53857,2.00078


In [6]:
int_matrix_trans = process.InteractionMatrixTransformer(cleaned_df)
int_matrix = int_matrix_trans.to_zero_one()

In [7]:
splits = train_test_split.get_splits(int_matrix)

## Baseline model: Favorite beer styles

In [8]:
np.random.seed(1111)

In [9]:
k = 10
beers = get_beers(cleaned_df)
pop_beers_by_style = best_beers_by_style(cleaned_df)
avg_recall_style = np.zeros(5)

for i in range(5):
    train, test = splits[i][0], splits[i][1]
    train = split_to_dict(train)
    test = split_to_dict(test)

    recalls = np.array([])
    weights = np.array([])
    #for each user in the test set, want to know how many of each beer type/brewery they reviewed
    # then want to sample from them accordingly
    counter = 0
    for user in test:
        recs = []
        d = defaultdict(int)
        #train[user] is the list of beer_ids they reviewed
        # for each beer_id in user, locate it in beers and
        # how to sample from 
        for beer_id in train[user]:
            d[beers.loc[beers.beer_id == beer_id, 'beer_style'].values[0]] += 1
        draw = list(choice(list(d.keys()),10, list(d.values())))
        draw = {a:draw.count(a) for a in draw}
        for beer_type in draw:
            beer_list = list(pop_beers_by_style[beer_type].copy())
            i = 0
            while i < draw[beer_type]:
                beer_id = beer_list.pop()
                if beer_id in train[user]:
                    continue # if we already know the user likes this beer, then skip it and draw another
                recs.append(beer_id)
                i+=1
        recs_set = set(recs)
        test_set = set(test[user])
        num_hits = len(recs_set.intersection(test_set))
        # append the recall at k value and the weight (i.e. how many liked beers in the test set)
        recalls = np.append(recalls,num_hits/min(k,len(test[user])))
        weights = np.append(weights, len(test[user]))
    avg_recall_style[i] = (recalls*weights/sum(weights)).sum()      

In [10]:
avg_recall_style

array([0.        , 0.06790987, 0.07066503, 0.07314833, 0.        ])

## Baseline Model: Favorite Breweries

In [11]:
k = 10
beers = get_beers(cleaned_df)
pop_beers_by_brewery = best_beers_by_brewery(cleaned_df)
avg_recall_brewery = np.zeros(5)

for i in range(5):
    train, test = splits[i][0], splits[i][1]
    train = split_to_dict(train)
    test = split_to_dict(test)

    recalls = []
    weights = []

    #for each user in the test set, want to know how many of each beer type/brewery they reviewed
    # then want to sample from them accordingly
    counter = 0
    for user in test:
        recs = []
        d = defaultdict(int)
        #train[user] is the list of beer_ids they reviewed
        # for each beer_id in user, locate it in beers and
        # how to sample from 
        for beer_id in train[user]:
            d[beers.loc[beers.beer_id == beer_id, 'brewery_id'].values[0]] += 1
        draw = list(choice(list(d.keys()),50, list(d.values())))
        pop_beers_copy = dict()
        draw_counts = {a:draw.count(a) for a in draw}
        for brewery_id in draw_counts:
            pop_beers_copy[brewery_id] = list(pop_beers_by_brewery[brewery_id]).copy()
        for brewery_id in draw:
            if len(pop_beers_copy[brewery_id]) > 0:
                beer_id = pop_beers_copy[brewery_id].pop()
                while beer_id in train[user] and len(pop_beers_copy[brewery_id])>0:
                    beer_id = pop_beers_copy[brewery_id].pop() # if we already know the user likes this beer, then skip it and draw another
                if beer_id not in train[user]:
                    recs.append(beer_id)
                    if len(recs) == 10:
                        break
        recs_set = set(recs)
        test_set = set(test[user])
        num_hits = len(recs_set.intersection(test_set))
        # append the recall at k value and the weight (i.e. how many liked beers in the test set)
        recalls.append(num_hits/min(k,len(test[user])))
        weights.append(len(test[user]))
    recalls = np.array(recalls)
    weights = np.array(weights)
    avg_recall_brewery[i] = (recalls*weights/sum(weights)).sum()

In [12]:
avg_recall_brewery

array([0.04684636, 0.04918675, 0.04835255, 0.04808402, 0.04899598])