# Recommender Systems evaluated by different metrics

In [1]:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [2]:
beers = pd.read_csv('df_clean.csv')

In [3]:
beers.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,names,Id,brewerId,ABV,style,appearance,aroma,...,profile_name,text,time2,day,month,year,user_id,beers_by_day,more_beers_than_10,too_much_beers
0,5930,7,7,Caldera Ginger Beer,52159,1075,4.7,Herbed / Spiced Beer,3.5,2.5,...,alpinebryant,""" Bottle says """"Malt beverage brewed with Ging...",2011-05-24,24,5,2011,13938,2,0,0


In [4]:
beers['beerId'] = LabelEncoder().fit_transform(beers.names)

In [5]:
beers.rename(columns={'names':'beerName'}, inplace=True)

In [6]:
df=beers.filter(items=['user_id','profile_name', 'beerId', 'beerName', 'overall'])
df.head(2)

Unnamed: 0,user_id,profile_name,beerId,beerName,overall
0,13938,alpinebryant,6549,Caldera Ginger Beer,3.0
1,13938,alpinebryant,28676,Short's Cup A Joe Coffee Creme Stout,4.0


In [7]:
df2=beers.filter(items=['beerId', 'beerName'])

#### We are going to create a dictionary with the the beers Ids as a key

In [8]:
df3=df2.groupby('beerId')['beerName'].first()

In [9]:
beer_dict=df3.to_dict()

In [10]:
beer_dict

{0: ' # 100',
 1: " #'s Ale",
 2: ' #1 Abbey Ale',
 3: ' #13',
 4: ' #14',
 5: ' #20 Jubilee (Yubilejnoe)',
 6: ' #9',
 7: ' $ellout $tout',
 8: ' &#268;ernovar &#268;ern\xc3\xa9',
 9: ' &#268;ern\xc3\xa1 Hora Moravsk\xc3\xa9 Sklepn\xc3\xad',
 10: ' &#268;ern\xc3\xa1 Hora P\xc3\xa1ter',
 11: ' &#268;ern\xc3\xa1 Hora Velen',
 12: ' &#268;ern\xc3\xa9 Z\xc3\xa1meck\xc3\xa9',
 13: ' &#960;&#964;&#953;&#963;&#940;&#957;&#951; (tisane)',
 14: " '71 Pale Ale",
 15: " 'F\xc3\xa8ileadh Air Teine' - Scottish Oatmeal Stout",
 16: " 'Pooya Porter",
 17: " 'Stock Stout",
 18: " 'T Zelfde",
 19: " 'Tis The Saison",
 20: " 'Vagabond Blonde' Blonde Ale",
 21: " 'Zula Stout",
 22: " 't Gaverhopke / Tired Hands Bitter Sweet Symphony",
 23: " 't Gaverhopke De Kriek (Red Cap)",
 24: " 't Gaverhopke Den Blond 8\xc2\xb0 (White Cap)",
 25: " 't Gaverhopke Den Bruin 8\xc2\xb0 (Blue Cap)",
 26: " 't Gaverhopke Extra",
 27: " 't Gaverhopke Zingende Blondine",
 28: " 't Goeye Goet Dubbel",
 29: " 't Hartje( For 

In [11]:
beer_dict[0]

' # 100'

In [12]:
n_users = df.profile_name.unique().shape[0]
n_beers = df.beerId.unique().shape[0]
print("There are %s users, and %s different beers" %(n_users, n_beers))

There are 31651 users, and 35800 different beers


## Split the dataset into train and test sets

In [13]:
import sklearn
from sklearn.cross_validation import train_test_split



In [14]:
df.head(1)

Unnamed: 0,user_id,profile_name,beerId,beerName,overall
0,13938,alpinebryant,6549,Caldera Ginger Beer,3.0


In [15]:
train, test = train_test_split(df, test_size=0.2)

# 1. Recommender taking into account the most popular Beers

* Most rated beers (it is assumed that this is the most consumed beer)
* Most positively rated beer (rating > 4.0)
* Highest rated beer

### Most Rated Beer

In [16]:
mostRated = train.groupby('beerId')['profile_name'].count().sort_values(ascending = False)
mostRated.head(3)

beerId
545      1899
23132    1790
28766    1743
Name: profile_name, dtype: int64

In [17]:
a=df[(df['profile_name']==' wagenvolks') & (df['beerName']== ' (512) Cascabel Cream Stout')]
a

Unnamed: 0,user_id,profile_name,beerId,beerName,overall
238773,32481,wagenvolks,49,(512) Cascabel Cream Stout,4.0


In [18]:
beer_dict[49]

' (512) Cascabel Cream Stout'

In [19]:
mostRatedBeers = np.array([np.array([i, beer_dict[i], mostRated[i]], dtype=np.object) 
                          for i in mostRated.index])
mostRatedBeers[:10,1:]

array([[' 90 Minute IPA', 1899],
       [' Old Rasputin Russian Imperial Stout', 1790],
       [' Sierra Nevada Celebration Ale', 1743],
       [' Stone Ruination IPA', 1537],
       [' Arrogant Bastard Ale', 1510],
       [' Sierra Nevada Pale Ale', 1510],
       [' Two Hearted Ale', 1509],
       [' Stone IPA (India Pale Ale)', 1444],
       [' Founders Breakfast Stout', 1427],
       [' La Fin Du Monde', 1425]], dtype=object)

### Positive Rated Beer

In [20]:
positiveRated = train[train.overall>=4.0].groupby('beerId')['profile_name'].count().sort_values(ascending = False)

In [21]:
positiveRatedBeers = np.array([np.array([i, beer_dict[i], positiveRated[i]], dtype=np.object) 
                          for i in positiveRated.index])
positiveRatedBeers[:10,1:]

array([[' 90 Minute IPA', 1577],
       [' Old Rasputin Russian Imperial Stout', 1533],
       [' Sierra Nevada Celebration Ale', 1484],
       [' Pliny The Elder', 1372],
       [' Two Hearted Ale', 1354],
       [' Sierra Nevada Pale Ale', 1340],
       [' Stone Ruination IPA', 1320],
       [' Founders Breakfast Stout', 1315],
       [' Stone IPA (India Pale Ale)', 1283],
       [" Bell's Hopslam Ale", 1268]], dtype=object)

### Highest mean rating beer

In [22]:
min_ratings = 50
listRatedBeers = train.groupby('beerId')['overall'].apply(list).reset_index()

In [23]:
filteredListRatedBeers = listRatedBeers[listRatedBeers.overall.apply(lambda x: len(x)>min_ratings)]

In [24]:
filteredListRatedBeers.head()

Unnamed: 0,beerId,overall
0,0,"[4.5, 4.0, 4.0, 4.5, 4.0, 4.0, 4.0, 4.0, 4.0, ..."
5,6,"[3.5, 4.5, 3.0, 3.5, 3.0, 4.0, 3.5, 4.5, 4.0, ..."
68,76,"[2.5, 3.5, 3.0, 2.0, 3.5, 4.0, 4.0, 4.0, 4.0, ..."
81,90,"[2.0, 4.0, 4.0, 4.5, 4.0, 3.0, 3.0, 4.5, 4.5, ..."
93,103,"[4.0, 4.0, 4.0, 4.5, 4.0, 4.5, 4.0, 3.5, 3.5, ..."


In [25]:
meanBeers = filteredListRatedBeers.overall.apply(lambda x: np.mean(np.array(x))).sort_values(ascending=False)

In [26]:
meanRateBeers = np.array([np.array([i, beer_dict[i], meanBeers[i]], dtype=np.object) 
                          for i in meanBeers.index])

meanRateBeers[:10,1:]

array([[' Gourmetbryggeriet Piney', 4.6394230769230766],
       [' Centennial Alt', 4.6284403669724767],
       [' St Arnou St Cloud', 4.621382636655949],
       [' Oktoberator', 4.6135593220338986],
       [' Old Morgantown Amber', 4.6104651162790695],
       [' OktoberFish', 4.6079465541490858],
       [' Ettaler Curator Doppelbock (Original German Version)',
        4.604982206405694],
       [' Oyster Point Oyster Stout', 4.5943396226415096],
       [' Kirin Tanrei Alpha Happoshu Beer', 4.5625],
       [' Tripel Van De Garre', 4.5508021390374331]], dtype=object)

### Function to calculate the recall metric

In [27]:
def recall_at_n(N, rated, recommended):
    """
    :param N: number of recommendations
    :param rated: list of beers rated by user
    :param recommended: list of beers recommended
    
    :return the recall
    """
    intersection = len(set(rated) & set(recommended[:N]))
    return intersection / np.min([float(N), len(rated)])

In [28]:
rated = [" # 100"]
recommended = [' Stone Imperial Russian Stout',' Stone Sublimely Self-Righteous Ale']

In [29]:
recall_at_n(1, rated, recommended)

0.0

In [30]:
recall_at_n(2, rated, recommended)

0.0

### Looking for the recall metrics.
#### Now, we are going to measure the efficiency of the popularity in the test set. Using the top-5 beers

In [31]:
mostRatedBeers[:10,1:]

array([[' 90 Minute IPA', 1899],
       [' Old Rasputin Russian Imperial Stout', 1790],
       [' Sierra Nevada Celebration Ale', 1743],
       [' Stone Ruination IPA', 1537],
       [' Arrogant Bastard Ale', 1510],
       [' Sierra Nevada Pale Ale', 1510],
       [' Two Hearted Ale', 1509],
       [' Stone IPA (India Pale Ale)', 1444],
       [' Founders Breakfast Stout', 1427],
       [' La Fin Du Monde', 1425]], dtype=object)

In [32]:
positiveRatedBeers[:10,1:]

array([[' 90 Minute IPA', 1577],
       [' Old Rasputin Russian Imperial Stout', 1533],
       [' Sierra Nevada Celebration Ale', 1484],
       [' Pliny The Elder', 1372],
       [' Two Hearted Ale', 1354],
       [' Sierra Nevada Pale Ale', 1340],
       [' Stone Ruination IPA', 1320],
       [' Founders Breakfast Stout', 1315],
       [' Stone IPA (India Pale Ale)', 1283],
       [" Bell's Hopslam Ale", 1268]], dtype=object)

In [33]:
meanRateBeers[:5,1:]

array([[' Gourmetbryggeriet Piney', 4.6394230769230766],
       [' Centennial Alt', 4.6284403669724767],
       [' St Arnou St Cloud', 4.621382636655949],
       [' Oktoberator', 4.6135593220338986],
       [' Old Morgantown Amber', 4.6104651162790695]], dtype=object)

In [34]:
testUsersGrouped = test[test.overall>=4.0].groupby('profile_name')['beerId'].apply(list)

In [35]:
topN = 10
np.mean(testUsersGrouped.apply(lambda rated: recall_at_n(topN, rated, mostRatedBeers[:,0])).values)

0.048424083922409704

In [36]:
np.mean(testUsersGrouped.apply(lambda rated: recall_at_n(topN, rated, positiveRatedBeers[:,0])).values)

0.050780102818708303

In [37]:
np.mean(testUsersGrouped.apply(lambda rated: recall_at_n(topN, rated, meanRateBeers[:,0])).values)

6.7296960147068474e-05

In [38]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    
    :param actual : A list of elements that are to be predicted 
    :param predicted : A list of predicted elements 
    :param k: The maximum number of predicted elements
    
    :return The average precision at k over the input lists
    """
    predicted = predicted[:k] # top-k predictions
    
    score = 0.0 # This will store the numerator
    num_hits = 0.0 # This will store the sum of rel(i)

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

In [39]:
rated = [" Short's Cup A Joe Coffee Creme Stout"," Frog's Hollow Double Pumpkin Ale"," DNR Belgian-Style Golden Ale"]
recommended = [" Founders KBS (Kentucky Breakfast Stout" , " Maduro Oatmeal Brown Ale"]

In [40]:
apk(rated, recommended, 1)

0.0

In [41]:
apk(rated, recommended, 2)

0.0

### looking for the MAP (mean avergae precision) using the top 5 beers


In [42]:
testUsersGrouped = test[test.overall>=4.0].groupby('profile_name')['beerId'].apply(list)

In [43]:
topN = 5
np.mean(testUsersGrouped.apply(lambda rated: apk(rated, mostRatedBeers[:,0], topN)).values)

0.017186622312681008

In [44]:
np.mean(testUsersGrouped.apply(lambda rated: apk(rated, positiveRatedBeers[:,0], topN)).values)

0.017697094376235598

In [45]:
np.mean(testUsersGrouped.apply(lambda rated: apk(rated, meanRateBeers[:,0], topN)).values)

1.3131114175037753e-05

# 2. Collaborative Filtering Recommender

### Co-occurrence Matrix

In [46]:
# create a dictionary of beers per user
BeersPerUser = train[train.overall>=4.0].groupby('profile_name')['beerId'].apply(np.array).to_dict()

# calculate the number of items in train
n_items = len(beer_dict.keys())

# co-ocurrance matrix will have shape=[n_items,n_items]
coMatrix = np.zeros((n_items, n_items)) # co-occurrence matrix
for user,beers in BeersPerUser.items():
    for m in beers:
        # update 1 row at a time
        coMatrix[m,beers] = coMatrix[m,beers] + 1

### Making predictions using the co-occurrence matrix

Recommendations based on the beers similarity

In [47]:
def co_occurrance_similarity(beerId, coocurrance, ntop=10):
    """
    Returns the top-N most similar items to a given one, based on the coocurrance matrix
    
    :param item_id: id of input item
    :param cooccurrance: 2-dim numpy array with the co-occurance matrix
    :param ntop: number of items to be retrieved
    
    :return top-N most similar items to the given item_id
    """
    similarItems = coocurrance[beerId,:]
    mostSimilar = np.argsort(similarItems)[::-1]  # sort in descending order the item ids
    mostSimilar = mostSimilar[1:ntop+1]
    
    return np.dstack((mostSimilar,similarItems[mostSimilar]))[0]

### Example of top 5 beers recommender if the customer likes the beer called # 100
** we use id=0 for the next example (queryBeerId = 0) 
for example: beer_dict [0]  --> beer name = ' # 100' **

In [48]:
queryBeerId = 0
Ntop = 5
print('For item "%s" top-%s recommendations are:' % (beer_dict[queryBeerId], Ntop))

similarItems = co_occurrance_similarity(queryBeerId, coMatrix, Ntop)
# let's print out the first Ntop recommendations
for r in similarItems:
    print(beer_dict[r[0]], r[1])

For item " # 100" top-5 recommendations are:
(' Stone Imperial Russian Stout', 28.0)
(' Old Rasputin Russian Imperial Stout', 27.0)
(' Pliny The Elder', 26.0)
(' Stone Ruination IPA', 25.0)
(' La Fin Du Monde', 25.0)


**Now, let use this function to make recommendations:**

In [49]:
def co_occurrance_recommendation(beerId, cooccurrance, ntop=5):
    list_sim_items = np.vstack([co_occurrance_similarity(id, cooccurrance, ntop) for id in beerId])
    sorted_list = np.sort(list_sim_items, axis=0)[::-1]
    # We have to remove duplicates
    unique_items = np.unique(sorted_list[:,0])[:ntop]
    return unique_items    

In [50]:
trainUsersGrouped = train[train.overall>=4.0].groupby('profile_name')['beerId'].apply(list)
trainUsersGrouped.head()

profile_name
 007                                     [14698, 33574, 27418]
 00766                                                 [16376]
 01001111                                  [7996, 26422, 7696]
 0110x011    [24859, 19597, 6540, 20592, 16672, 1185, 1184,...
 01121987    [33774, 11597, 1035, 28077, 9405, 3375, 23660,...
Name: beerId, dtype: object

In [51]:
co_occurrance_recommendation(trainUsersGrouped[1], coMatrix, 3)

array([  5883.,  23132.,  32974.])

In [52]:
Ntop = 5
predictions = trainUsersGrouped.apply(lambda l: co_occurrance_recommendation(l, coMatrix, Ntop))
predictions[:4]

profile_name
 007          [545.0, 12152.0, 15632.0, 18457.0, 23132.0]
 00766       [5883.0, 12152.0, 23132.0, 30516.0, 32974.0]
 01001111    [3336.0, 12152.0, 18457.0, 23132.0, 23593.0]
 0110x011          [477.0, 545.0, 1007.0, 1013.0, 1184.0]
Name: beerId, dtype: object

In [54]:
for (rated, recom) in zip(testUsersGrouped, predictions)[:3]:
    print("*"*6)
    print("rated items: ")
    print([beer_dict[i] for i in rated])
    print("Recommended items: ")
    print([beer_dict[i] for i in recom])

******
rated items: 
[' HopDevil Ale', ' Fraoch Heather Ale']
Recommended items: 
[' 90 Minute IPA', ' Founders Breakfast Stout', ' HopDevil Ale', ' La Fin Du Monde', ' Old Rasputin Russian Imperial Stout']
******
rated items: 
[' Aphrodisiaque']
Recommended items: 
[' Brooklyn Black Chocolate Stout', ' Founders Breakfast Stout', ' Old Rasputin Russian Imperial Stout', ' Stone Imperial Russian Stout', ' Two Hearted Ale']
******
rated items: 
[' Stone Imperial Russian Stout', ' Mirror Mirror', ' Stone Sublimely Self-Righteous Ale', ' Cantillon Saint Lamvinus', ' White Knuckle Double IPA', ' White Zin', ' Ommegang Rouge', ' Weihenstephaner Hefeweissbier', ' Racer X', ' The Bitter End Pale Ale', ' Double Daddy Imperial India Pale Ale', " Bell's Expedition Stout", ' Firestone 12 - Anniversary Ale', ' Sculpin India Pale Ale', ' Tovarish Imperial Espresso Stout', ' AleSmith Old Numbskull', ' Melange No. 1', " The Angel's Share - Bourbon Barrel-Aged", ' Bottleworks 10th Anniversary Wild Ale',

### Evalute the recommendation

In [55]:
topN = 5
predictions = trainUsersGrouped.apply(lambda l: co_occurrance_recommendation(l, coMatrix, topN))

targets_predictions = zip(testUsersGrouped, predictions)
recall = np.mean([recall_at_n(topN, rated, recommended) for (rated, recommended) in targets_predictions])
map_ = np.mean([apk(rated, recommended, topN) for (rated, recommended) in targets_predictions])

print("Recall=%.3f; MAP=%.3f" %(recall, map_))

Recall=0.025; MAP=0.013
