In [86]:
import pandas as pd
import pymc as pm
import numpy as np
import os

from sklearn.model_selection import train_test_split

# Make the Graphviz binaries discoverable on this (Windows) machine.
# A raw string is used so the backslashes are taken literally; the previous
# literal relied on the invalid escapes "\P" and "\G", which newer Python
# versions warn about. The resulting PATH entry is unchanged.
os.environ["PATH"] += os.pathsep + r'C:\Program Files\Graphviz\bin'

In [87]:
# Load the raw reviews and keep only the first review for each
# (reviewer, beer) pair; the index is renumbered after the drop.
df_raw = (
    pd.read_csv('./data/beer_reviews.csv')
    .drop_duplicates(subset=['review_profilename', 'beer_beerid'], keep='first')
    .reset_index(drop=True)
)

In [88]:
# The 100 profile names with the most reviews.
top_100_reviewers = df_raw['review_profilename'].value_counts().index[:100]

# Among the reviews written by those reviewers, the 100 most-reviewed beers.
top_100_beers = (
    df_raw.loc[df_raw['review_profilename'].isin(top_100_reviewers), 'beer_beerid']
    .value_counts()
    .index[:100]
)

In [89]:
# Keep only reviews where both the beer and the reviewer are in the top-100
# sets, retain the three columns the model needs, and give them short names.
df = (
    df_raw.loc[
        df_raw['beer_beerid'].isin(top_100_beers)
        & df_raw['review_profilename'].isin(top_100_reviewers),
        ['review_profilename', 'beer_beerid', 'review_overall'],
    ]
    .reset_index(drop=True)
    .rename(columns={
        'review_profilename': 'user_id',
        'beer_beerid': 'beer_id',
        'review_overall': 'review',
    })
)

In [90]:
# Map the review score r to the integer 2*r - 1.
# NOTE: this overwrites df['review'], so re-running the cell applies the
# transformation a second time.
df['review'] = (2 * df['review'] - 1).astype('int64')

# Replace the user and beer identifiers with integer category codes, which are
# later used directly as row/column indices.
for id_column in ('user_id', 'beer_id'):
    df[id_column] = df[id_column].astype('category').cat.codes.astype('int64')

In [114]:
# Problem dimensions, read off the filtered review table.
num_users = df['user_id'].nunique()
num_beers = df['beer_id'].nunique()
# Width of the latent factor vectors (second dimension of theta and beta).
num_categories = 20

# Sampler settings: posterior draws, tuning steps, chains and worker processes.
num_samples = 1_000
num_burnin = 800
num_chains = 4
num_cores = 4

# Presumably the number of beers to recommend per user; no visible cell reads it.
num_recommend = 5

# Gamma hyperparameters: `a` / `c` are the shapes of theta / beta; the primed
# values define the shape and rate (shape / b_prime, shape / d_prime) of the
# per-user and per-beer rate variables xi and eta.
a = 0.3
c = 0.3
a_prime = 0.3
c_prime = 0.3
b_prime = 0.15
d_prime = 0.15

In [104]:
# Dense users x beers rating matrix, all zeros until the training reviews are
# written into it.
observed_ratings = np.zeros(shape=(num_users, num_beers), dtype=np.float64)

In [105]:
# Hold out 20% of the reviews for evaluation. The seed is fixed so that the
# split (and therefore every metric computed below) is reproducible across
# kernel restarts.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [106]:
# Rebuild the rating matrix from scratch before filling it, so that re-running
# the split and this cell cannot leave entries from a previous training set
# behind. Pairs that are not in df_train stay at 0.
observed_ratings = np.zeros((num_users, num_beers))

# One vectorised assignment replaces the row-by-row loop: entry
# [user_id, beer_id] receives that row's review value.
observed_ratings[
    df_train['user_id'].to_numpy(),
    df_train['beer_id'].to_numpy(),
] = df_train['review'].to_numpy()

In [107]:
# Gamma-Poisson factorisation of the rating matrix (the structure appears to
# follow hierarchical Poisson factorisation - confirm against the intended
# reference). Every cell of observed_ratings, including the zeros of pairs that
# were never filled in, is passed to the likelihood as an observation.
with pm.Model() as model:
    # Per-user rate, broadcast over the latent dimensions of theta.
    xi = pm.Gamma('xi', shape=(num_users, 1), alpha=a_prime, beta=a_prime / b_prime)
    # User factors.
    theta = pm.Gamma('theta', shape=(num_users, num_categories), alpha=a, beta=xi)

    # Per-beer rate, broadcast over the latent dimensions of beta.
    eta = pm.Gamma('eta', shape=(num_beers, 1), alpha=c_prime, beta=c_prime / d_prime)
    # Beer factors.
    beta = pm.Gamma('beta', shape=(num_beers, num_categories), alpha=c, beta=eta)

    # Poisson mean for every (user, beer) cell: theta @ beta^T.
    expected_ratings = pm.math.dot(theta, beta.T)
    y_ui = pm.Poisson('y_ui', mu=expected_ratings, observed=observed_ratings)

In [108]:
# Fit the model with NUTS (PyMC's automatic choice for this model, per the
# recorded log). The recorded run took roughly 39 minutes, so the random seed
# is fixed to make the resulting trace reproducible instead of changing on
# every execution.
with model:
    trace = pm.sample(
        num_samples,
        tune=num_burnin,
        chains=num_chains,
        cores=num_cores,
        nuts={'target_accept': 0.9},
        random_seed=42,
    )

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [xi, theta, eta, beta]


Sampling 4 chains for 800 tune and 1_000 draw iterations (3_200 + 4_000 draws total) took 2310 seconds.


In [109]:
# Extract theta and beta samples: join the slices along the first posterior
# axis (presumably one slice per chain) end to end, so the leading axis of each
# result enumerates every posterior sample.
theta_samples = np.concatenate(trace['posterior']['theta'].to_numpy(), axis=0)
beta_samples = np.concatenate(trace['posterior']['beta'].to_numpy(), axis=0)

In [110]:
def makePredictions(t_samples, b_samples, data):
    """Predict a rating for every (user, beer) pair listed in ``data``.

    The prediction for a pair is the mean, over all posterior samples, of the
    dot product between that user's and that beer's latent vectors.

    Parameters
    ----------
    t_samples : array-like, shape (n_samples, n_users, n_categories)
        Posterior samples of the user factors.
    b_samples : array-like, shape (n_samples, n_beers, n_categories)
        Posterior samples of the beer factors.
    data : pandas.DataFrame
        Must contain integer-valued 'user_id' and 'beer_id' columns.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_beers)
        Predicted ratings at the (user, beer) positions present in ``data``;
        every other entry is 0.
    """
    t_samples = np.asarray(t_samples)
    b_samples = np.asarray(b_samples)

    # Size the result from the samples themselves rather than from notebook
    # globals, so the function only depends on its arguments.
    prediction_matrix = np.zeros((t_samples.shape[1], b_samples.shape[1]))

    user_ids = data['user_id'].to_numpy().astype(int)
    beer_ids = data['beer_id'].to_numpy().astype(int)

    for user_id, beer_id in zip(user_ids, beer_ids):
        # Row-wise dot product across all samples at once (one value per sample).
        per_sample = np.einsum('sk,sk->s', t_samples[:, user_id], b_samples[:, beer_id])
        prediction_matrix[user_id, beer_id] = per_sample.mean()

    return prediction_matrix

In [111]:
def RMSE(predictions, data):
    """Root-mean-square error between predicted and actual reviews.

    Parameters
    ----------
    predictions : array-like, shape (n_users, n_beers)
        Predicted rating for each (user, beer) position.
    data : pandas.DataFrame
        Must contain 'user_id', 'beer_id' and 'review' columns; each row is one
        rating to score.

    Returns
    -------
    numpy.float64
        sqrt(mean((prediction - review) ** 2)) over the rows of ``data``, or
        NaN when ``data`` has no rows (instead of dividing by zero).
    """
    if data.shape[0] == 0:
        return np.float64(np.nan)

    user_ids = data['user_id'].to_numpy().astype(int)
    beer_ids = data['beer_id'].to_numpy().astype(int)

    # Look up all predictions in one indexing operation instead of row by row.
    predicted = np.asarray(predictions)[user_ids, beer_ids]
    squared_errors = (predicted - data['review'].to_numpy()) ** 2

    return np.sqrt(np.mean(squared_errors))

In [112]:
def oneOffAcc(predictions, data):
    """Fraction of rows whose prediction is within 1 of the actual review.

    Parameters
    ----------
    predictions : array-like, shape (n_users, n_beers)
        Predicted rating for each (user, beer) position.
    data : pandas.DataFrame
        Must contain 'user_id', 'beer_id' and 'review' columns.

    Returns
    -------
    float
        Share of rows with |prediction - review| <= 1, or NaN when ``data``
        has no rows (instead of dividing by zero).
    """
    n_rows = data.shape[0]
    if n_rows == 0:
        return float('nan')

    # Cast the ids explicitly: taking them from an iterrows() row fails as soon
    # as any column is float, because the whole row is then upcast to float and
    # can no longer be used as an index.
    user_ids = data['user_id'].to_numpy().astype(int)
    beer_ids = data['beer_id'].to_numpy().astype(int)

    predicted = np.asarray(predictions)[user_ids, beer_ids]
    within_one = np.abs(predicted - data['review'].to_numpy()) <= 1

    return int(np.count_nonzero(within_one)) / n_rows

In [115]:
# Score the model on the (user, beer) pairs it was fitted on: predict those
# pairs from the posterior samples, then report RMSE and within-one accuracy.
# The f-string "=" specifier prints each expression's source text next to its value.
prediction_ratings_train = makePredictions(theta_samples, beta_samples, df_train)
print(f'{RMSE(prediction_ratings_train, df_train)=}, {oneOffAcc(prediction_ratings_train, df_train)=}')

RMSE(prediction_ratings_train, df_train)=1.3552688875158467, oneOffAcc(prediction_ratings_train, df_train)=0.49025686448184236


In [116]:
# Score the model on the held-out pairs with the same two metrics.
# NOTE(review): held-out pairs were never written into observed_ratings, so
# they were presented to the Poisson likelihood as observed zeros. That likely
# explains why the recorded test scores are far worse than the training
# scores - confirm whether unrated pairs should be treated as observations.
prediction_ratings = makePredictions(theta_samples, beta_samples, df_test)
print(f'{RMSE(prediction_ratings, df_test)=}, {oneOffAcc(prediction_ratings, df_test)=}')

RMSE(prediction_ratings, df_test)=4.931532961809268, oneOffAcc(prediction_ratings, df_test)=0.005312868949232586


In [56]:
def makeRecommandations(t_samples, b_samples, data, num_recommend=5):
    """Rank beers for every user in ``data`` by mean predicted rating.

    For each distinct user in ``data`` the score of each distinct beer in
    ``data`` is the mean, over posterior samples, of the dot product between
    the user's and the beer's latent vectors. Beers the user already reviewed
    are not excluded.

    Parameters
    ----------
    t_samples : array-like, shape (n_samples, n_users, n_categories)
        Posterior samples of the user factors.
    b_samples : array-like, shape (n_samples, n_beers, n_categories)
        Posterior samples of the beer factors.
    data : pandas.DataFrame
        Must contain integer-valued 'user_id' and 'beer_id' columns.
    num_recommend : int, default 5
        Number of top-scoring beers to return per user.

    Returns
    -------
    list of (user_id, list of beer_id)
        One entry per distinct user, in order of first appearance in ``data``;
        beer ids are sorted by descending score, ties keeping the order in
        which the beers first appear in ``data``.
    """
    t_samples = np.asarray(t_samples)
    b_samples = np.asarray(b_samples)

    users = np.asarray(data['user_id'].unique())
    beers = np.asarray(data['beer_id'].unique())
    # Use the samples that were passed in, not a notebook-level global.
    len_samples = t_samples.shape[0]

    # Summing the products over both the sample and the category axis and
    # dividing by the number of samples gives the mean per-sample dot product
    # for every (user, beer) combination at once: shape (len(users), len(beers)).
    mean_scores = np.tensordot(
        t_samples[:, users],
        b_samples[:, beers],
        axes=([0, 2], [0, 2]),
    ) / len_samples

    recommandations = []
    for position, user_id in enumerate(users):
        # Stable sort on the negated scores = descending order with ties left
        # in their original order.
        best = np.argsort(-mean_scores[position], kind='stable')[:num_recommend]
        recommandations.append((user_id, list(beers[best])))

    return recommandations

In [44]:
# Rank beers for every user that appears in the training split.
recommandations = makeRecommandations(
    t_samples=theta_samples,
    b_samples=beta_samples,
    data=df_train,
)

In [45]:
# Turn the list of (user, beers) tuples into a two-column table.
recommandations = pd.DataFrame(recommandations).set_axis(
    ['user_id', 'Recommandations'], axis='columns'
)

In [46]:
# Preview the recommendations for the first ten users.
recommandations.head(n=10)

Unnamed: 0,user_id,Recommandations
0,78,"[44, 36, 67, 60, 42]"
1,56,"[50, 33, 37, 48, 13]"
2,96,"[47, 23, 11, 31, 28]"
3,12,"[37, 43, 48, 96, 97]"
4,66,"[60, 96, 68, 48, 73]"
5,5,"[67, 75, 48, 80, 73]"
6,89,"[34, 14, 27, 11, 0]"
7,45,"[30, 73, 44, 56, 96]"
8,1,"[51, 47, 67, 43, 56]"
9,62,"[98, 19, 85, 96, 78]"
