In [94]:
import pandas as pd
import pymc as pm
import numpy as np
import os

from sklearn.model_selection import train_test_split

os.environ["PATH"] += os.pathsep + 'C:\Program Files\Graphviz\\bin'

In [3]:
# Read and clean the data
df_reviews = pd.read_csv('./data/tourism_rating.csv')
df_reviews['User_Id'] = df_reviews['User_Id'] - 1
df_reviews['Place_Id'] = df_reviews['Place_Id'] - 1

In [None]:
# Sanity_check (Users from 0-9 and places with the following id)
place_ids = [87, 207, 3, 165, 320, 201, 300, 29, 104, 66]

df = df_reviews[(df_reviews['User_Id'] < 10) & (df_reviews['Place_Id'].isin(place_ids))]
df['Place_Id'] = df.groupby(['Place_Id']).ngroup()

df = df.reset_index(drop=True)

In [4]:
# Sanity check 2
df = df_reviews[(df_reviews['User_Id'] < 120) & (df_reviews['Place_Id'] < 100)]

In [33]:
num_users = df['User_Id'].nunique()
num_places = df['Place_Id'].nunique()
num_categories = 20
num_samples = 1000
num_burnin = 800
num_chains = 4
num_cores = 4
num_recommend = 5

In [6]:
# Make constraints
observed_ratings = np.zeros((num_users, num_places))

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [8]:
for _, row in df_train.iterrows():
    observed_ratings[row['User_Id']][row['Place_Id']] = row['Place_Ratings']

In [9]:
a = c = 0.15
a_prime = c_prime = 0.30
b_prime = d_prime = 0.15

In [11]:
with pm.Model() as model:

    # User activity
    xi = pm.Gamma('xi', alpha=a_prime, beta=a_prime/b_prime, shape=(num_users, 1))

    # User preferences
    theta = pm.Gamma('theta', alpha=a, beta=xi, shape=(num_users, num_categories))

    # Place popularity
    eta = pm.Gamma('eta', alpha=c_prime, beta=c_prime/d_prime, shape=(num_places, 1))

    # Place attributes
    beta = pm.Gamma('beta', alpha=c, beta=eta, shape=(num_places, num_categories))

    # Ratings
    y_ui = pm.Poisson('y_ui', mu=pm.math.dot(theta, beta.T), observed=observed_ratings)

In [12]:
# Fit the model
with model:
    # trace = pm.sample(1000, tune=1000, chains=1, cores=8, nuts={'target_accept': 0.9})
    step = pm.Metropolis()
    trace = pm.sample(num_samples, tune=num_burnin, chains=num_chains, cores=num_cores, step=step)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [xi, theta, eta, beta]


Sampling 4 chains for 800 tune and 1_000 draw iterations (3_200 + 4_000 draws total) took 237 seconds.


In [13]:
# Extract theta and beta samples
theta_samples = np.concatenate([sublist for sublist in trace['posterior']['theta']], axis=0)
beta_samples = np.concatenate([sublist for sublist in trace['posterior']['beta']], axis=0)

In [35]:
def makePredictions(t_samples, b_samples, data):
    prediction_matrix = np.zeros((num_users, num_places))

    len_samples = t_samples.shape[0]

    for _, row in data.iterrows():
        user_id = int(row['User_Id'])
        place_id = int(row['Place_Id'])
        ratings = []

        for i in range(len_samples):
            rating = np.dot(t_samples[i][user_id], b_samples[i][place_id].T)
            ratings.append(rating)

        prediction_matrix[user_id][place_id] = np.mean(ratings)
    
    return prediction_matrix

In [83]:
def RMSE(predictions, data):
    total_error = 0

    for _, row in data.iterrows():
        user_id = int(row['User_Id'])
        place_id = int(row['Place_Id'])

        predicted_rating = predictions[user_id][place_id]
        error = (predicted_rating - row['Place_Ratings'] + 1) ** 2
        total_error += error
    
    return np.sqrt(total_error / data.shape[0])

In [86]:
def oneOffAcc(predictions, data):
    correct = 0

    for _, row in data.iterrows():
        if abs(predictions[row['User_Id']][row['Place_Id']] - row['Place_Ratings'] + 1) <= 1:
            correct += 1
    
    return correct / data.shape[0]

In [92]:
prediction_ratings_train = makePredictions(theta_samples, beta_samples, df_train)
print(f'{RMSE(prediction_ratings_train, df_train)=}, {oneOffAcc(prediction_ratings_train, df_train)=}')

RMSE(prediction_ratings_train, df_train)=1.4884457711336077, oneOffAcc(prediction_ratings_train, df_train)=0.5095108695652174


In [93]:
prediction_ratings = makePredictions(theta_samples, beta_samples, df_test)
print(f'{RMSE(prediction_ratings, df_test)=}, {oneOffAcc(prediction_ratings, df_test)=}')

RMSE(prediction_ratings, df_test)=2.3573479064728398, oneOffAcc(prediction_ratings, df_test)=0.32432432432432434


In [70]:
def makeRecommandations(t_samples, b_samples, data):
    users = data['User_Id'].unique()
    places = data['Place_Id'].unique()
    len_samples = theta_samples.shape[0]
    recommandations = []

    for user_id in users:
        place_ratings = []

        for place_id in places:
            ratings = []

            for i in range(len_samples):
                rating = np.dot(t_samples[i][user_id], b_samples[i][place_id].T)
                ratings.append(rating)

            place_ratings.append((place_id, np.mean(ratings)))
        
        place_ratings = sorted(place_ratings, key=lambda x: x[1], reverse=True)
        recommandations.append((user_id, [r[0] for r in place_ratings[:5]]))

    return recommandations

In [71]:
recommandations = makeRecommandations(theta_samples, beta_samples, df_train)

In [74]:
recommandations = pd.DataFrame(recommandations)
recommandations.columns = ['User_Id', 'Recommandations']

In [77]:
recommandations.head(10)

Unnamed: 0,User_Id,Recommandations
0,53,"[15, 36, 23, 45, 46]"
1,31,"[78, 10, 1, 18, 75]"
2,104,"[68, 96, 0, 25, 85]"
3,48,"[79, 30, 48, 67, 40]"
4,13,"[66, 87, 49, 7, 53]"
5,14,"[66, 89, 22, 94, 8]"
6,39,"[20, 36, 62, 23, 48]"
7,51,"[52, 69, 85, 46, 99]"
8,90,"[32, 7, 75, 12, 53]"
9,61,"[48, 70, 46, 49, 45]"
