In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("data/ratings.csv")


In [29]:
df

Unnamed: 0,user_id,auction_id,rating
0,0x01,3,4
1,0x02,1,4
2,0x03,4,5
3,0x03,3,3
4,0x04,2,3
5,0x04,4,4
6,0x04,5,5
7,0x05,4,5
8,0x05,1,1
9,0x01,1,5


In [30]:
# Pivot the DataFrame to create the utility matrix
utility_matrix = df.pivot(index='user_id', columns='auction_id', values='rating')

utility_matrix

auction_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x01,5.0,,4.0,,
0x02,4.0,,,,4.0
0x03,,2.0,3.0,5.0,
0x04,4.0,3.0,,4.0,5.0
0x05,1.0,,,5.0,2.0
0x06,5.0,4.0,,,
0x07,2.0,1.0,,,3.0


In [31]:
# utility matrix with NAN values for later use
utility_matrix_with_na = utility_matrix

In [32]:
# Fill NaN values with 0 (or any other value that represents no interaction)
utility_matrix = utility_matrix.fillna(0)

utility_matrix

auction_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x01,5.0,0.0,4.0,0.0,0.0
0x02,4.0,0.0,0.0,0.0,4.0
0x03,0.0,2.0,3.0,5.0,0.0
0x04,4.0,3.0,0.0,4.0,5.0
0x05,1.0,0.0,0.0,5.0,2.0
0x06,5.0,4.0,0.0,0.0,0.0
0x07,2.0,1.0,0.0,0.0,3.0


In [33]:
utility_matrix.iloc[0,0]

5.0

In [34]:
user_mean = utility_matrix.mean(axis=0)
user_removed_mean_rating = (utility_matrix - user_mean).fillna(0)

In [35]:
user_removed_mean_rating

auction_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x01,2.0,-1.428571,3.0,-2.0,-2.0
0x02,1.0,-1.428571,-1.0,-2.0,2.0
0x03,-3.0,0.571429,2.0,3.0,-2.0
0x04,1.0,1.571429,-1.0,2.0,3.0
0x05,-2.0,-1.428571,-1.0,3.0,0.0
0x06,2.0,2.571429,-1.0,-2.0,-2.0
0x07,-1.0,-0.428571,-1.0,-2.0,1.0


In [36]:
def find_neighbours(user_removed_mean_rating, user_id):
    n_users = len(user_removed_mean_rating.index)
    # print(n_users)

    similarities = np.zeros(n_users)
    # print(similarities)

    target_user = user_removed_mean_rating.loc[user_id].values.reshape(1, -1)
    # print(target_user)

    for i, user_index in enumerate(user_removed_mean_rating.index):

        # print(user_removed_mean_rating.loc[user_index].values)

        neighbour_user = user_removed_mean_rating.loc[user_index].values.reshape(1, -1)

        similarities[i] = cosine_similarity(target_user, neighbour_user)[0,0]
        # print(i, similarities[i])

    sorted_idx = np.argsort(similarities)[::-1]
    # print(sorted_idx)

    closest_neighbours = user_removed_mean_rating.index[sorted_idx[1:]].tolist()
    # print(closest_neighbours)

    similarities = np.sort(similarities)[::-1]
    closest_neighbors_similarities = similarities[1:]
    # print(closest_neighbors_similarities)

    return {
        "closest_neighbours": closest_neighbours,
        "closest_neighbors_similarities": closest_neighbors_similarities
    }

In [37]:
def baseline_prediction(data, user_id, auction_id):
    """Function to calculate baseline prediction from user and movie """

    # calculate global mean
    global_mean = data.stack().dropna().mean()

    # calculate user mean
    user_mean = data.loc[user_id, :].mean()

    # calculate item mean
    item_mean = data.loc[:, auction_id].mean()

    # calculate user bias
    user_bias = global_mean - user_mean

    # calculate item bias
    item_bias = global_mean - item_mean

    # calculate baseline
    baseline_ui = global_mean + user_bias + item_bias

    return baseline_ui

In [38]:
def predict_item_rating(user_id, auction_id, data, neighbor_data,
                        max_rating=5, min_rating=1):
    """Function to predict rating on user_id and auction_id"""

    # calculate baseline (u,i)
    baseline = baseline_prediction(data=data,
                                   user_id=user_id, auction_id=auction_id)
    # for sum
    sim_rating_total = 0
    similarity_sum = 0
    k = len(neighbor_data['closest_neighbours'])
    # loop all over neighbor
    for i in range(k):
        # retrieve rating from neighbor
        neighbour_rating = data.loc[neighbor_data['closest_neighbours'][i], auction_id]

        # skip if nan
        if np.isnan(neighbour_rating):
            continue

        # calculate baseline (ji)
        baseline = baseline_prediction(data=data,
                                       user_id=neighbor_data['closest_neighbours'][i], auction_id=auction_id)

        # subtract baseline from rating
        adjusted_rating = neighbour_rating - baseline

        # multiply by similarity
        sim_rating = neighbor_data['closest_neighbors_similarities'][i] * adjusted_rating

        # sum similarity * rating
        sim_rating_total += sim_rating

        #
        similarity_sum += neighbor_data['closest_neighbors_similarities'][i]

    # avoiding ZeroDivisionError
    try:
        user_item_predicted_rating = baseline + (sim_rating_total / similarity_sum)

    except ZeroDivisionError:
        user_item_predicted_rating = baseline

    # checking the boundaries of rating,
    if user_item_predicted_rating > max_rating:
        user_item_predicted_rating = max_rating

    elif user_item_predicted_rating < min_rating:
        user_item_predicted_rating = min_rating

    return user_item_predicted_rating

In [39]:
def recommend_items(data, user_id, recommend_seen=False):
    """ Function to generate recommendation on given user_id """

    # find neighbor
    neighbor_data = find_neighbours(user_removed_mean_rating=user_removed_mean_rating, user_id=user_id)

    # create empty dataframe to store prediction result
    prediction_df = pd.DataFrame()
    # create list to store prediction result
    predicted_ratings = []

    # mask seen item
    mask = np.isnan(data.loc[user_id])
    # print(data.loc[user_id])
    item_to_predict = data.columns[mask]

    if recommend_seen:
        item_to_predict = data.columns

    # loop all over movie
    for movie in item_to_predict:
        # predict rating
        preds = predict_item_rating(user_id=user_id, auction_id=movie,
                                    data=data,
                                    neighbor_data=neighbor_data)

        # append
        predicted_ratings.append(preds)

    # assign auction_id
    prediction_df['auction_id'] = data.columns[mask]

    # assign prediction result
    prediction_df['predicted_ratings'] = np.round(predicted_ratings, 2)

    #
    prediction_df = (prediction_df
                     .sort_values('predicted_ratings', ascending=False))

    return prediction_df

In [40]:
for user_id in utility_matrix.index:
    recommendations = recommend_items(data=utility_matrix_with_na, user_id=user_id, recommend_seen=False)
    for auction_id in recommendations['auction_id']:
        # print(user_id)
        utility_matrix.loc[user_id, auction_id] = float(recommendations[recommendations['auction_id'] == auction_id]['predicted_ratings'].iloc[0])
        # print(recommendations[recommendations['auction_id']==auction_id]['predicted_ratings'])

utility_matrix

auction_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x01,5.0,1.16,4.0,3.89,2.67
0x02,4.0,5.0,2.84,4.82,4.0
0x03,4.8,2.0,3.0,5.0,4.58
0x04,4.0,3.0,3.51,4.0,5.0
0x05,1.0,5.0,1.0,5.0,2.0
0x06,5.0,4.0,1.0,5.0,2.91
0x07,2.0,1.0,3.56,4.78,3.0


In [41]:
user_id='0x06'

# Example recommendation after adding new rating
user_1_recommendation_updated = recommend_items(data=utility_matrix_with_na, user_id=user_id, recommend_seen=False)

print(f'Recommendations in descending order for user {user_id}')
user_1_recommendation_updated.head()

Recommendations in descending order for user 0x06


Unnamed: 0,auction_id,predicted_ratings
1,4,5.0
2,5,2.91
0,3,1.0
