In [0]:
# The file u.data that contains the ratings is a tab separated list of user ID, item ID, rating, and timestamp.
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [0]:
# Read file into dataframe
df = pd.read_csv('u.data', sep='\t', header=None, names=['UserId', 'Item', 'Rating', 'Timestamp'])
df.drop('Timestamp', axis=1, inplace=True)


In [85]:
# convert dataframe to matrix
user_item_sparse_matrix = df.pivot_table(index=['UserId'], columns='Item', values='Rating')
print(user_item_sparse_matrix)



Item    1     2     3     4     5     6     ...  1677  1678  1679  1680  1681  1682
UserId                                      ...                                    
1        5.0   3.0   4.0   3.0   3.0   5.0  ...   NaN   NaN   NaN   NaN   NaN   NaN
2        4.0   NaN   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
3        NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
4        NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
5        4.0   3.0   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
...      ...   ...   ...   ...   ...   ...  ...   ...   ...   ...   ...   ...   ...
939      NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
940      NaN   NaN   NaN   2.0   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
941      5.0   NaN   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   NaN   NaN
942      NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   NaN   NaN   NaN   Na

In [0]:
#Replace nan with avg of column
def replace_nan_column_Avg(sparse_matrix):
    return sparse_matrix.fillna(sparse_matrix.mean(axis=0))

#Replace nan with avg of row
def replace_nan_row_Avg(sparse_matrix):
    return sparse_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)


In [0]:
def calculate_similarity(avg_replaced_matrix):
    cosine = cosine_similarity(avg_replaced_matrix)
    np.fill_diagonal(cosine, 0)
    similarity = pd.DataFrame(cosine, index=avg_replaced_matrix.index)
    similarity.columns = avg_replaced_matrix.index
    return similarity



In [0]:
def calculate_neighbors(df, n):
    df = df.apply(
        lambda val: pd.Series(val.sort_values(ascending=False).iloc[:n].index, index=[i for i in range(1, n + 1)]),
        axis=1)
    return df


In [0]:
def predict_user_based_rating(user_item_matrix, user_id, item_id, k_neighbors, item_avg_matrix, user_avg_matrix,
                              similarity_matrix):
    neighbor_user_values = k_neighbors[k_neighbors.index == user_id].values
    neighbor_user_list = neighbor_user_values.squeeze().tolist()

    item = item_avg_matrix.loc[:, item_id]

    neighbors = item[item.index.isin(neighbor_user_list)]
    non_null_neighbors = neighbors[neighbors.notnull()]

    user_avg = user_item_matrix['Average'][user_id]

    index = non_null_neighbors.index.values.squeeze().tolist()
    #similarity of user u and user v, v in neighbors of u
    user_similarity = similarity_matrix.loc[user_id, index]

    normal_neighbors_ratings = user_avg_matrix[item_id][index] - user_item_matrix['Average'][index]

    final_df = pd.concat([user_similarity, normal_neighbors_ratings], axis=1)
    final_df.columns = ['user_similarity', 'normalized_neighbor_ratings']
    final_df['score'] = final_df.apply(lambda x: x['user_similarity'] * x['normalized_neighbor_ratings'], axis=1)

    num = (final_df['normalized_neighbor_ratings'] * final_df['user_similarity']).sum()
    den = final_df['user_similarity'].sum()

    # predicted_rating(u,i) = avg_rating(u) + (sum(similarity(u,v) * (r(v,i) - avg_rating(v)))/sum(sim(u,v)))
    # where v belongs to neighbors of u
    predicted_rating = user_avg + (num / den)

    if predicted_rating < 0:
        predicted_rating = 0
    elif predicted_rating > 5:
        predicted_rating = 5

    return predicted_rating


In [0]:
def user_based_CF(user_item_matrix, neighbor_size, user_id, item_id):
    user_item_matrix['Average'] = user_item_matrix.mean(axis=1)

    item_avg_matrix = replace_nan_column_Avg(user_item_matrix)
    user_avg_matrix = replace_nan_row_Avg(user_item_matrix)

    user_similarity_for_item = calculate_similarity(item_avg_matrix)
    item_similarity_for_user = calculate_similarity(user_avg_matrix)

    user_k_neighbours = calculate_neighbors(item_similarity_for_user, neighbor_size)


    return predict_user_based_rating(user_item_matrix, user_id, item_id, user_k_neighbours, item_avg_matrix,
                                     user_avg_matrix, user_similarity_for_item)

In [0]:
def predict_item_based_rating(user_item_matrix, user_id, item_id, item_k_neighbours, item_avg_matrix, user_avg_matrix,
                              item_similarity_with_user):
    neighbor_item_values = item_k_neighbours[item_k_neighbours.index == item_id].values
    neighbor_item_list = neighbor_item_values.squeeze().tolist()


    user = user_avg_matrix.loc[:, user_id]

    neighbors = user[user.index.isin(neighbor_item_list)]
    non_null_neighbors = neighbors[neighbors.notnull()]

    item_avg = user_item_matrix['ItemAverage'][item_id]

    index = non_null_neighbors.index.values.squeeze().tolist()

    item_similarity = item_similarity_with_user.loc[item_id, index]

    # for item j in neighbors of item i ->  r(u,j) - avg_rating(j)
    normal_neighbors_ratings = user_avg_matrix[user_id][index] - user_item_matrix['ItemAverage'][index]

    fin_df = pd.concat([item_similarity, normal_neighbors_ratings], axis=1)
    fin_df.columns = ['item_similarity', 'normalized_neighbor_ratings']
    fin_df['score'] = fin_df.apply(lambda x: x['item_similarity'] * x['normalized_neighbor_ratings'], axis=1)

    num = (fin_df['score']).sum()
    den = fin_df['item_similarity'].sum()

    # predicted_rating(u,i) = avg_rating(i) + (sum(similarity(i,j) * (r(u,j) - avg_rating(j)))/sum(similarity(i,j)))
    # where j belongs to neighbors of i
    predicted_rating = item_avg + (num / den)

    if predicted_rating < 0:
        predicted_rating = 0
    elif predicted_rating > 5:
        predicted_rating = 5

    return predicted_rating



In [0]:
def item_based_CF(item_user_matrix, neighbor_size, user_id, item_id):
    item_user_matrix['ItemAverage'] = item_user_matrix.mean(axis=1)

    item_avg_matrix = replace_nan_row_Avg(item_user_matrix)
    user_avg_matrix = replace_nan_column_Avg(item_user_matrix)

    user_similarity_for_item = calculate_similarity(item_avg_matrix)
    item_similarity_for_user = calculate_similarity(user_avg_matrix)

    item_k_neighbours = calculate_neighbors(user_similarity_for_item, neighbor_size)

    return predict_item_based_rating(item_user_matrix, user_id, item_id, item_k_neighbours, item_avg_matrix,
                                     user_avg_matrix, item_similarity_for_user)


In [93]:
def predict_user_and_item_based_rating(user_item_matrix, neighbor_size, user_id, item_id):
    predicted_rating_user_based = user_based_CF(user_item_matrix, neighbor_size, user_id, item_id)
    print("Rating prediction using user_based CF: ", predicted_rating_user_based)

    user_item_matrix_transpose = user_item_matrix.T  # Transpose a matrix for item-based CF

    predicted_rating_item_based = item_based_CF(user_item_matrix_transpose, neighbor_size, user_id, item_id)
    print("Rating prediction using item_based CF: ", predicted_rating_item_based)


predict_user_and_item_based_rating(user_item_sparse_matrix, 20, 833, 474)
#predict_user_and_item_based_rating(user_item_sparse_matrix, 20, 238, 151)


Rating prediction using user_based CF:  3.2718230972889786
Rating prediction using item_based CF:  4.460948153061859
