In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
from sklearn import metrics
import pickle
import random

# Problem dimensions. Every matrix below gets one extra row/column so that
# ids can be used directly as indices (presumably ids start at 1 -- confirm).
m, n = 1000, 1000  # (number of users, number of items)

# Directory prefix for all input/output files; set to '' in an Azure notebook.
path = 'library/'

# Bookkeeping containers, filled in by later cells.
filled_rating = list()   # [user, item, rating] triples for every known rating
random_location = set()  # indices into filled_rating used as the test split

# Dense matrices, zero-initialised.
A = np.zeros((m + 1, n + 1))              # user_item rating matrix
item_item_sim = np.zeros((n + 1, n + 1))  # item-item similarity
user_user_sim = np.zeros((m + 1, m + 1))  # user-user similarity


def gt_col(mat, col):
    """Return column ``col`` of the 2-D array ``mat``.

    The result comes from basic NumPy slicing, so for an ndarray it is a
    view onto ``mat`` rather than a copy.
    """
    column = mat[:, col]
    return column


def gt_row(mat, row):
    """Return row ``row`` of the 2-D array ``mat``.

    The result comes from basic NumPy slicing, so for an ndarray it is a
    view onto ``mat`` rather than a copy.
    """
    selected = mat[row, :]
    return selected


def magnitude(v):
    """Return the norm of ``v`` as computed by ``np.linalg.norm`` with defaults.

    For a 1-D vector this is the Euclidean (L2) length.
    """
    length = np.linalg.norm(v)
    return length


def cosine_sim(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as one minus SciPy's cosine *distance*. Callers in this
    notebook check for zero-length vectors before calling, since the
    cosine is undefined for them.
    """
    distance = spatial.distance.cosine(v1, v2)
    return 1 - distance


In [2]:
# Load the artefacts pickled by the "Only execute if ..." cells below, so that
# the expensive generation steps do not have to be re-run.
#
# Each file is opened in a `with` block so the handle is closed even if
# unpickling raises.
#
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that you generated yourself with this notebook.
with open(path + 'user_item.pckl', 'rb') as f:
    A = pickle.load(f)

with open(path + 'filled_rating.pckl', 'rb') as f:
    filled_rating = pickle.load(f)

with open(path + 'user_user_sim.pckl', 'rb') as f:
    user_user_sim = pickle.load(f)

with open(path + 'item_item_sim.pckl', 'rb') as f:
    item_item_sim = pickle.load(f)

with open(path + 'random_location.pckl', 'rb') as f:
    random_location = pickle.load(f)


In [3]:
# Only execute if you want to generate the filled_rating or A matrix
#
# Reads the raw ratings CSV (columns: userId, movieId, rating), fills the dense
# user-item matrix A, records every known rating as a [user, item, rating]
# triple in filled_rating, and persists both as pickle and CSV.
sample_data = pd.read_csv(path + "sampleData.csv")

A = np.zeros([m + 1, n + 1])
# Start from an empty list: without this reset, running the cell after the
# load cell (or running it twice) would append every rating a second time.
filled_rating = []

# Walk the three columns in lock-step instead of doing a label-based Series
# lookup per row; .values keeps the same NumPy scalar types as before.
for i, j, v in zip(sample_data['userId'].values,
                   sample_data['movieId'].values,
                   sample_data['rating'].values):
    A[i][j] = v
    filled_rating.append([i, j, v])

# `with` guarantees the files are closed even if pickling fails.
with open(path + 'user_item.pckl', 'wb') as user_item_pkl:
    pickle.dump(A, user_item_pkl)

with open(path + 'filled_rating.pckl', 'wb') as filled_rating_pkl:
    pickle.dump(filled_rating, filled_rating_pkl)

np.savetxt(path + "user_item.csv", A, delimiter=",")  # saving user item rating matrix
# Previously this line wrote A a second time; it must write the rating triples.
np.savetxt(path + "filled_rating.csv", filled_rating, delimiter=",")  # saving places where the rating is present

In [4]:
# Only execute if you want to generate the item_item_sim
#
# Builds the (n+1) x (n+1) matrix of cosine similarities between the item
# columns of A, then persists it as pickle and CSV.
item_item_sim = np.zeros([n+1, n+1])

# The norm of each item column does not change inside the nested loop, so
# compute it once here instead of (n+1)^2 times.
item_norms = [magnitude(gt_col(A, k)) for k in range(0, n+1)]

for i in range(0, n+1):
    if item_norms[i] == 0:
        # Cosine is undefined for an all-zero column; its row stays 0.
        continue
    v1 = gt_col(A, i)
    for j in range(0, n+1):
        if item_norms[j] != 0:
            item_item_sim[i][j] = cosine_sim(v1, gt_col(A, j))

# `with` guarantees the file is closed even if pickling fails.
with open(path + 'item_item_sim.pckl', 'wb') as item_item_sim_pkl:
    pickle.dump(item_item_sim, item_item_sim_pkl)

np.savetxt(path + "item_item_similarity.csv", item_item_sim, delimiter=",")  # saving item-item similarity

In [5]:
# Only execute if you want to generate the user_user_sim
#
# Builds the (m+1) x (m+1) matrix of cosine similarities between the user
# rows of A, then persists it as pickle and CSV.
user_user_sim = np.zeros([m+1, m+1])

# The norm of each user row does not change inside the nested loop, so
# compute it once here instead of (m+1)^2 times.
user_norms = [magnitude(gt_row(A, k)) for k in range(0, m+1)]

for i in range(0, m+1):
    if user_norms[i] == 0:
        # Cosine is undefined for an all-zero row; its similarity row stays 0.
        continue
    v1 = gt_row(A, i)
    for j in range(0, m+1):
        if user_norms[j] != 0:
            user_user_sim[i][j] = cosine_sim(v1, gt_row(A, j))

# `with` guarantees the file is closed even if pickling fails.
with open(path + 'user_user_sim.pckl', 'wb') as user_user_sim_pkl:
    pickle.dump(user_user_sim, user_user_sim_pkl)

np.savetxt(path + "user_user_similarity.csv", user_user_sim, delimiter=",")  # saving user user similarity

In [4]:
# Sanity check: number of known [user, item, rating] triples currently held in filled_rating.
print(len(filled_rating))

39744


In [37]:
# Only execute if you want to (re)generate random_location, the held-out test split
#
# Draws 20% of the positions in filled_rating, uniformly and without
# replacement, and pickles the resulting set of indices.
lth = len(filled_rating)
test_size = int(0.2*lth)

# random.randint(1, lth) is inclusive on both ends: it could return lth, which
# is not a valid index into filled_rating, and could never return 0. Sampling
# from range(lth) yields only valid 0-based indices. Building the set in one
# call also always terminates, whereas the former `while len(...) != k` loop
# would spin forever if a previously loaded set was already larger than k.
# NOTE: no seed is set, so each run produces a different split; the pickle
# below is what keeps the split stable between sessions.
random_location = set(random.sample(range(lth), test_size))

# `with` guarantees the file is closed even if pickling fails.
with open(path + 'random_location.pckl', 'wb') as random_location_pkl:
    pickle.dump(random_location, random_location_pkl)

In [25]:
# Evaluate item-based collaborative filtering on the held-out locations.
#
# For every index in random_location, the known rating is hidden, predicted as
# the similarity-weighted average of the user's other ratings, and compared
# with the true value via mean absolute error.
#
# NOTE(review): if item_item_sim was built from the full A (as the generation
# cell in this notebook does), it already "saw" the held-out ratings, so the
# reported MAE is likely optimistic -- confirm whether that is intended.
predicted = []
actual = []
for r in random_location:
    active_user, pred_item, actual_rating = filled_rating[r]

    # Hide the target rating on a private copy of the user's row rather than
    # zeroing and restoring the entry in the shared matrix A.
    user_ratings = gt_row(A, active_user).copy()
    user_ratings[pred_item] = 0
    item_sims = gt_row(item_item_sim, pred_item)

    active_user_item_dot = np.dot(user_ratings, item_sims)
    # Sum of similarities over the items this user actually rated.
    similarity_sum = np.sum(np.multiply(user_ratings.astype(bool), item_sims))

    if similarity_sum != 0:
        predicted.append(active_user_item_dot/similarity_sum)
        # Record the ground truth only when a prediction exists; appending it
        # unconditionally left the two lists with different lengths (and
        # misaligned) whenever a location could not be predicted.
        actual.append(actual_rating)

print(metrics.mean_absolute_error(actual, predicted))
sparsity_level = 1 - len(filled_rating)/(n*m)
print(sparsity_level)

0.7447538466324093
0.960256
