In [28]:
import pandas as pd
import numpy as np
from scipy import spatial
from sklearn import metrics
import pickle
import random

m = 1000  # number of users
n = 1000  # number of items

A = np.zeros([m + 1, n + 1])  # user_item rating matrix
filled_rating = []
item_item_sim = np.zeros([n+1, n+1])
user_user_sim = np.zeros([m+1, m+1])
random_location = set()
path = 'library/' # make it empty in azure notebook

tag_dict = dict()
tag_set = set()

tag_user_sum_mat = {}
for u in range(0, m+1):
    tag_weight = {}
    for tag in tag_set:
        tag_weight[tag] = 0
    tag_user_sum_mat[u] = tag_weight
    
tag_user_count_mat = {}
for u in range(0, m+1):
    tag_weight = {}
    for tag in tag_set:
        tag_weight[tag] = 0
    tag_user_count_mat[u] = tag_weight


user_rating_sum = dict()
for u in range(0,m+1):
    user_rating_sum[u] = 0


def gt_col(mat, col):
    return mat[:, col]


def gt_row(mat, row):
    return mat[row, :]


def magnitude(v):
    return np.linalg.norm(v)


def cosine_sim(v1, v2):
    return 1 - spatial.distance.cosine(v1, v2)


# v: vector , p :projection vector
def bool_cast(v, p):
    return np.multiply(p.astype(bool),v)
    

def modified_cosine_sim(v1,v2):
    numerator = np.dot(v1, v2)
    denominator = magnitude(bool_cast(v1,v2))*magnitude(bool_cast(v2, v1)) 
    return numerator/denominator


def special_divide(numerator, denominator):
    if denominator == 0:
        return 0
    else:
        return numerator/denominator

In [113]:
f = open(path + 'user_item.pckl', 'rb')
A = pickle.load(f)
f.close()

f = open(path + 'filled_rating.pckl', 'rb')
filled_rating = pickle.load(f)
f.close()

f = open(path + 'user_user_sim.pckl', 'rb')
user_user_sim = pickle.load(f)
f.close()

f = open(path + 'item_item_sim.pckl', 'rb')
item_item_sim = pickle.load(f)
f.close()

f = open(path + 'random_location.pckl', 'rb')
random_location = pickle.load(f)
f.close()

f = open(path + 'tag_dict.pckl', 'rb')
tag_dict = pickle.load(f)
f.close()

f = open(path + 'tag_set.pckl', 'rb')
tag_set = pickle.load(f)
f.close()


f = open(path + 'tag_user_sum_mat.pckl', 'rb')
tag_user_sum_mat = pickle.load(f)
f.close()

f = open(path + 'tag_user_count_mat.pckl', 'rb')
tag_user_count_mat = pickle.load(f)
f.close()

f = open(path + 'user_rating_sum.pckl', 'rb')
user_rating_sum = pickle.load(f)
f.close()




In [3]:
# Only execute if you want to generate the filled_rating or A matrix
sample_data = pd.read_csv(path + "sampleData.csv")

A = np.zeros([m + 1, n + 1])

for row in range(0, sample_data.shape[0]):
    i = sample_data['userId'][row]
    j = sample_data['movieId'][row]
    v = sample_data['rating'][row]
    A[i][j] = v
    filled_rating.append([i, j, v])

user_item_pkl = open(path + 'user_item.pckl', 'wb')
pickle.dump(A, user_item_pkl )
user_item_pkl.close()

filled_rating_pkl = open(path + 'filled_rating.pckl', 'wb')
pickle.dump(filled_rating, filled_rating_pkl)
filled_rating_pkl.close()

np.savetxt(path + "user_item.csv", A, delimiter=",")  # saving user item rating matrix
np.savetxt(path + "filled_rating.csv", A, delimiter=",") # saving places where the rating is present

In [4]:
# Only execute if you want to generate the item_item_sim
item_item_sim = np.zeros([n+1, n+1])

for i in range(0, n+1):
    for j in range(0, n+1):
        v1 = gt_col(A, i)
        v2 = gt_col(A, j)
        if magnitude(v1) != 0 and magnitude(v2) != 0:
            item_item_sim[i][j] = cosine_sim(v1, v2)

item_item_sim_pkl = open(path + 'item_item_sim.pckl', 'wb')
pickle.dump(item_item_sim, item_item_sim_pkl)
item_item_sim_pkl.close()

np.savetxt(path + "item_item_similarity.csv", item_item_sim, delimiter=",") # saving item-item similarit

In [5]:
# Only execute if you want to generate the user_user_item
user_user_sim = np.zeros([m+1, m+1])
for i in range(0, m+1):
    for j in range(0, m+1):
        v1 = gt_row(A, i)
        v2 = gt_row(A, j)
        if magnitude(v1) != 0 and magnitude(v2) != 0:
            user_user_sim[i][j] = cosine_sim(v1, v2)

user_user_sim_pkl = open(path + 'user_user_sim.pckl', 'wb')
pickle.dump(user_user_sim, user_user_sim_pkl)
user_user_sim_pkl.close()

np.savetxt(path + "user_user_similarity.csv", user_user_sim, delimiter=",") # saving user user similarity

In [14]:
print(user_user_sim)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.27084462 ... 0.09492173 0.15947379 0.        ]
 [0.         0.27084462 1.         ... 0.11840056 0.1977887  0.        ]
 ...
 [0.         0.09492173 0.11840056 ... 1.         0.13510553 0.        ]
 [0.         0.15947379 0.1977887  ... 0.13510553 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [23]:
# Only execute if you want to generate the user_user_item
random_location = set()
lth = len(filled_rating)
while len(random_location) != int(0.2*lth):
    random_location.add(random.randint(1, lth))

random_location_pkl = open(path + 'random_location.pckl', 'wb')
pickle.dump(random_location, random_location_pkl)
random_location_pkl.close()

In [114]:
# it calculates the accuracy based on the 
predicted = []
actual = []
for r in random_location:
    location1 = filled_rating[r]
    active_user = location1[0]
    pred_item = location1[1]
    actual_rating = location1[2]
    
    A[active_user][pred_item] = 0
    active_user_item_dot = np.dot(gt_row(A,active_user), gt_row(item_item_sim,pred_item))
    similarity_sum = np.sum(np.multiply(gt_row(A,active_user).astype(bool), gt_row(item_item_sim,pred_item)))
    if similarity_sum != 0:
        p = active_user_item_dot/similarity_sum
        predicted.append(p)
        actual.append(actual_rating)
    A[active_user][pred_item] = actual_rating

print(metrics.mean_absolute_error(actual,predicted))

0.7447538466324093


In [95]:
# Only execute if you want to generate the tag set 
movies_csv = pd.read_csv(path + "movies.csv")
tag_dict = dict()
tag_set = set()
for row in range(0, movies_csv.shape[0]):
    mid = movies_csv['movieId'][row]
    tags = movies_csv['genres'][row]
    tag_arr = tags.split('|')
    tag_dict[mid] = tag_arr
    for t in tag_arr:
        tag_set.add(t)
        
tag_dict_pkl = open(path + 'tag_dict.pckl', 'wb')
pickle.dump(tag_dict, tag_dict_pkl)
tag_dict_pkl.close()

tag_set_pkl = open(path + 'tag_set.pckl', 'wb')
pickle.dump(tag_set, tag_set_pkl)
tag_set_pkl.close()

{'Adventure', '(no genres listed)', 'Thriller', 'Action', 'Mystery', 'Romance', 'Musical', 'Fantasy', 'War', 'IMAX', 'Comedy', 'Drama', 'Animation', 'Western', 'Horror', 'Children', 'Documentary', 'Film-Noir', 'Sci-Fi', 'Crime'}


In [115]:
# only execute if you want to initialise the tag_user_sum_count
# and tag_user_cunt_mat, user_rating_sum

tag_user_sum_mat = {}
for u in range(0, m+1):
    tag_weight = {}
    for tag in tag_set:
        tag_weight[tag] = 0
    tag_user_sum_mat[u] = tag_weight
    
tag_user_count_mat = {}
for u in range(0, m+1):
    tag_weight = {}
    for tag in tag_set:
        tag_weight[tag] = 0
    tag_user_count_mat[u] = tag_weight
    
counter = 100
for row_tr in filled_rating:
    uid = row_tr[0]
    mid = row_tr[1]
    val = row_tr[2]
    for t in tag_dict[mid]:
        tag_user_sum_mat[uid][t] = tag_user_sum_mat[uid][t] + val
        if counter > 0 :
            counter = counter - 1
            print(uid, t, tag_user_sum_mat[uid][t])
        tag_user_count_mat[uid][t] = tag_user_count_mat[uid][t] + 1

for uid in tag_user_sum_mat.keys():
    total = 0
    for t in tag_user_sum_mat[uid].keys():
        numerator = tag_user_sum_mat[uid][t]
        denominator = tag_user_count_mat[uid][t]
        total = total + special_divide(numerator, denominator)
    user_rating_sum[uid] = total
    
        
tag_user_sum_mat_pkl = open(path + 'tag_user_sum_mat.pckl', 'wb')
pickle.dump(tag_user_sum_mat, tag_user_sum_mat_pkl)
tag_user_sum_mat_pkl.close()

tag_user_count_mat_pkl = open(path + 'tag_user_count_mat.pckl', 'wb')
pickle.dump(tag_user_count_mat, tag_user_count_mat_pkl)
tag_user_count_mat_pkl.close()

user_rating_sum_pkl = open(path + 'user_rating_sum.pckl', 'wb')
pickle.dump(user_rating_sum, user_rating_sum_pkl)
user_rating_sum_pkl.close()

# import csv
# 
# with open('user_rating_sum.csv', 'w') as csv_file:
#     writer = csv.writer(csv_file)
#     for key, value in user_rating_sum.items():
#         print(key,value)
#         writer.writerow([key, value])

1 Adventure 3.5
1 Children 3.5
1 Fantasy 3.5
1 Adventure 7.0
1 Drama 3.5
1 Fantasy 7.0
1 Mystery 3.5
1 Sci-Fi 3.5
1 Mystery 7.0
1 Sci-Fi 7.0
1 Thriller 3.5
1 Mystery 10.5
1 Thriller 7.0
1 Crime 3.5
1 Mystery 14.0
1 Thriller 10.5
1 Action 3.5
1 Adventure 10.5
1 Comedy 3.5
1 Crime 7.0
1 Action 7.5
1 Drama 7.5
1 Romance 4.0
1 War 4.0
1 Comedy 7.5
1 Drama 11.5
1 Horror 4.0
1 Action 11.5
1 Adventure 14.5
1 Sci-Fi 11.0
1 Action 15.5
1 Crime 11.0
1 Drama 15.5
1 Thriller 14.5
1 Comedy 11.5
1 Crime 15.0
1 Drama 19.5
1 Thriller 18.5
1 Crime 19.0
1 Drama 23.5
1 Drama 27.0
1 Action 19.0
1 Comedy 15.0
1 Crime 22.5
1 Fantasy 10.5
1 Action 23.0
1 Sci-Fi 15.0
1 Thriller 22.5
1 Action 26.5
1 Sci-Fi 18.5
1 Crime 26.0
1 Horror 7.5
1 Thriller 26.0
1 Action 29.5
1 Adventure 17.5
1 Fantasy 13.5
1 Adventure 21.0
1 Children 7.0
1 Fantasy 17.0
1 Musical 3.5
1 Adventure 24.5
1 Drama 30.5
1 Sci-Fi 22.0
2 Comedy 4.0
2 Romance 4.0
2 Drama 5.0
2 Action 5.0
2 Comedy 9.0
2 Horror 5.0
2 Thriller 5.0
2 Action 9.0
2 Dra

In [124]:
predicted = []
actual = []
fact_list = []
counter = 500
for r in random_location:
    location1 = filled_rating[r]
    active_user = location1[0]
    pred_item = location1[1]
    actual_rating = location1[2]
    
    A[active_user][pred_item] = 0
    active_user_item_dot = np.dot(gt_row(A,active_user), gt_row(item_item_sim,pred_item))
    similarity_sum = np.sum(np.multiply(gt_row(A,active_user).astype(bool), gt_row(item_item_sim,pred_item)))
    if similarity_sum != 0:
        p = active_user_item_dot/similarity_sum
        factor = 0
        for t in tag_dict[mid]:
            numerator = tag_user_sum_mat[active_user][t]
            denominator = tag_user_count_mat[active_user][t]
            factor = max(factor, special_divide(numerator, denominator))
            # if counter > 0:
            #     print(active_user, t ,numerator, denominator)
            #     counter = counter-1
        # factor = factor/len(tag_dict[mid])
        fact_list.append(factor)
        p = 0.5*p+0.5*factor
        predicted.append(p)
        actual.append(actual_rating)
    A[active_user][pred_item] = actual_rating

# print(tag_user_sum_mat[2]['Thriller'])
print(metrics.mean_absolute_error(actual,predicted))
np.savetxt(path + "fact_list.csv", fact_list, delimiter=",")  # saving user item rating matrix

0.7552272094259934


In [29]:
# test cell

print("Testing special_divide :")
print(special_divide(5, 0))
print(special_divide(5, 3))
print("------------------------")

print("Testing bool_cast :")
v = np.array([2, 4, 3])
p = np.array([5, 1, 0])
print(bool_cast(v, p))
print("------------------------")

print("Testing cosine similarity :")
print(cosine_sim(v, p))
print("------------------------")

print("Testing magnitude :")
print(magnitude(v))
print("------------------------")

print("Testing modified cosine similarity :")
print(modified_cosine_sim(v,p))
print("------------------------")





Testing special_divide :
0
1.6666666666666667
------------------------
Testing bool_cast :
[2 4 0]
------------------------
Testing cosine similarity :
0.5098499285104608
------------------------
Testing magnitude :
5.385164807134504
------------------------
Testing modified cosine similarity :
14
[2 4 0]
4.47213595499958
5.0990195135927845
[5 1 0]
0.6139406135149205
------------------------
