In [None]:
import sys
import numpy as np
import math
import pandas as pd
import random

from scipy import stats

%matplotlib inline

import matplotlib.pyplot as plt 
import matplotlib
# Global figure styling: bold 16pt serif text, with bold axis labels as well.
matplotlib.rc('font', family='serif', weight='bold', size=16)
matplotlib.rcParams['axes.labelweight'] = 'bold'

from recommender_functions import *
from sklearn.metrics import recall_score, precision_score, f1_score, roc_curve,roc_auc_score
from sklearn import model_selection

import random

In [None]:
# Labels interpolated into the file names of the figures saved further down.
week, day = 'Week4', 'Mon'

In [None]:
# Load the ratings table (per the original note, user ids start at 1).
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is needed.
df_ratings = pd.read_csv('goodbooks-10k/ratings.csv')

# Annotate every rating with the number of ratings its user / its book has.
# cut_b is not used in the cells shown here; it is kept in case later cells read it.
cut_u, cut_b = 175, 1 #175, 1
df_ratings['user_counts'] = df_ratings.groupby(['user_id'])['book_id'].transform('count')
df_ratings['book_counts'] = df_ratings.groupby(['book_id'])['user_id'].transform('count')

# Keep only users with more than cut_u ratings.
# .copy() makes the subset an independent frame: without it, the column
# assignments below raise pandas' SettingWithCopyWarning.
df_ratings_cut = df_ratings.query('user_counts > %d '%(cut_u)).copy()

# Dense 0-based codes for users and books, used as matrix row/column positions later
df_ratings_cut['user_idx'] = pd.Categorical(df_ratings_cut['user_id']).codes
df_ratings_cut['book_idx'] = pd.Categorical(df_ratings_cut['book_id']).codes

df_ratings_cut.to_csv('ratings_cut.csv', sep=',')

# ...but also do a groupby, so can plot easily
df_ratings_count_u = df_ratings.groupby(['user_id']).size().reset_index(name='Counts')

# Book metadata and the tag table
df_books = pd.read_csv('goodbooks-10k/books_with_genres.csv')

df_tags = pd.read_csv('goodbooks-10k/ahack_tags_3.csv')
df_tags['tag_index'] = pd.Categorical(df_tags['tag_id']).codes

# Preview the filtered ratings. Only the last expression of a cell is displayed,
# so this sits at the end (mid-cell it was a silent no-op).
df_ratings_cut.head()


In [None]:
# Matrix dimensions reused by later cells: distinct books / users left after the cut
N_BOOKS = df_ratings_cut['book_id'].unique().size
N_USERS = df_ratings_cut['user_id'].unique().size

print(N_BOOKS,N_USERS)

## Build the Ratings Matrix

In [None]:
# All three matrices are laid out as (N_USERS, N_BOOKS): one row per user, one column per book
ratings_genre_mat = np.zeros((N_USERS,N_BOOKS))   # tag_id of each rated book, 0 elsewhere
ratings_mat = np.zeros((N_USERS,N_BOOKS))         # rating value, 0 elsewhere
binary_mat = np.zeros((N_USERS,N_BOOKS))          # 1 where a rating exists, 0 elsewhere

# Y and R are aliases of the same array objects, not copies
Y, R = ratings_mat, binary_mat

genre_list_dict = []

# Matrix position of every rating row, extracted once instead of once per loop iteration
user_rows = df_ratings_cut.user_idx.values
book_cols = df_ratings_cut.book_idx.values

# Fill ratings + binary matrix in one vectorised assignment.
# If a (user, book) pair occurs more than once the last row wins, as in a sequential loop.
ratings_mat[user_rows, book_cols] = df_ratings_cut.rating.values
binary_mat[user_rows, book_cols] = 1

# Now fill the genre tag matrix.
# Build a book_id -> tag_id lookup once rather than scanning df_books for every rating.
# drop_duplicates keeps the first row per book_id, i.e. the row `.values[0]` would pick.
book_to_tag = df_books.drop_duplicates('book_id').set_index('book_id')['tag_id']
rating_tag_ids = df_ratings_cut.book_id.map(book_to_tag)

# Fail loudly if a rated book has no usable tag instead of writing NaN into the matrix
if rating_tag_ids.isnull().any():
    missing_ids = df_ratings_cut.book_id[rating_tag_ids.isnull()].unique()
    raise KeyError('no tag_id in df_books for book_id(s): %s' % (list(missing_ids[:10]),))

ratings_genre_mat[user_rows, book_cols] = rating_tag_ids.values.astype(int)

In [None]:
# Mean rating per user (row) and per book (column), counting only non-zero cells
user_ave_v = ratings_mat.sum(1)/(ratings_mat!=0).sum(1).astype(float)
book_ave_v = ratings_mat.T.sum(1)/(ratings_mat.T!=0).sum(1).astype(float)

# ave_mat[i, j] = (user i's average + book j's average) / 2.
# Broadcasting a column vector against a row vector builds the whole grid at once,
# replacing a Python loop over every (user, book) pair.
ave_mat = (user_ave_v[:, np.newaxis] + book_ave_v[np.newaxis, :]) / 2

# user_ave_mat[i, j] = user i's average, repeated across the row
user_ave_mat = np.repeat(user_ave_v[:, np.newaxis], N_BOOKS, axis=1)

In [None]:
# Snapshot the ratings as they are now, then overwrite every zero cell of
# ratings_mat in place with the matching ave_mat value.
orig_ratings_mat = np.array(ratings_mat, copy=True)
temp = np.equal(ratings_mat, 0)
np.copyto(ratings_mat, ave_mat, where=temp)

In [None]:
# Rank, per user, the genre tags of the books they rated by how often each occurs.
# NOTE: despite the "top_3" names, four tag ids are stored per user; only the first
# three are resolved to names. The names are kept so later cells keep working.
top_3_genres_per_user = np.zeros((N_USERS,4))
top_3_genres_names_per_user = []
# NOTE: despite its name, this list holds the tag_id (not the tag_name) of each user's top genre
top_genre_names_per_user = []

for rowi in range(ratings_genre_mat.shape[0]):

    user_tags = ratings_genre_mat[rowi]

    # Count real tags only. 0 marks a cell with no tag; dropping it explicitly
    # (rather than skipping the first sorted entry) does not rely on 0 being the
    # most frequent value in the row.
    unique, counts = np.unique(user_tags[user_tags != 0], return_counts=True)

    n_slots = top_3_genres_per_user.shape[1]
    if unique.size < n_slots:
        raise ValueError('user row %d has only %d distinct genre tags; need %d'
                         % (rowi, unique.size, n_slots))

    # Most frequent first; a stable sort keeps ties in a reproducible order
    pred_idxs_sorted = np.argsort(counts, kind='stable')[::-1]
    top_3_genres_per_user[rowi] = unique[pred_idxs_sorted[:n_slots]]

    # Look up the df_tags rows of the three most frequent tags
    picks = [df_tags.query('tag_id == %d'%int(tag)) for tag in top_3_genres_per_user[rowi][:3]]

    top_genre_names_per_user.append(picks[0]['tag_id'].values[0])

    user_pref_v = [pick['tag_name'].values[0] for pick in picks]

    top_3_genres_names_per_user.append(user_pref_v)

In [None]:
# Per-user mask used to blind each user's top genre.
# Same shape as the ratings matrix: 1 where a genre tag is recorded for the cell AND
# that tag differs from the user's top tag, 0 everywhere else.
# One vectorised comparison replaces a Python loop over every (user, book) cell.
top_tag_per_user = np.asarray(top_genre_names_per_user)[:, np.newaxis]   # column vector, one tag per row
has_tag = ratings_genre_mat != 0
binary_genre_mat = (has_tag & (ratings_genre_mat != top_tag_per_user)).astype(binary_mat.dtype)

In [None]:
# Split the ratings into train and test parts (the original note targets roughly 80:20)
# and compute user-user similarity from the training part.
# train_test_split / fast_similarity presumably come from the recommender_functions
# star import -- TODO confirm, and check whether the split needs a random seed.
train_set, test_set = train_test_split(ratings_mat,split=1400)
user_similarity = fast_similarity(train_set,kind='user')

# Size of each part, measured as its number of non-zero entries
n_test = len(np.nonzero(test_set)[0])
n_train = len(np.nonzero(train_set)[0])
n_total = n_train + n_test

# Share of each part, in percent
print('Test :' ,float(n_test)/n_total*100)
print('Train:' ,float(n_train)/n_total*100)

print(n_test,n_train)

In [None]:
# Predict from the k = 30 most similar users only (per the original note, k = 30 gave the minimum MSE)
user_prediction_topk = predict_topk(train_set, user_similarity, kind='user', k=30)
topk_mse = get_mse(user_prediction_topk, test_set)
print( 'Top-k User-based CF MSE: ' + str(topk_mse))

In [None]:
# Let's also find the item similarity
# This does not perform as well
#item_similarity = fast_similarity(train_set,kind='item')
#item_prediction_topk = predict_topk(train_set,item_similarity,kind='item',k=15)

#item_y_pred_topk = item_prediction_topk[nonzero_test]
#item_y_pred_scaled_topk = (item_y_pred_topk - 1.) / 4

## And now we validate

In [None]:
# Blind the test set with the genre mask: cells where binary_genre_mat is 0 become 0
blind_test_set = test_set * binary_genre_mat
nonzero_test = blind_test_set > 0

# Pull the surviving cells out of each matrix as flat vectors (same mask, same order)
user_ave_thresholds = user_ave_mat[nonzero_test]
blind_y_pred_topk = user_prediction_topk[nonzero_test]
blind_y_true = blind_test_set[nonzero_test]

# Shift/scale predictions by (x - 1) / 4 -- maps a 1..5 rating range onto 0..1
blind_y_pred_scaled_topk = (blind_y_pred_topk - 1.) / 4

# Binarize truth and predictions against each entry's user-average threshold.
# Copies are passed in case binarize modifies its argument in place -- TODO confirm.
blind_y_true_binarized = binarize(blind_y_true.copy(), user_ave_thresholds)
blind_y_pred_binarized_topk = binarize(blind_y_pred_topk.copy(), user_ave_thresholds)


In [None]:
# Row (user) index of every cell selected by nonzero_test.
# np.nonzero returns indices in row-major order, the same order boolean-mask indexing
# uses, so this lines up with the vectors extracted via `[nonzero_test]` without first
# materialising a full matrix of row indices.
user_ids = np.nonzero(nonzero_test)[0]

In [None]:
def precision_at_k(true, pred, pred_binarized, user_ids, k, tol=None):
    """Mean per-user precision over each user's k highest-scored entries.

    Parameters
    ----------
    true : 1-D array
        Binarized ground-truth labels, one per entry.
    pred : 1-D array
        Raw prediction scores; used only to rank each user's entries.
    pred_binarized : 1-D array
        Binarized predictions, scored against ``true``.
    user_ids : 1-D array
        Identifier of the user each entry belongs to (aligned with the other arrays).
    k : int
        Number of top-ranked entries per user to score.
    tol : optional
        Unused; accepted only so existing calls that pass it keep working.

    Returns
    -------
    float
        Mean of the strictly positive per-user precisions, or 0.0 when no user
        has a positive precision (including when there are no entries at all).
    """
    unique_users = np.unique(user_ids)
    precisions = np.zeros(unique_users.size)

    for i, user in enumerate(unique_users):
        user_ind = user_ids == user
        user_true = true[user_ind]
        user_pred = pred[user_ind]
        user_pred_binarized = pred_binarized[user_ind]
        # Negate so argsort puts the highest scores first, then keep the top k
        ranked_ind = np.argsort(-user_pred)[:k]
        precisions[i] = precision_score(user_true[ranked_ind], user_pred_binarized[ranked_ind])

    # NOTE(review): users with precision 0 are left out of the average, which pushes
    # the reported value upward -- confirm this is intended before comparing numbers.
    positive = precisions[precisions > 0]
    if positive.size == 0:
        # np.mean of an empty selection would be NaN plus a RuntimeWarning
        return 0.0
    return np.mean(positive)

In [None]:
# Report precision for three list lengths
t = user_ave_thresholds
for k in (4, 8, 16):
    print( 'precision @%d : ' % k, precision_at_k(blind_y_true_binarized, blind_y_pred_topk, blind_y_pred_binarized_topk, user_ids, k, tol=t))

In [None]:
# Precision for every list length k = 1, 2, ..., 29
t = user_ave_thresholds
precision_v = [
    precision_at_k(blind_y_true_binarized, blind_y_pred_topk, blind_y_pred_binarized_topk, user_ids, k, tol=t)
    for k in range(1, 30)
]

In [None]:
plt.figure(figsize=(6,4))
# One x value per entry of precision_v: entry j was computed for k = j + 1,
# so the axis must run 1, 2, ..., len(precision_v) (linspace(1,31,29) did not).
x = np.arange(1, len(precision_v) + 1)
plt.plot(x,precision_v,'-o',color='green')
plt.xlabel('# of Recommendations',fontsize=18)
plt.ylabel('Precision',fontsize=18)
plt.grid(True,ls='--')
# Apply the axis limits before saving so the file on disk matches the displayed figure
plt.xlim(0,30)
plt.savefig('Plots/%s_%s_precision.png'%(week,day), bbox_inches='tight')

In [None]:
# The ROC curve and its AUC are computed from the scaled (continuous) predictions
auc_score = roc_auc_score(blind_y_true_binarized, blind_y_pred_scaled_topk)
fpr_b, tpr_b, thresholds = roc_curve(blind_y_true_binarized, blind_y_pred_scaled_topk)

# Recall, precision, and f1 are defined for binarized predictions
binarized_metrics = (
    ('Recall: %0.3f', recall_score),
    ('Precision: %0.3f', precision_score),
    ('F1 score: %0.3f', f1_score),
)
for template, metric in binarized_metrics:
    print( template % metric(blind_y_true_binarized, blind_y_pred_binarized_topk))
print( 'ROC-AUC: %0.3f' % auc_score)

In [None]:
# Quick look at the two vectors: binarized truth first, then the scaled predictions
for vec in (blind_y_true_binarized, blind_y_pred_scaled_topk):
    print(vec)

In [None]:
# Plot the ROC curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(fpr_b, tpr_b, lw=2,color='green',label='User-Similarity') 
plt.plot([0, 1], [0, 1], 'k--',lw=2,label='Random')

plt.xlabel('False Positive Rate',fontsize=20)
plt.ylabel('True Positive Rate',fontsize=20)
plt.legend(fontsize=16)
# The savefig keyword is bbox_inches; the earlier spelling "bbox_to_inches" is not a
# savefig option, so the tight bounding box was never applied.
plt.savefig('Plots/%s_%s_roc.png'%(week,day),bbox_inches='tight')