In [1]:
%matplotlib inline
import pandas as pd
from recommender import BIKNN
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

import numpy as np
from scipy.stats import norm
import heapq
from operator import itemgetter
from itertools import combinations
%load_ext autoreload
%autoreload 2

# BIKNN on Small Dataset

In [2]:
# Small ratings table; the preview rendered below shows userID, placeID and rating columns.
final_ratings = pd.read_csv( filepath_or_buffer = 'data/final_rating.csv' )
final_ratings.head( n = 5 )

Unnamed: 0,userID,placeID,rating
0,1,32,2
1,1,33,1
2,1,76,2
3,1,82,1
4,1,86,1


In [3]:
# Split the small dataset: 20% of the rows go to X_test, the remainder to X_train.
# The split is stratified on userID and seeded so it is reproducible.
# NOTE(review): `train_test_split` is imported from `sklearn.cross_validation` in the
# first cell; that module was removed in scikit-learn 0.20 (it now lives in
# `sklearn.model_selection`) - confirm the installed version.
X_train, X_test = train_test_split(
    final_ratings,
    stratify = final_ratings['userID'].values,
    random_state = 1234, # seed for a reproducible train/test split
    test_size = 0.2
)
print(X_train.shape)
X_train.head()

(920, 3)


Unnamed: 0,userID,placeID,rating
901,109,79,2
72,8,41,1
728,90,107,2
327,39,16,1
131,15,49,1


In [4]:
# The full dataset: tab-separated files without a header row.
# Only the first three columns are kept, in the order user id, item id, rating.
column_names = [ 'user_ids', 'item_ids', 'ratings' ]

train = pd.read_csv( 'data/u1.base', sep = '\t', header = None ).iloc[ :, 0:3 ]
train.columns = column_names

test = pd.read_csv( 'data/u1.test', sep = '\t', header = None ).iloc[ :, 0:3 ]
test.columns = column_names

print(train.shape)
print(test.shape)
train.head()

(80000, 3)
(20000, 3)


Unnamed: 0,user_ids,item_ids,ratings
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [None]:
# Only retain users that rate frequently: a user is kept when their number of
# training ratings is strictly above the chosen quantile of all per-user counts.
# Change the quantile to change the share of users that is retained.
histogram_counts = train['user_ids'].value_counts()

# Mask the value_counts Series directly and read the user ids off its index.
# This avoids going through `reset_index()`, whose resulting column names
# ('index' / 'user_ids' vs. 'user_ids' / 'count') differ between pandas versions.
frequent_users = histogram_counts[ histogram_counts > histogram_counts.quantile(0.5) ].index

# NOTE: `train` and `test` are overwritten, so re-running this cell filters the
# already filtered data again (the quantile is recomputed on the smaller frame).
train = train[ train['user_ids'].isin(frequent_users) ]
test  = test[ test['user_ids'].isin(frequent_users) ]
print(train.shape)
print(test.shape)

In [6]:
# Parameters
K = 20   # number of most similar rated items used for a prediction
B1 = 25  # added to the rating count in the item-bias denominator
B2 = 25  # added to the rating count in the user-bias denominator

# X_train.columns = [ 'user_ids', 'item_ids', 'ratings' ]
# data = X_train.copy()

# Flat arrays of the training data; row k of each array describes the same rating.
data = train.copy()
user_ids = np.array(data['user_ids'])
item_ids = np.array(data['item_ids'])
ratings  = np.array(data['ratings'])

unique_item_ids = np.unique(item_ids)
unique_user_ids = np.unique(user_ids)

# item id -> row/column position in the item-by-item matrices below
item_ids_dict = { v: k for k, v in enumerate(unique_item_ids) }
size = len(item_ids_dict)

# Pairwise item statistics, filled in by a later cell. Everything starts at 1,
# so entries that are never overwritten (e.g. the diagonal) keep that value.
F   = np.ones( ( size, size ) )
G   = np.ones( ( size, size ) )
# `np.int` was only an alias of the builtin and no longer exists in recent NumPy
sup = np.ones( ( size, size ), dtype = int )

In [7]:
def _calculate_similarity( item1, item2 ):
    """
    Co-rating statistics for a pair of items, computed from the module-level
    `user_ids`, `item_ids` and `ratings` arrays.

    Parameters
    ----------
    item1, item2 : item ids as they appear in `item_ids`

    Returns
    -------
    support : number of distinct users that rated both items
    numerator : sum over those users of rating(item1) * rating(item2)
    denominator : sum of the squared item1 ratings plus the sum of the
        squared item2 ratings, over those users

    Returns ( 0, 0, 0 ) when no user rated both items.

    Notes
    -----
    Assumes a user rates a given item at most once - TODO confirm for the data
    in use; duplicated ( user, item ) rows would make the two vectors differ
    in length.

    Example
    -------
    item1 = 1
    item2 = 7
    support, numerator, denominator = _calculate_similarity( item1, item2 )
    """
    item1_boolean = item_ids == item1
    item2_boolean = item_ids == item2
    item1_users   = user_ids[item1_boolean]
    item2_users   = user_ids[item2_boolean]
    common_users  = list( set(item1_users).intersection(item2_users) )

    if not common_users:
        return 0, 0, 0

    match = np.in1d( user_ids, common_users )
    item1_rows = item1_boolean & match
    item2_rows = item2_boolean & match

    # Order both rating vectors by user id so that position k belongs to the
    # same user in each of them. Without this the dot product silently pairs
    # ratings of different users whenever the data is not sorted by user
    # (e.g. after a shuffled train/test split). The stable sort leaves data
    # that is already sorted by user untouched.
    item1_order = np.argsort( user_ids[item1_rows], kind = 'mergesort' )
    item2_order = np.argsort( user_ids[item2_rows], kind = 'mergesort' )
    item1_ratings = ratings[item1_rows][item1_order]
    item2_ratings = ratings[item2_rows][item2_order]

    support = len(common_users)
    numerator = item1_ratings.dot(item2_ratings)
    denominator = np.sum( item1_ratings ** 2 ) + np.sum( item2_ratings ** 2 )
    return support, numerator, denominator

In [8]:
# around 7 minutes
# Visit every unordered item pair once and mirror its statistics into the
# symmetric matrices: F <- numerator, G <- denominator, sup <- support.
supports = []
for item1, item2 in combinations( unique_item_ids, 2 ):
    i1 = item_ids_dict[item1]
    i2 = item_ids_dict[item2]
    support, numerator, denominator = _calculate_similarity( item1, item2 )

    F[i1, i2] = numerator
    F[i2, i1] = numerator
    G[i1, i2] = denominator
    G[i2, i1] = denominator
    sup[i1, i2] = support
    sup[i2, i1] = support
    supports.append(support)

# summary statistics of the support over all item pairs
supports = np.array(supports)
N = len(supports)
mean = supports.mean()
variance = supports.var()

In [80]:
# Weight every item pair by how its support compares with the supports of all
# pairs: the normal CDF (with the support mean / std) evaluated at the pair's support.
std = np.sqrt(variance)

# `norm.cdf` works element-wise on arrays, so the whole symmetric support
# matrix is evaluated in one call instead of one Python-level call per pair.
w = norm.cdf( sup, mean, std )
# the diagonal is not an item pair; keep its weight at 1
np.fill_diagonal( w, 1.0 )

sim_w = ( F / G ) * w
sim_w[ np.isnan(sim_w) ] = 0 # 0 / 0 for pairs without common users gives nan

In [10]:
# overall statistics of the known ratings
known_ratings_count = len(ratings)
global_avg = ratings.mean()

# Per-item and per-user rating totals and counts; the item / user biases
# are derived from these.
item_ratings_sum, item_ratings_count = {}, {}
user_ratings_sum, user_ratings_count = {}, {}

for item_id in unique_item_ids:
    item_ratings = ratings[ item_ids == item_id ]
    item_ratings_count[item_id] = len(item_ratings)
    item_ratings_sum[item_id] = item_ratings.sum()

for user_id in unique_user_ids:
    user_ratings = ratings[ user_ids == user_id ]
    user_ratings_count[user_id] = len(user_ratings)
    user_ratings_sum[user_id] = user_ratings.sum()

In [11]:
def _calculate_item_bias( item_id ):
    """
    Bias of one item: the summed deviation of its ratings from the global
    average, divided by ( B1 + number of ratings of the item ).

    Reads the module-level `item_ratings_sum`, `item_ratings_count`,
    `global_avg` and `B1`.
    """
    rating_count = item_ratings_count[item_id]
    deviation_total = item_ratings_sum[item_id] - global_avg * rating_count
    return deviation_total / ( B1 + rating_count )

In [12]:
def _calculate_user_bias( user_id, user_rated_item_id ):
    """
    Bias of one user: the summed deviation of the user's ratings from the
    global average, minus the biases of the items in `user_rated_item_id`,
    divided by ( B2 + number of ratings of the user ).

    Reads the module-level `user_ratings_sum`, `user_ratings_count`,
    `global_avg` and `B2`.
    """
    # total bias of the items passed in for this user
    item_bias_total = sum( _calculate_item_bias(rated_item_id)
                           for rated_item_id in user_rated_item_id )

    rating_count = user_ratings_count[user_id]
    residual_total = ( user_ratings_sum[user_id] -
                       global_avg * rating_count -
                       item_bias_total )
    return residual_total / ( B2 + rating_count )

In [129]:
def _predict_rating( user_id, item_id, user, user_item_ids  ):
    """
    Predict the rating of `item_id` by `user_id`: a baseline (global average +
    item bias + user bias) corrected by the similarity-weighted baseline
    residuals of the K most similar items the user has rated.

    Parameters
    ----------
    user_id : id of the user the prediction is made for
    item_id : id of the item to predict
    user : boolean mask over the module-level `ratings` / `item_ids` arrays
        that selects the rows of `user_id`
    user_item_ids : ids of the items rated by `user_id`

    Returns
    -------
    The predicted rating; just the baseline when there is no rated item with
    a non-zero similarity to `item_id`.

    Example
    -------
    user = user_ids == 1
    _predict_rating( user_id = 1, item_id = 1, user = user, user_item_ids = item_ids[user] )
    """
    item_bias = _calculate_item_bias(item_id)
    # the parameter is `user_item_ids`; the former body referred to an undefined name
    user_bias = _calculate_user_bias( user_id, user_item_ids )
    baseline  = global_avg + item_bias + user_bias

    # similarity of the target item to every other item rated by the user
    similars = []
    other_item_ids = set(user_item_ids).difference( set([item_id]) )
    for other_item_id in other_item_ids:
        similarity = sim_w[ item_ids_dict[item_id] ][ item_ids_dict[other_item_id] ]
        similars.append( ( other_item_id, similarity ) )

    knearest = heapq.nlargest( K, similars, key = itemgetter(1) )

    numerator = 0.0
    denominator = 0.0
    for nearest_id, sim in knearest:
        nearest_rating = ratings[ ( item_ids == nearest_id) & user ][0]
        nearest_item_bias = _calculate_item_bias(nearest_id)
        numerator += sim * ( nearest_rating - global_avg - user_bias - nearest_item_bias )
        denominator += sim

    # numpy floats do not raise ZeroDivisionError (0 / 0 quietly becomes nan),
    # so test for the empty / all-zero-similarity case explicitly
    if denominator == 0:
        return baseline
    return baseline + ( numerator / denominator )

In [125]:
# Make sure the test frame uses the expected column names. The attribute is
# `columns`; assigning to `test.column` only created a stray attribute and
# left the column labels as they were.
test.columns = [ 'user_ids', 'item_ids', 'ratings' ]
# first 30 test ratings as an array of ( user id, item id, rating ) rows
new_data = test.iloc[:30].values

In [192]:
# Only referenced by the commented-out `% iterations` check in the prediction
# loop; presumably the number of processed ratings between full refreshes of
# the support weights and similarities - TODO confirm.
iterations = 50000

In [202]:
# Predict the test ratings one at a time and, after each prediction, fold the
# true rating into the model statistics.
# NOTE: `update()` has to be defined before this cell is run, and it reads the
# loop variables `item_id1` and `rating1` as globals - do not rename them.
prediction = []
for idx1, ( user_id, item_id1, rating1 ) in enumerate(new_data):

    # obtain all the other items and ratings associated with the user
    user = user_ids == user_id
    user_item_ids = item_ids[user]
    user_ratings = ratings[user]

    predicted = _predict_rating( user_id, item_id1, user, user_item_ids )
    prediction.append(predicted)

    # update the pairwise statistics according to all the
    # other items and ratings associated with the user
    update( user_item_ids, user_ratings )

    # if idx1 % iterations == 0:
    #    _update_support_weight_and_similarity()

    # Fold the newly observed rating into the global average and the running
    # sums / counts. This must use the current row (`rating1`, `item_id1`),
    # not the `rating` / `item_id` names used before, which are either
    # undefined or left over from an earlier cell.
    global_avg = ( global_avg * known_ratings_count + rating1 ) / ( known_ratings_count + 1 )
    known_ratings_count += 1

    user_ratings_sum[user_id] += rating1
    item_ratings_sum[item_id1] += rating1
    user_ratings_count[user_id] += 1
    item_ratings_count[item_id1] += 1

In [201]:
def update( user_item_ids, user_ratings, item_id = None, rating = None ):
    """
    Fold one newly observed rating into the pairwise item statistics.

    For every item the user has already rated, the module-level `F`, `G` and
    `sup` entries of the ( new item, rated item ) pair are updated in place
    (symmetrically), and the running `mean` / `variance` of the supports are
    adjusted for the support growing by one.

    Parameters
    ----------
    user_item_ids : ids of the items the user has already rated
    user_ratings : the user's ratings, aligned with `user_item_ids`
    item_id : id of the newly rated item; defaults to the module-level
        `item_id1` (the prediction loop's current item) when omitted
    rating : the new rating; defaults to the module-level `rating1` when omitted
    """
    global mean
    global variance

    # fall back to the loop variables of the calling cell for callers that
    # only pass the user's history
    if item_id is None:
        item_id = item_id1
    if rating is None:
        rating = rating1

    # the new item's matrix index does not change inside the loop
    i1 = item_ids_dict[item_id]
    # float() keeps the increments below a true division even where `/`
    # between integers would truncate (e.g. Python 2)
    pair_count = float(N)

    for item_id2, rating2 in zip( user_item_ids, user_ratings ):
        i2 = item_ids_dict[item_id2]

        F_new = F[i1][i2] + ( rating * rating2 )
        G_new = G[i1][i2] + ( rating ** 2 + rating2 ** 2 )
        F[i1][i2] = F[i2][i1] = F_new
        G[i1][i2] = G[i2][i1] = G_new

        # compute and update the new support's mean and variance:
        # var = E[s^2] - mean^2, and only one support changes by `sup_delta`
        sup_old = sup[i1][i2]
        sup_delta = 1
        sup_new = sup_old + sup_delta
        mean_new = mean + sup_delta / pair_count
        variance_new = ( variance +
                         ( 2 * sup_delta * sup_old + sup_delta ** 2 ) / pair_count +
                         mean ** 2 - mean_new ** 2 )

        mean = mean_new
        variance = variance_new
        sup[i1][i2] = sup[i2][i1] = sup_new

In [174]:
# this runs a single BIKNN: fit on the training frame, then predict the test frame
# (the output recorded below this cell is a KeyboardInterrupt, i.e. the run was stopped)
biknn1 = BIKNN(
    K = 20,
    B1 = 25,
    B2 = 25,
    iterations = 100000
)
biknn1.fit(
    data = train,
    column_names = [ 'user_ids', 'item_ids', 'ratings' ]
)
pred = biknn1.predict(test)

KeyboardInterrupt: 

In [46]:
# Disabled scratch code: it is wrapped in a string literal so none of it runs;
# the only statement with an effect in this cell is the print below.
"""
data = pd.read_csv( 'test_data.txt', sep = ',', header = None )
data.head(4)
dict1 = { 
    'user_id': [6, 5],
    'item_id': [1, 2],
    'ratings': [5, 2]
}
test1 = pd.DataFrame( dict1, columns = [ 'user_id', 'item_id', 'ratings' ] )
test1
"""
print('ignore this code chunk')

ignore this code chunk
