In [1]:
# import subset datasets

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
mydata = pd.read_csv('subset.csv') # import sub dataset with 10000 unique uesers and 100 hotel_clusters
Total = mydata['is_booking'].sum() # print Total

In [3]:
# Build user-item matrix
#-1 means the hotel_cluster is never clicked
# 0 means the hotel_cluster is clikced(interested) but not booked
# 1 means the hotel_cluster is booked

In [4]:

m = np.zeros((10000,100))
m[m==0]=-1

userid = mydata['user_id'].unique()
hotelcluster = mydata['hotel_cluster'].unique()
for index, row in mydata.iterrows():
    rowNum = np.where(userid==row['user_id'])
    colNum = np.where(hotelcluster==row['hotel_cluster'])
    if m[rowNum,colNum]==-1:
        m[rowNum,colNum]=row['is_booking']+1
    else:
        m[rowNum,colNum] = m[rowNum,colNum]+row['is_booking']
print (m.max())

37.0


In [5]:
# Split into Train/Test Matrix according to Train/Test split ratio (hyperparameter)

In [6]:
split_ratio = 0.3
pMatrix = np.random.rand(10000,100) # Predict Matrix
TrainMatrix = np.zeros([10000,100]) # Train Matrix
#TrainMatrix[:] = np.NAN
TestMatrix = np.zeros([10000,100]) # Test Matrix
#TestMatrix[:] = np.NAN
test_count = 0
for index, x in np.ndenumerate(pMatrix):
    if x>=split_ratio:
        TrainMatrix[index] = m[index]
    else:
        test_count +=1
        TestMatrix[index] = m[index]
print (TrainMatrix.max())


37.0


In [7]:
# Make a copy, TMatrix of Train Matrix and convert it to binary matrix
# Use TMatrix to calculate similarity
# Use TrainMatrix to identify entries need to be predicted

In [8]:
TMatrix = TrainMatrix # a copy of Train Matrix
TMatrix[TMatrix<0]=0
TMatrix[TMatrix>1]=1

# Compute the Similarity Matrix
user_similarity = cosine_similarity(TMatrix)
print (user_similarity.min())
print (test_count)

0.0
100071


In [9]:
# Accessory Function Predict

In [10]:
def predict(rowNum, colNum, k):
    topUser = np.argsort(user_similarity[rowNum])[-k:-1]
    #print topUser
    topSimilarity = user_similarity[rowNum][topUser]
    if topSimilarity.max()==0:
        prediction = 0.0
    else:
    #print colNum
    #print TrainMatrix[topUser][:,colNum].T.max()
        prediction = (TMatrix[topUser][:,colNum].T.dot(topSimilarity))/sum(topSimilarity)
        prediction = prediction.round()
    return prediction

In [11]:
# Prediction using k most similar users' rate on the hotel cluster

In [12]:
k=21
rMatrix = np.zeros([10000,100])
for index, item in np.ndenumerate(TrainMatrix):
    if item==0:
        #colInd = np.where(row==item)
        #rowInd = np.where(TrainMatrix == row)
        #print index
        rMatrix[index] = predict(index[0],index[1],k)
print (rMatrix)
print (rMatrix.max())

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
1.0


In [13]:
# Convert the test matrix to a binary matrix

In [14]:
TestMatrix[TestMatrix>=2] = 1
count = np.count_nonzero(TestMatrix)
print (count)

100071


In [15]:
TrueItems = np.count_nonzero(TestMatrix==1)
print (TrueItems)

TruePositive = 0
row, col = TestMatrix.shape
for i in range(row):
    for j in range(col):
        if TestMatrix[i,j] == rMatrix[i,j] and TestMatrix[i, j] == 1:
            TruePositive += 1
recall = TruePositive / (1.0*TrueItems)
print ('Recall is {:.3}'.format(recall))

12241
Recall is 0.21


In [16]:
Positive = np.count_nonzero(rMatrix==1)
Precision = TruePositive/(1.0*Positive)
print ('Precision is {:.3}'.format(Precision))

Precision is 0.156


In [17]:
TestMatrix[TestMatrix<0] = 0

print np.count_nonzero(rMatrix)
print np.count_nonzero(TestMatrix)
print np.count_nonzero(rMatrix - TestMatrix)

16463
12241
23564


In [18]:
# Print accuracy
# Print base_line accuracy

In [19]:
accuracy = 1-(np.count_nonzero(rMatrix - TestMatrix)/(1.0*count))
print ('User-based collaborative filtering accuracy is {:.3}'.format(accuracy))

# baseline, assume all predicted entries to be 0
base_TestMatrix = np.count_nonzero(TestMatrix)
baseline_accuracy = (test_count - base_TestMatrix) / (1.0 * test_count)
print 'Baseline accuracy is {:.3}'.format(baseline_accuracy)

User-based collaborative filtering accuracy is 0.765
Baseline accuracy is 0.878


In [20]:
#Print out the 10 most popular items in the original dataset
m[m<0] = 0
#m[m>1] = 1
popularity = np.sum(m, axis=0)
print np.argsort(popularity)[-10:,]
#print m.max()

[80 43 58 77 37 62 20 70  1 49]


In [21]:
#Combine the predicted entries with the TMatrix to get a completed prediction matrix
result = TMatrix + rMatrix
#print result.max()

#Print out the 10 most popular items in the resulted prediction matrix
Rpopularity = np.sum(result, axis=0)
print np.argsort(Rpopularity)[-10:,]

#Coverage of the recommendation based on predicted results
recommendation = np.zeros([10000,5])
for rowNum in range(result.shape[0]):
    #print sum(result[rowNum])
    recommend = np.argsort(result[rowNum])[-5:,]
    #print recomend
    recommendation[rowNum]=recommend
#Print out the labels of hotels that appeared at least once in the recommendations for each user
print "Labels of Hotel Clusters recommended based on prediction"
print np.unique(recommendation)

#Coverage of the recomendation based on original dataset
Mrecommendation = np.zeros([10000,5])
for rowN in range(m.shape[0]):
    #print sum(m[rowN])
    Mrecommend = np.argsort(m[rowN])[-5:,]
    #print recomend
    Mrecommendation[rowN]=Mrecommend
#Print out the labels of hotels that appeared at least once in the recommendations for each user
print "Labels of Hotel Clusters recommended based on original dataset"
print np.unique(Mrecommendation)

    

[77 58 60 20 43 80 62 70  1 49]
Labels of Hotel Clusters recommended based on prediction
[  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.
  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.  42.  43.  44.
  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.  56.  57.  58.  59.
  60.  61.  62.  63.  64.  65.  66.  67.  68.  69.  70.  71.  72.  73.  74.
  75.  76.  77.  78.  79.  80.  81.  82.  83.  84.  85.  86.  87.  88.  89.
  90.  91.  92.  93.  94.  95.  96.  97.  98.  99.]
Labels of Hotel Clusters recommended based on original dataset
[  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.
  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.  42.  43.  44.
  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.  56.  57.  58.  59.
  60.  61.  62.  63.  64.  65.  66. 

In [22]:
#Count the number of same recommendations given by our result compared to the original matrix
count = 0
for rowNum in range(result.shape[0]):
    recommend = np.argsort(result[rowNum])[-5:,]
    Mrecomend = np.argsort(m[rowN])[-5:,]
    if recommend.all() == Mrecomend.all():
        count+=1
print count

9234
