In [23]:
from datasketch import MinHash, MinHashLSH
from math import sqrt
import numpy as np
import pandas as pd
import copy
import timeit
from sklearn.metrics.pairwise import cosine_similarity
mydata = pd.read_csv('subset_numeric.csv') # import sub dataset with 10000 unique uesers and 100 hotel_clusters
Total = mydata['is_booking'].sum() # print Total

In [24]:
matrix = np.zeros((10000,100))
matrix[matrix==0]=-1

userid = mydata['user_id'].unique()
hotelcluster = mydata['hotel_cluster'].unique()
for index, row in mydata.iterrows():
    rowNum = np.where(userid==row['user_id'])
    colNum = np.where(hotelcluster==row['hotel_cluster'])
    if matrix[rowNum,colNum]==-1:
        matrix[rowNum,colNum]=row['is_booking']+1
    else:
        matrix[rowNum,colNum] = matrix[rowNum,colNum]+row['is_booking']
print (matrix.max())

37.0


In [25]:
split_ratio = 0.3
pMatrix = np.random.rand(10000,100) # Predict Matrix
TrainMatrix = np.zeros([10000,100]) # Train Matrix
#TrainMatrix[:] = np.NAN
TestMatrix = np.zeros([10000,100]) # Test Matrix
#TestMatrix[:] = np.NAN
test_count = 0
for index, x in np.ndenumerate(pMatrix):
    if x>=split_ratio:
        TrainMatrix[index] = matrix[index]
    else:
        test_count +=1
        TestMatrix[index] = matrix[index]
print (TrainMatrix.max())

37.0


In [26]:
TMatrix =copy.deepcopy(TrainMatrix)  # a copy of Train Matrix
TMatrix[TMatrix<0]=0

TestM= copy.deepcopy(TestMatrix) 
TestM[TestM<0]=0


user_similarity = cosine_similarity(TMatrix)
print (user_similarity.min())
print (test_count)

0.0
300734


In [27]:
def predict(current,index, k,user_similarity,LSHresult):
    if len(LSHresult)==0:
        topUser = np.argsort(user_similarity[current])[-k:-1]
    if len(LSHresult)>k:
        topUser = np.argsort(user_similarity[current][LSHresult])[-k:-1]
    else:
        topUser = np.argsort(user_similarity[current][LSHresult])
    topSimilarity = user_similarity[current][topUser]
    if topSimilarity.max()==0:
        prediction = 0.0
    else:
        prediction = (TMatrix[topUser][:,index].T.dot(topSimilarity))/sum(topSimilarity)
    return prediction

In [28]:
def prediction(LSHresult,current):
    current = int(current)
    LSHresult = [ int(x) for x in LSHresult ]
    k=21
    rMatrix = np.zeros(100)
    for index in range (len(TrainMatrix[current])):
        if TrainMatrix[current][index]==0:
            rMatrix[index] = predict(current,index,k, user_similarity,LSHresult)
    return rMatrix

In [54]:
def model(thres,permutations):
    lsh = MinHashLSH(params = thres, num_perm=permutations)
    m = []
    for i in range (len(TMatrix)):
        tempm = MinHash(num_perm=permutations)
        m.append(tempm)
        for j in TMatrix[i]:
            tempm.update(j)
        name = str(i)
        lsh.insert(name, tempm)

    result1 = lsh.query(m[4])
    print len(result1)
    
    resultM = []
    for i in range (len(m)):
        res = lsh.query(m[i])
        pred = prediction(res,i)
        resultM.append(pred)
    return resultM

In [40]:
def calculateRMSE(resultM):
    error = resultM - TestM
    #print error
    sqError = error**2

    sumSqerror =0
    for i in sqError:
        for j in i:
            sumSqerror+=j
    rmse = sqrt(sumSqerror/test_count)
    return rmse

In [41]:
def recommendation(modelResult):
    recommendOutput = []
    result = modelResult+TMatrix
    
    #Popular items in the resulted matrix
    Rpopularity = np.sum(result, axis=0)
    #print np.argsort(Rpopularity)[-10:,]

    #Popular items in the original matrix
    ORpopularity = np.sum(matrix, axis=0)
    #print np.argsort(Rpopularity)[-10:,]
    
    recommendOutput.append((np.argsort(Rpopularity)[-10:,], np.argsort(ORpopularity)[-10:,]))

    #Coverage of the recommendation based on predicted results
    recommendation = np.zeros([10000,5])
    for rowNum in range(result.shape[0]):
        recommend = np.argsort(result[rowNum])[-5:,]
        recommendation[rowNum]=recommend
    #print np.unique(recommendation)
    #print len(np.unique(recommendation))

    #Coverage of the recomendation based on original dataset
    Mrecommendation = np.zeros([10000,5])
    for rowN in range(matrix.shape[0]):
        Mrecommend = np.argsort(matrix[rowN])[-5:,]
        Mrecommendation[rowN]=Mrecommend
    #print np.unique(Mrecommendation)
    #print len(np.unique(Mrecommendation))
    
    recommendOutput.append((len(np.unique(recommendation)), len(np.unique(Mrecommendation))))

    #Count the number of same recommendations given by our result compared to the original matrix
    count = 0
    countall = 0
    for user in range(result.shape[0]):
        recommend = set(np.argsort(result[user])[-5:,])
        Mrecomend = set(np.argsort(matrix[user])[-5:,])
        if (recommend.intersection(Mrecomend) != set([])):
            count+=1
        if (recommend.intersection(Mrecomend) != Mrecomend):
            countall+=1
    #print count
    
    recommendOutput.append(count)
    recommendOutput.append(countall)
    
    return recommendOutput

In [50]:
thresholds = [0.3,0.5,0.6,0.7,0.75,0.8,0.85,0.9]
permutations = [100,150,200,250]
MRMSE = []
RecommendResult = []
Runtime = []
for permutation in permutations:
    start_time = timeit.default_timer()
    modelResult = model(0.8,permutation)
    elapsed = timeit.default_timer()-start_time
    Runtime.append(elapsed)
    modelRMSE = calculateRMSE(modelResult)
    MRMSE.append(modelRMSE)
    print modelRMSE
    modelRecommend = recommendation(modelResult)
    RecommendResult.append(modelRecommend)
    print modelRecommend
    
print Runtime
print MRMSE
print RecommendResult

4150
0.47563864655
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9864, 9542]
4139
0.479190180119
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9845, 9543]
4137
0.478898275561
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9846, 9562]
4138
0.480339305536
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9844, 9545]
[147.35184502601624, 165.54736804962158, 183.07105994224548, 194.29551601409912]
[0.4756386465502319, 0.47919018011903036, 0.4788982755612695, 0.48033930553607007]
[[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9864, 9542], [(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9845, 9543], [(ar

In [37]:
thresholds = [0.3,0.5,0.6,0.7,0.75,0.8,0.85,0.9]
permutations = [100,150,200,250]
parameters = [(1,100),(2,50),(4,25),(5,20),(10,10)]
MRMSE = []
RecommendResult = []
Runtime = []
for param in parameters:
    start_time = timeit.default_timer()
    modelResult = model(param,100)
    elapsed = timeit.default_timer()-start_time
    Runtime.append(elapsed)
    modelRMSE = calculateRMSE(modelResult)
    MRMSE.append(modelRMSE)
    print modelRMSE
    modelRecommend = recommendation(modelResult)
    RecommendResult.append(modelRecommend)
    print modelRecommend
    
print Runtime
print MRMSE
print RecommendResult

4137
0.479385280436
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9846, 9561]
4137
0.479385280436
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9846, 9561]
4137
0.479371321266
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9846, 9561]
4137
0.479393160586
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9846, 9561]
4855
0.476049930232
[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9865, 9541]
[114.7693121433258, 118.0986340045929, 123.17016100883484, 125.84952402114868, 162.07170391082764]
[0.4793852804361218, 0.4793852804361218, 0.4793713212655214, 0.4793931605856193, 0.4760499302319335]
[[(array([80, 77, 58, 37, 43, 62, 20, 70,  1, 49]), array([77, 37, 80, 58, 

In [38]:
thresholds = [0.3,0.5,0.6,0.7,0.75,0.8,0.85,0.9]
permutations = [100,150,200,250]
parameters = [(20,5),(25,4),(50,2),(100,1)]
MRMSE = []
RecommendResult = []
Runtime = []
for param in parameters:
    start_time = timeit.default_timer()
    modelResult = model(param,100)
    elapsed = timeit.default_timer()-start_time
    Runtime.append(elapsed)
    modelRMSE = calculateRMSE(modelResult)
    MRMSE.append(modelRMSE)
    print modelRMSE
    modelRecommend = recommendation(modelResult)
    RecommendResult.append(modelRecommend)
    print modelRecommend
    
print Runtime
print MRMSE
print RecommendResult

9667
0.476493457345
[(array([80, 77, 58, 37, 43, 20, 62, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9843, 9537]
9933
0.477140617994
[(array([80, 37, 77, 43, 58, 20, 62, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9834, 9565]
10000
0.47667425483
[(array([80, 77, 43, 58, 37, 20, 62, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9848, 9548]
10000
0.478706815097
[(array([80, 77, 43, 37, 58, 20, 62, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9843, 9561]
[366.38730001449585, 416.6616520881653, 600.303719997406, 1182.8339221477509]
[0.4764934573450613, 0.47714061799375934, 0.47667425483016834, 0.4787068150970586]
[[(array([80, 77, 58, 37, 43, 20, 62, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9843, 9537], [(array([80, 37, 77, 43, 58, 20, 62, 70,  1, 49]), array([77, 37, 80, 58, 43, 20, 62, 70,  1, 49])), (100, 100), 9834, 9565], [(arr

In [55]:
thresholds = [0.3,0.5,0.7,0.75,0.8,0.85,0.9]
permutations = [100]
parameters = [(20,5),(25,4),(50,2),(100,1)]

RMSEALL=[]
RTALL = []
RecALL = []
for i in range(5):
    MRMSE = []
    RecommendResult = []
    Runtime = []
    for permutation in permutations:
        start_time = timeit.default_timer()
        modelResult = model((10,10),permutation)
        elapsed = timeit.default_timer()-start_time
        Runtime.append(elapsed)
        modelRMSE = calculateRMSE(modelResult)
        MRMSE.append(modelRMSE)
        #print modelRMSE
        modelRecommend = recommendation(modelResult)
        RecommendResult.append(modelRecommend[2])
        #print modelRecommend
    RTALL.append(Runtime)
    RMSEALL.append(MRMSE)
    RecALL.append(RecommendResult)
    
#print Runtime
#print MRMSE
#print RecommendResult



4855
4855
4855
4855
4855


In [56]:
print np.sum(RMSEALL, axis=0)/5.0
print np.sum(RTALL, axis=0)/5.0
print np.sum(RecALL, axis=0)/5.0

[ 0.47604993]
[ 170.06138439]
[ 9865.]
