# Travel Attraction Recommendation System in Bangkok

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
%matplotlib inline

from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import dump
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Evaluation functions

In [2]:
df_title = pd.read_csv('Data/travel attractions.csv', usecols = ['title','spotId'])
def find_candidate(model, username):
    """
    Score every attraction for one user and drop the ones already visited.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(user, spot_id)`` whose result has an
        ``est`` attribute (used here as the predicted rating).
    username : str
        User to recommend for; converted with ``str()`` before use.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_title`` with an added ``score`` column, sorted by
        score in descending order.  Titles the user has in the training
        frame ``df`` are removed, except titles that ``get_average`` also
        returns for this user (those stay in the list).
    """
    user = str(username)

    def predicted_score(spot_id):
        return model.predict(user, spot_id).est

    scored = df_title.copy()
    scored['score'] = scored['spotId'].apply(predicted_score)

    # Remove the user's test titles from the training rows first, because some
    # people visit the same place several times and it may appear in both sets.
    held_out_titles = get_average(user).keys()
    train_only = df[~df['title'].isin(held_out_titles)]
    visited = train_only.loc[train_only['username'] == user, 'title'].unique().tolist()

    unseen = scored[~scored['title'].isin(visited)]
    return unseen.sort_values(by=['score'], ascending=False)

In [3]:
def get_average(username, min_rating=1.0):
    """
    Return the mean test-set rating per attraction title for one user.

    The test ratings are read from ``test_process_over2.csv`` on every call.
    When a user rated the same title several times, the arithmetic mean of
    all those ratings is used (the previous running ``(old + new) / 2``
    weighted later ratings more heavily once there were three or more).

    Parameters
    ----------
    username : str
        Value to match against the ``username`` column.
    min_rating : float, optional
        A title is kept only when its mean rating is ``>= min_rating``.
        The default of 1.0 is the threshold the code has always applied.
        NOTE(review): the original comment said a place needs a score above
        3.0 to count as relevant; confirm which threshold is intended.

    Returns
    -------
    dict
        ``{title: mean rating}`` in order of first appearance in the file;
        empty when the user has no rows.
    """
    user_df = pd.read_csv('test_process_over2.csv')
    user_rows = user_df[user_df['username'] == username]

    # sort=False keeps titles in the order they first appear for this user.
    mean_by_title = user_rows.groupby('title', sort=False)['rating'].mean()

    return {
        title: rating
        for title, rating in mean_by_title.items()
        if rating >= min_rating
    }
# get_average('nellielim')

In [68]:
# Find Average precision (AP) and Mean average precision (MAP) function
def apk(actual, predicted, k=10):
    """
    Score a ranked prediction list against a collection of relevant items.

    Only the first ``k`` entries of ``predicted`` are looked at.  An entry is
    a hit when it is in ``actual`` and has not appeared earlier in the list;
    each hit adds the precision at its rank (hits so far / rank).

    NOTE: the sum is divided by ``min(len(actual), k, hits)`` where ``hits``
    counts as 1 when nothing was hit.  The divisor is therefore also capped
    by the number of hits found, not only by ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
             Relevant elements (order doesn't matter)
    predicted : list
                Predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : float
            The score described above; 0.0 when ``actual`` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hits = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        # A repeated prediction is only credited the first time it shows up.
        if candidate in actual and candidate not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / float(rank)

    if not actual:
        return 0.0

    # Avoid a zero divisor when nothing was hit.
    hit_cap = hits if hits else 1
    return precision_sum / min(len(actual), k, hit_cap)

# def apk2(actual, predicted, k=10):
#     """
#     Computes the average precision at k.
#     This function computes the average prescision at k between two lists of
#     items.
#     Parameters
#     ----------
#     actual : list
#              A list of elements that are to be predicted (order doesn't matter)
#     predicted : list
#                 A list of predicted elements (order does matter)
#     k : int, optional
#         The maximum number of predicted elements
#     Returns
#     -------
#     score : double
#             The average precision at k over the input lists
#     """
#     if len(predicted)>k:
#         predicted = predicted[:k]

#     score = 0.0
#     num_hits = 0.0
#     count_hits = 0.0

#     for i,p in enumerate(predicted):
#         if p in actual and p not in predicted[:i]:
#             num_hits += 1.0
#             score += num_hits / (i+1.0)
#             count_hits += 1
            
#     if count_hits == 0.0:
#         count_hits = 1
#     else:
#         count_hits = count_hits
        
#     result = score / min(len(actual), k, count_hits)

#     if not actual:
#         result = 0.0
    
#     return result

def mapk(actual, predicted, k=10):
    """
    Average the per-user ``apk`` score over many users.

    ``actual`` and ``predicted`` are walked in lockstep with ``zip``, so any
    surplus entries in the longer one are ignored.

    Parameters
    ----------
    actual : list
             One list of relevant elements per user
             (order doesn't matter inside each list)
    predicted : list
                One ranked list of predicted elements per user
                (order matters inside each list)
    k : int, optional
        The maximum number of predicted elements per user

    Returns
    -------
    score : float
            ``np.mean`` of the per-user ``apk`` values
    """
    per_user_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_user_scores.append(apk(relevant, ranked, k))
    return np.mean(per_user_scores)

In [5]:
# for num in range(1,11):
#     print(apk(actual, predict, num))

In [67]:
# Find Average Recall (AR) and Mean average Recall (MAR) function
def _ark(actual, predicted, k=10):
    """
    Computes the average recall at k.
    Parameters
    ----------
    actual : list
        A list of actual items to be predicted
    predicted : list
        An ordered list of predicted items
    k : int, default = 10
        Number of predictions to consider
    Returns:
    -------
    score : int
        The average recall at k.
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0
    count_rel = len(actual)
    count_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / count_rel
            count_hits += 1
#     print('score = {}'.format(score))
    
    if count_hits == 0.0:
        count_hits = 1
    else:
        count_hits = count_hits

    if not actual:
        return 0.0

    return score / min(len(actual), count_hits, k)

def mark(actual, predicted, k=10):
    """
    Average the per-user ``_ark`` score over many users.

    ``actual`` and ``predicted`` are walked in lockstep with ``zip``, so any
    surplus entries in the longer one are ignored.

    Parameters
    ----------
    actual : a list of lists
        Relevant items per user
        example: [['A', 'B', 'X'], ['A', 'B', 'Y']]
    predicted : a list of lists
        Ordered predictions per user
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    k : int, default = 10
        Number of predictions to consider per user

    Returns
    -------
    mark : float
        ``np.mean`` of the per-user ``_ark`` values
    """
    per_user_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_user_scores.append(_ark(relevant, ranked, k))
    return np.mean(per_user_scores)

In [7]:
# for num in range(1,11):
#     print(_ark(actual, predict, num))

In [8]:
# Compute Avg. precision and Avg. recall at every cutoff 1..k for a single user
def find_apk(actual, predict, k):
    """
    Tabulate ``apk`` and ``_ark`` for every cutoff from 1 to ``k``.

    Parameters
    ----------
    actual : list
        Relevant items for the user.
    predict : list
        Ranked predictions for the user.
    k : int
        Largest cutoff to evaluate (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per cutoff with columns ``'@k'``, ``'Average precision'``
        and ``'Average recall'``.  All three columns are floats, matching
        the ``1.0, 2.0, ...`` cutoffs the row-by-row version displayed.
    """
    columns = ['@k', 'Average precision', 'Average recall']
    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        [num, apk(actual, predict, num), _ark(actual, predict, num)]
        for num in range(1, k + 1)
    ]
    return pd.DataFrame(rows, columns=columns, dtype=float)

### Simple Rec. uses this function to compute average recall and precision

In [9]:
# The Simple Recommendation System yields a single ranking shared by every
# user, so the model does not have to be re-run for each user.
user_df = pd.read_csv('test_process_over2.csv')
def MAP_MAR_Simple(candidate, k):
    """
    Evaluate one fixed ranking against every user in the test set.

    Parameters
    ----------
    candidate : pandas.DataFrame
        Ranked recommendations; only its ``'title'`` column is read, in row
        order.
    k : int
        Largest cutoff to evaluate (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per cutoff 1..k with columns ``'@k'``,
        ``'Mean average precision'`` and ``'Mean average recall'`` (floats).
    """
    # The ranking is the same for every user, so convert it only once.
    ranked_titles = candidate['title'].tolist()

    all_user_rated = []
    all_predicted = []
    # Original author's note: the test file holds the 20% hold-out users,
    # restricted to people who rated more than 2 times.
    for name in user_df['username'].unique():
        all_user_rated.append(list(get_average(name).keys()))
        all_predicted.append(ranked_titles)

    columns = ['@k', 'Mean average precision', 'Mean average recall']
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    rows = [
        [num,
         mapk(all_user_rated, all_predicted, num),
         mark(all_user_rated, all_predicted, num)]
        for num in range(1, k + 1)
    ]
    return pd.DataFrame(rows, columns=columns, dtype=float)
        
        
# avg_recall_precision(user_rated, candidate)

### Other models (SVD, Content-based, CF, Hybrid) use this function to compute avg. recall and precision

In [10]:
# SVD and content-based models use this!
user_df = pd.read_csv('test_process_over2.csv')
def MAP_MAR(model, k):
    """
    Evaluate a recommender against every user in the test set.

    Two kinds of ``model`` are supported:

    * an object with a ``predict`` attribute (e.g. the fitted SVD model),
      which is ranked through ``find_candidate(model, name)``;
    * any other callable (e.g. ``content_based``), which is called with a
      ``pandas.Series`` built from the user's ``get_average`` result.

    The kind is checked explicitly.  The earlier version tried the first path
    and fell back to the second inside a bare ``except:``, which also hid
    unrelated errors and interrupts.

    Parameters
    ----------
    model : object or callable
        Recommender to evaluate, as described above.
    k : int
        Largest cutoff to evaluate (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per cutoff 1..k with columns ``'@k'``,
        ``'Mean average precision'`` and ``'Mean average recall'`` (floats).
    """
    all_user_rated = []
    all_predicted = []
    uses_predict = hasattr(model, 'predict')

    # Original author's note: the test file holds the 20% hold-out users,
    # restricted to people who rated more than 2 times.
    for name in user_df['username'].unique():
        # Titles (and mean ratings) this user has in the test set.
        user_rated = get_average(name)

        if uses_predict:
            # Candidate frame [spotId, title, score] for this user.
            candidate = find_candidate(model, name)
        else:
            # NOTE (original author): a test user with very few ratings can
            # yield an empty dict here, which the called model may not accept.
            candidate = model(pd.Series(user_rated))

        all_user_rated.append(list(user_rated.keys()))
        all_predicted.append(candidate['title'].tolist())

    columns = ['@k', 'Mean average precision', 'Mean average recall']
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    rows = [
        [num,
         mapk(all_user_rated, all_predicted, num),
         mark(all_user_rated, all_predicted, num)]
        for num in range(1, k + 1)
    ]
    return pd.DataFrame(rows, columns=columns, dtype=float)
        
# avg_recall_precision(svd)

# Import data

In [11]:
# Import the training reviews (covering 50 travel attractions) and the attraction profiles
df = pd.read_csv('train_process_over2.csv')
df = df.drop_duplicates()
df = df.dropna(subset = ['username'])

travel = pd.read_csv('Data/travel attractions.csv')

In [12]:
# training data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91650 entries, 0 to 91649
Data columns (total 15 columns):
experience_month    89340 non-null object
experience_year     89340 non-null float64
location            91650 non-null object
name                91650 non-null object
rating              91650 non-null float64
review_body         91650 non-null object
review_head         91650 non-null object
trip_type           74450 non-null object
user_location       81047 non-null object
username            91650 non-null object
write_date          91650 non-null object
spotId              91650 non-null int64
title               91650 non-null object
genres              91650 non-null object
overview            91650 non-null object
dtypes: float64(2), int64(1), object(12)
memory usage: 11.2+ MB


In [13]:
# testing data
user_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10006 entries, 0 to 10005
Data columns (total 15 columns):
experience_month    9735 non-null object
experience_year     9735 non-null float64
location            10006 non-null object
name                10006 non-null object
rating              10006 non-null float64
review_body         10006 non-null object
review_head         10006 non-null object
trip_type           8101 non-null object
user_location       8849 non-null object
username            10006 non-null object
write_date          10006 non-null object
spotId              10006 non-null int64
title               10006 non-null object
genres              10006 non-null object
overview            10006 non-null object
dtypes: float64(2), int64(1), object(12)
memory usage: 1.1+ MB


# Modeling

# 1. Simple Recommendation
#### Weighted with rating's count and average ratings

In [14]:
# Per-attraction total and mean of the ratings, ordered by title.
# The empty placeholder frame that used to be created first was overwritten
# immediately, so it is gone.  String aggregator names give the same 'sum' and
# 'mean' column labels as np.sum / np.mean without the pandas FutureWarning.
# NOTE(review): the heading above speaks of the rating *count*, but 'sum' is
# the total of the rating values -- confirm which one the weighting should use.
md = df.groupby(['spotId','title'], as_index = False).agg({'rating':['sum','mean']}).sort_values(by = 'title')
md.head()

Unnamed: 0_level_0,spotId,title,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean
17,18,ASIATIQUE The Riverfront,16004.0,4.098335
26,27,Art in Paradise Bangkok,1203.0,4.422794
23,24,Baiyoke Sky Tower,2723.0,3.946377
13,14,Bangkok Art and Culture Center,1918.0,4.20614
19,20,Benjasiri Park,967.0,4.259912


In [15]:
# Mean of the per-attraction mean ratings (used by weighted_rating below)
avg_avg = md['rating']['mean'].mean()
avg_avg
# 50th percentile (median) of the per-attraction 'sum' column
percentile = md['rating']['sum'].quantile(0.5)
print('Average of mean is {} and 50 percentile is {}'.format(avg_avg
                                                            , percentile))

Average of mean is 4.128023690461919 and 50 percentile is 2891.0


In [16]:
# Qualified attractions: rating 'sum' at or above the median computed above
# (2891 in the run shown) and non-null sum/mean.  No filter on the mean
# rating is applied here.
# .copy() makes this an independent frame so later column assignments do not
# raise SettingWithCopyWarning.  The former chained int casts
# (qualified['rating']['sum'] = ...) never took effect -- the displayed values
# stayed floats -- and casting the mean to int would have truncated it, so
# they are removed.
qualified = md[ (md['rating']['sum'] >= percentile) &
               (md['rating']['sum'].notnull()) &
               (md['rating']['mean'].notnull()) ].copy()
qualified.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0_level_0,spotId,title,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean
17,18,ASIATIQUE The Riverfront,16004.0,4.098335
29,30,Central World,10503.0,4.24021
24,25,Chao Phraya River,9609.0,4.234905
7,8,Chatuchak weekend market,28741.0,4.321305
21,22,Chinatown - Bangkok (Yaowarat Road),11620.0,3.989015


In [17]:
# Simple weighting function
def weighted_rating(x):
    """
    Blend one attraction's mean rating with the global mean ``avg_avg``.

    ``x`` is a row exposing ``x['rating']['sum']`` and ``x['rating']['mean']``.
    The row's own mean is weighted by ``sum / (sum + percentile)`` and the
    global mean by ``percentile / (sum + percentile)``; ``percentile`` and
    ``avg_avg`` are read from the notebook's globals.
    """
    votes = x['rating']['sum']
    own_mean = x['rating']['mean']

    own_part = votes/(votes+percentile) * own_mean
    prior_part = percentile/(votes + percentile) * avg_avg
    return own_part + prior_part

In [18]:
# Compute the weighted score for every qualified attraction (one call per row)
qualified['score'] = qualified.apply(weighted_rating, axis = 1)
qualified = qualified.sort_values('score', ascending = False)

# Ranked recommendation list: drop the raw 'rating' columns and keep the rest
candidate_simple = qualified.drop(['rating'], axis = 1)
candidate_simple = pd.DataFrame(data = candidate_simple)
candidate_simple.head(20)
# Recommend the top 5, 10, 15 and 20 spots

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  new_axis = axis.drop(labels, errors=errors)


Unnamed: 0,spotId,title,score
,,,
1.0,2.0,Wat Phra Chetuphon (Wat Pho),4.553976
0.0,1.0,Temple of the Emerald Buddha (Wat Phra Kaew),4.383966
2.0,3.0,Temple of Dawn (Wat Arun),4.371982
3.0,4.0,Temple of the Golden Buddha (Wat Traimit),4.352891
35.0,36.0,Siam Niramit Show,4.323505
20.0,21.0,Siam Paragon,4.311319
10.0,11.0,Jim Thompson house,4.308838
7.0,8.0,Chatuchak weekend market,4.30364
11.0,12.0,Lumpini park,4.25221


## Evaluate Simple Recommendation

In [48]:
# Results for username 'zamanwho'
candidate = candidate_simple['title'].tolist()

user_rated = list(get_average('zamanwho').keys())

find_apk(user_rated, candidate, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,0.0,0.0
1,2.0,0.0,0.0
2,3.0,0.0,0.0
3,4.0,0.0,0.0
4,5.0,0.0,0.0
5,6.0,0.0,0.0
6,7.0,0.0,0.0
7,8.0,0.0,0.0
8,9.0,0.0,0.0
9,10.0,0.0,0.0


### **-------------------------------------------------------------------------------------------------------------------------**

# 2. Content-based Model

In [19]:
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# # Use travel attractions profile
# df = pd.read_csv('Data/Travel attractions.csv')
# df.head(3)

In [21]:
# Transform the pipe-separated genres column into a bag-of-words string.
# 'genres' becomes a list of genre names; 'Bag_of_words' is those names joined
# with a space after each one (so the string keeps its trailing space).
# Mapping the column replaces the row-by-row chained assignment
# (travel['Bag_of_words'][i] = ...), which raised SettingWithCopyWarning and is
# not guaranteed to write through to the frame.
travel['genres'] = travel['genres'].map(lambda x: x.split('|'))
travel['Bag_of_words'] = travel['genres'].map(
    lambda genres: ''.join(gen + ' ' for gen in genres))
travel.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,spotId,title,genres,overview,Bag_of_words
0,1,Temple of the Emerald Buddha (Wat Phra Kaew),"[Relax, Convenience, GoodView, Local, Accommod...",Temple of the Emerald Buddha is regarded as th...,Relax Convenience GoodView Local Accommodation...
1,2,Wat Phra Chetuphon (Wat Pho),"[Relax, Convenience, GoodView, Local, Accommod...",The temple is first on the list of six temples...,Relax Convenience GoodView Local Accommodation...
2,3,Temple of Dawn (Wat Arun),"[Relax, Convenience, GoodView, Local, Accommod...",Wat Arun is a Buddhist temple in Bangkok Yai d...,Relax Convenience GoodView Local Accommodation...
3,4,Temple of the Golden Buddha (Wat Traimit),"[Relax, Convenience, GoodView, Local, Accommod...",The Golden Buddha commonly known in Thai as Ph...,Relax Convenience GoodView Local Accommodation...
4,5,Wat Suthat,"[Relax, Convenience, GoodView, Local, Accommod...","Wat Suthat is a Buddhist temple in Bangkok, Th...",Relax Convenience GoodView Local Accommodation...


In [22]:
# Use only 2 columns title and bag of words
df_gen = travel[['title','Bag_of_words']]
df_gen.head()

Unnamed: 0,title,Bag_of_words
0,Temple of the Emerald Buddha (Wat Phra Kaew),Relax Convenience GoodView Local Accommodation...
1,Wat Phra Chetuphon (Wat Pho),Relax Convenience GoodView Local Accommodation...
2,Temple of Dawn (Wat Arun),Relax Convenience GoodView Local Accommodation...
3,Temple of the Golden Buddha (Wat Traimit),Relax Convenience GoodView Local Accommodation...
4,Wat Suthat,Relax Convenience GoodView Local Accommodation...


In [23]:
# Find average rating for each user
# def get_average(username):
#     user_df = pd.read_csv('test_process.csv')
#     user_rated = {}
#     for index, values in user_df[user_df['username'] == username].iterrows():
#         if values['title'] not in user_rated:
#             user_rated[values['title']] = values['rating']
#         else:
#             user_rated[values['title']] = (user_rated[values['title']] + values['rating'])/2
#     return user_rated

In [24]:
user_rated = get_average('zamanwho')

my_ratings = pd.Series(user_rated)
my_ratings.index

Index(['Central World', 'Platinum Fashion Mall'], dtype='object')

In [25]:
user_df[user_df['username']=='zamanwho'][['username','name', 'title', 'rating', 'review_body']]

Unnamed: 0,username,name,title,rating,review_body
3735,zamanwho,zamanwho,Central World,5.0,Central World Plaza is an enormous Mall situat...
4645,zamanwho,zamanwho,Platinum Fashion Mall,4.0,The Platinum Fashion Mall is located in the sa...


In [26]:
df[df['username']=='zamanwho'][['username','name', 'title', 'rating', 'review_body']]

Unnamed: 0,username,name,title,rating,review_body
27946,zamanwho,zamanwho,Central World,4.0,It’s one of the huge shopping malls in Bangkok...
31684,zamanwho,zamanwho,MBK Center (Ma Boon Khrong Center),4.0,It’s a lovely place in Bangkok for shopping lo...
33514,zamanwho,zamanwho,Pratunam Market,3.0,Pratunam Market is one of the crowded place as...
40753,zamanwho,zamanwho,Chatuchak weekend market,4.0,One has to plan plenty of time to roaming arou...
41414,zamanwho,zamanwho,Lumpini park,4.0,This lovely park is a beautiful place for roam...
41940,zamanwho,zamanwho,Temple of the Emerald Buddha (Wat Phra Kaew),4.0,The Temple of the Emerald Buddha is also known...
43509,zamanwho,zamanwho,Siam Paragon,4.0,"Siam Paragon, a huge mall is located in the he..."
44710,zamanwho,zamanwho,Wat Phra Chetuphon (Wat Pho),4.0,The Temple of the Reclining Buddha known as Wa...
46997,zamanwho,zamanwho,The Grand Palace,4.0,The Grand Palace Bangkok is very interesting p...
49688,zamanwho,zamanwho,Temple of the Emerald Buddha (Wat Phra Kaew),4.0,The Temple of the Emerald Buddha is also known...


In [27]:
## This cell must be run: it builds the similarity table used by recommend()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

# min_df=1 keeps every term, exactly as min_df=0 did, but is also accepted by
# scikit-learn releases that validate parameters (integer min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(travel['Bag_of_words'])

# Pairwise similarity between all attractions (the variable name reflects
# TfidfVectorizer's default L2 normalisation, which makes this cosine similarity).
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# spotId -> list of (similarity, spotId) tuples, most similar first.
results = {}

# Work with row positions so the lookup does not depend on travel's index
# labels matching the rows of the similarity matrix.
spot_ids = travel['spotId'].values
for pos, spot_id in enumerate(spot_ids):
    # Highest similarity first, at most 99 neighbours.
    similar_indices = cosine_similarities[pos].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[pos][i], spot_ids[i]) for i in similar_indices]

    # Slicing from 0 keeps the attraction itself in its own list; the original
    # note suggests slicing from 1 to leave it out.
    # NOTE(review): attractions with identical genres tie on similarity, so the
    # attraction itself is not necessarily the first entry -- verify before
    # relying on [1:].
    results[spot_id] = similar_items[0:]

def item(id):
    """Return the title of the first ``travel`` row with this spotId, cut at the first ' - '."""
    matching_rows = travel.loc[travel['spotId'] == id]
    first_title = matching_rows['title'].tolist()[0]
    return first_title.split(' - ')[0]

# Nothing is computed here: the neighbours come straight from the `results` dictionary.
def recommend(item_id, num):
    """
    Return the first ``num`` precomputed neighbours of ``item_id``.

    Each neighbour is a ``(similarity, spotId)`` tuple taken from
    ``results[item_id]``; an unknown ``item_id`` raises ``KeyError``.
    """
    neighbours = results[item_id]
    return neighbours[:num]

# recommend(item_id=1, num=20)

In [28]:
# Content-based recommendation for a single user
def content_based(my_ratings, num_similar=50):
    """
    Rank attractions by summed content similarity to the ones a user rated.

    For every title in ``my_ratings.index`` the ``num_similar`` most similar
    attractions are fetched with ``recommend``.  The similarity scores are
    then summed per attraction and sorted from highest to lowest total.  Only
    the index of ``my_ratings`` is used, not its values, and titles the user
    already rated are not removed from the result.

    Parameters
    ----------
    my_ratings : pandas.Series
        Indexed by attraction title (as spelled in ``travel['title']``).
    num_similar : int, optional
        Neighbours fetched per rated title.  Defaults to 50, the value that
        used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``['spotId', 'title', 'score']``, best score first.  Empty
        (same columns) when ``my_ratings`` has no entries; previously that
        case crashed in the group-by step.

    Raises
    ------
    IndexError
        If a title in ``my_ratings`` does not occur in ``travel``.
    """
    neighbours = []
    for title in my_ratings.index:
        spot_id = travel.loc[travel['title'] == title, 'spotId'].tolist()[0]
        neighbours.extend(recommend(spot_id, num_similar))

    if not neighbours:
        return pd.DataFrame(columns=['spotId', 'title', 'score'])

    # One frame built from all (similarity, spotId) tuples; DataFrame.append
    # was removed in pandas 2.0.
    sim_candidates = pd.DataFrame(neighbours, columns=['score', 'spotId'])

    # Total similarity per attraction across everything the user rated.
    totals = sim_candidates.groupby('spotId', as_index=False)['score'].sum()
    totals = totals.sort_values(by='score', ascending=False)

    candidate_content = totals.merge(travel[['spotId', 'title']], on='spotId')
    return candidate_content[['spotId', 'title', 'score']]
content_based(my_ratings)
# ['Temple of the Emerald Buddha (Wat Phra Kaew)', 'Siam Paragon',
#        'Temple of the Golden Buddha (Wat Traimit)', 'Sukhumvit',
#        'Museum of Contemporary Art (MOCA)']

Unnamed: 0,spotId,title,score
0,49,IconSiam,1.531126
1,21,Siam Paragon,1.531126
2,46,Pantip Plaza,1.531126
3,45,MBK Center (Ma Boon Khrong Center),1.531126
4,23,Platinum Fashion Mall,1.531126
5,13,Terminal 21,1.531126
6,30,Central World,1.531126
7,48,Emporium Bangkok and EmQuartier Bangkok,1.469344
8,32,King Power Mahanakhon,1.243723
9,39,Flow House Bangkok,1.180739


## Evaluate model (Content-based use only genres)

In [33]:
user_rated

['Central World', 'Platinum Fashion Mall']

In [32]:
candidate_content

['IconSiam',
 'Siam Paragon',
 'Pantip Plaza',
 'MBK Center (Ma Boon Khrong Center)',
 'Platinum Fashion Mall',
 'Terminal 21',
 'Central World',
 'Emporium Bangkok and EmQuartier Bangkok',
 'King Power Mahanakhon ',
 'Flow House Bangkok',
 'Baiyoke Sky Tower',
 'Dream World',
 'Siam AMAZING Park',
 'Sukhumvit',
 'Lumpini park ',
 'SEA LIFE Bangkok Ocean World',
 'Art in Paradise Bangkok',
 'The National Museum & Wang Na Palace ',
 'Museum of Contemporary Art (MOCA)',
 'ASIATIQUE The Riverfront',
 'Pratunam Market',
 'Bangkok Art and Culture Center ',
 'Calypso Cabaret',
 'Chinatown - Bangkok (Yaowarat Road)',
 'Damnoen Saduak Floating Market ',
 'Taling Chan Floating Market',
 'Siam Niramit Show',
 'Train Night Market Ratchada',
 'Soi Cowboy ',
 'Patpong Night Market',
 'KhaoSan Road',
 'Madame Tussauds Bangkok',
 'Safari World',
 'Chatuchak weekend market ',
 'Wat Bowonniwet Vihara',
 'Temple of the Emerald Buddha (Wat Phra Kaew)',
 'Wat Phra Chetuphon (Wat Pho)',
 'Wat Benchamabophi

In [50]:
# Results for username 'zamanwho' (my_ratings comes from the earlier cell for the same user)
user_rated = list(get_average('zamanwho').keys())
candidate_content = content_based(my_ratings)['title'].tolist()

find_apk(user_rated, candidate_content, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,0.0,0.0
1,2.0,0.0,0.0
2,3.0,0.0,0.0
3,4.0,0.0,0.0
4,5.0,0.2,0.5
5,6.0,0.2,0.5
6,7.0,0.242857,0.75
7,8.0,0.242857,0.75
8,9.0,0.242857,0.75
9,10.0,0.242857,0.75


In [52]:
# Results over a sample of 560 users who each rated more than 10 times
# Use genres column only!
MAP_MAR(content_based, 10)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.598083,0.560999
1,2.0,0.62995,0.621978
2,3.0,0.670448,0.740737
3,4.0,0.674964,0.771396
4,5.0,0.68809,0.849274
5,6.0,0.69259,0.880023
6,7.0,0.702432,0.957631
7,8.0,0.700249,0.963013
8,9.0,0.69953,0.964549
9,10.0,0.699057,0.965424


### **-------------------------------------------------------------------------------------------------------------------------**

# 3. Singular value decomposition (SVD)

In [53]:
# df = pd.read_csv('train_process_over3_newover3.csv')
# df = df.drop_duplicates()
# df = df.dropna(subset = ['username'])

# travel = pd.read_csv('Data/travel attractions.csv')

In [29]:
# Reader with surprise's default settings
# NOTE(review): presumably the default rating scale matches this data -- confirm.
reader = Reader()

# surprise expects the columns in the order user, item, rating
data = Dataset.load_from_df(df[['username', 'spotId','rating']], reader)

# trainset is built from all of `data`; it is not the 80% split
trainset = data.build_full_trainset()

In [31]:
# from surprise import Reader, Dataset, SVD
from surprise.model_selection import GridSearchCV
# from surprise.model_selection.validation import cross_validate

# Hyper-parameter grid for SVD: 2 * 2 * 2 * 3 = 24 combinations
param_grid = {'n_epochs': [25, 40], 'lr_all': [0.01, 0.015],
              'reg_all': [0.6, 0.8], 'n_factors':[25,50,75]}

# 3-fold cross-validated grid search scored on RMSE and MAE
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs_svd.fit(data)

# Best score found for each measure
print(gs_svd.best_score['rmse'])
print(gs_svd.best_score['mae'])

# Parameter combination that gave the best RMSE
print(gs_svd.best_params['rmse'])

In [32]:
# The model is created with hard-coded hyper-parameters rather than taken from
# the grid search above; random_state is fixed so reruns are reproducible.
# svd_algo = gs_svd.best_estimator['rmse']
svd_algo = SVD(n_factors= 75, n_epochs = 25, lr_all = 0.01, reg_all = 0.6 ,random_state = 1) 
# svd_algo = SVD()

In [57]:
# Run 5-fold cross-validation and print results
# Training 80% Test 20% from trainingset data
cross_validate(svd_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8342  0.8183  0.8214  0.8378  0.8299  0.8283  0.0074  
MAE (testset)     0.6505  0.6459  0.6466  0.6576  0.6505  0.6502  0.0042  
Fit time          3.70    4.25    3.76    4.05    4.04    3.96    0.20    
Test time         0.19    0.13    0.09    0.13    0.11    0.13    0.03    


{'test_rmse': array([0.83423992, 0.81826031, 0.82137158, 0.83776067, 0.82987107]),
 'test_mae': array([0.65049873, 0.64587078, 0.64657389, 0.65758088, 0.65051376]),
 'fit_time': (3.6950368881225586,
  4.254155874252319,
  3.7647697925567627,
  4.045993089675903,
  4.04119610786438),
 'test_time': (0.18632006645202637,
  0.12829279899597168,
  0.09309005737304688,
  0.13331389427185059,
  0.10760498046875)}

In [33]:
# Create model from training set
svd_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc4e4f3f150>

## Predict for a single user (the cell below uses 'zamanwho')

In [34]:
# df_title = pd.read_csv('Data/travel attractions.csv', usecols = ['title','spotId'])
# candidate_svd = df_title.copy()

# candidate_svd['score'] = candidate_svd['spotId'].apply(lambda x: svd.predict('nellielim',x).est)
# candidate_svd = candidate_svd.sort_values(by=['score'], ascending=False)
# candidate_svd.head(20)
find_candidate(svd_algo, 'zamanwho').head(20)

Unnamed: 0,spotId,title,score
48,49,IconSiam,4.291303
40,41,Museum of Contemporary Art (MOCA),4.29003
31,32,King Power Mahanakhon,4.233507
25,26,Rajadamnern Thai Boxing Stadium,4.215957
27,28,Snake Farm (Queen Saovabha Memorial Institute),4.211325
3,4,Temple of the Golden Buddha (Wat Traimit),4.201375
46,47,Wat Bowonniwet Vihara,4.186072
4,5,Wat Suthat,4.164669
5,6,Wat Benchamabophit (The Marble Temple),4.164074
2,3,Temple of Dawn (Wat Arun),4.149796


## Evaluate model

In [60]:
# Results for username 'zamanwho'
user_rated = list(get_average('zamanwho').keys())
candidate_svd = find_candidate(svd_algo, 'zamanwho')['title'].tolist()

find_apk(user_rated, candidate_svd, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,0.0,0.0
1,2.0,0.0,0.0
2,3.0,0.0,0.0
3,4.0,0.0,0.0
4,5.0,0.0,0.0
5,6.0,0.0,0.0
6,7.0,0.0,0.0
7,8.0,0.0,0.0
8,9.0,0.0,0.0
9,10.0,0.0,0.0


In [64]:
# Results over a sample of 8724 users who each rated more than 3 times
MAP_MAR(svd_algo, 10)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.088327,0.07945
1,2.0,0.090983,0.083847
2,3.0,0.091829,0.086137
3,4.0,0.092147,0.087247
4,5.0,0.093197,0.092174
5,6.0,0.095988,0.106062
6,7.0,0.099276,0.129106
7,8.0,0.099851,0.133511
8,9.0,0.100643,0.139065
9,10.0,0.102013,0.151283


### **-------------------------------------------------------------------------------------------------------------------------**

# 4. SVD++ model

In [65]:
# df = pd.read_csv('train_process.csv')
# df = df.drop_duplicates()
# df = df.dropna(subset = ['username'])

# travel = pd.read_csv('Data/travel attractions.csv')

In [35]:
from surprise import SVDpp

# The grid search below is disabled; its recorded results are at the bottom of
# this cell.
# NOTE(review): GridSearchCV is not among the imports at the top of the file;
# import it from surprise.model_selection before re-enabling this code.
# reader = Reader()

# data = Dataset.load_from_df(df[['username', 'spotId','rating']], reader)
# trainset = data.build_full_trainset()

# param_grid = {'n_epochs': [15, 25], 'lr_all': [0.01, 0.015],
#               'reg_all': [0.6, 0.8], 'n_factors':[25,50]}

# gs_svdpp = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
# gs_svdpp.fit(data)

# print(gs_svdpp.best_score['rmse'])
# print(gs_svdpp.best_score['mae'])

# print(gs_svdpp.best_params['rmse'])


# svdpp_algo = gs_svdpp.best_estimator['rmse']
# cross_validate(svdpp_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


# svdpp_algo.fit(trainset)
# svdpp_predictions = svdpp_algo.test(testset)

# Recorded grid-search results (best scores and best parameters):
# RMSE 0.8321015116249821
# MAE 0.652170682543113
# {'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.6, 'n_factors': 50}

In [36]:
# SVD++ with fixed hyper-parameters and a fixed seed for reproducibility.
# NOTE(review): n_factors=75 was not part of the grid recorded in the previous
# cell (which tried 25 and 50 and reported 50 as best) - confirm 75 is intended.
svdpp_algo = SVDpp(n_factors= 75, n_epochs = 25, lr_all = 0.01, reg_all = 0.6 ,random_state = 1)

# trainset = data.build_full_trainset()
# Fit the model on the training set (`trainset` is built in an earlier cell).
svdpp_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fc4eacd4290>

In [68]:
# 5-fold cross-validation of the SVD++ model, reporting RMSE and MAE.
# NOTE(review): cross_validate appears to refit the estimator passed in on each
# fold; re-run the fit cell on the full trainset before predicting - confirm.
cross_validate(svdpp_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8296  0.8363  0.8289  0.8264  0.8207  0.8284  0.0050  
MAE (testset)     0.6504  0.6533  0.6491  0.6484  0.6473  0.6497  0.0020  
Fit time          17.94   18.15   18.30   18.05   18.08   18.10   0.12    
Test time         0.24    0.33    0.32    0.23    0.24    0.27    0.04    


{'test_rmse': array([0.82961807, 0.83626361, 0.82885913, 0.82642189, 0.82070023]),
 'test_mae': array([0.65036692, 0.65325212, 0.64910233, 0.64842139, 0.64730359]),
 'fit_time': (17.935545206069946,
  18.15137481689453,
  18.304162979125977,
  18.048035144805908,
  18.08067774772644),
 'test_time': (0.24394607543945312,
  0.3252279758453369,
  0.31627798080444336,
  0.22737979888916016,
  0.23821210861206055)}

## ทำนาย username 'zamanwho'

In [37]:
# Earlier inline version of this cell, kept for reference; find_candidate() now
# performs the scoring and sorting.
# df_title = pd.read_csv('Data/travel attractions.csv', usecols = ['title','spotId'])
# candidate_svdpp = df_title.copy()

# candidate_svdpp['score'] = candidate_svdpp['spotId'].apply(lambda x: svdpp_algo.predict('nellielim',x).est)
# candidate_svdpp = candidate_svdpp.sort_values(by=['score'], ascending=False)
# candidate_svdpp.head(20)
# Display every SVD++ candidate for user 'zamanwho', highest predicted score first.
find_candidate(svdpp_algo, 'zamanwho')

Unnamed: 0,spotId,title,score
48,49,IconSiam,4.291301
40,41,Museum of Contemporary Art (MOCA),4.290007
31,32,King Power Mahanakhon,4.233501
25,26,Rajadamnern Thai Boxing Stadium,4.215889
27,28,Snake Farm (Queen Saovabha Memorial Institute),4.21129
3,4,Temple of the Golden Buddha (Wat Traimit),4.201369
46,47,Wat Bowonniwet Vihara,4.186077
4,5,Wat Suthat,4.164672
5,6,Wat Benchamabophit (The Marble Temple),4.164062
2,3,Temple of Dawn (Wat Arun),4.149802


## Evaluate model (svdpp)

In [72]:
# Display the titles currently held in `user_rated` (set by an evaluation cell).
user_rated

['Central World', 'Platinum Fashion Mall']

In [71]:
# Results for user 'zamanwho'
# candidate_svdpp: attraction titles ranked by the SVD++ model's predicted score.
candidate_svdpp = find_candidate(svdpp_algo,'zamanwho')['title'].tolist()
# user_rated: titles this user rated in the test file (keys of get_average()).
user_rated = list(get_average('zamanwho').keys())

# Average precision / recall at k = 1..10 for this single user.
find_apk(user_rated, candidate_svdpp, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,0.0,0.0
1,2.0,0.0,0.0
2,3.0,0.0,0.0
3,4.0,0.0,0.0
4,5.0,0.0,0.0
5,6.0,0.0,0.0
6,7.0,0.0,0.0
7,8.0,0.0,0.0
8,9.0,0.0,0.0
9,10.0,0.0,0.0


In [75]:
# Overall results (tested on data from 560 users who rated more than 10 times).
# NOTE(review): the SVD evaluation cell cites 8724 users with more than 3
# ratings instead - confirm which test population this run actually used.
MAP_MAR(svdpp_algo, 10)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.088212,0.079421
1,2.0,0.089713,0.081913
2,3.0,0.090463,0.08378
3,4.0,0.091935,0.088589
4,5.0,0.092917,0.092841
5,6.0,0.097995,0.11969
6,7.0,0.106382,0.176676
7,8.0,0.109198,0.198673
8,9.0,0.112279,0.225835
9,10.0,0.11519,0.25675


### **-------------------------------------------------------------------------------------------------------------------------**

# 5. Item-based Collaborative filtering

In [76]:
# df = pd.read_csv('train_process_over3_newover3.csv')
# df = df.drop_duplicates()
# df = df.dropna(subset = ['username'])

# travel = pd.read_csv('Data/travel attractions.csv')

In [77]:
# Reader with the library's default settings.
reader = Reader()

# Create the training data from the (username, spotId, rating) columns of `df`
# and build a trainset from all of it (no hold-out split here).
data = Dataset.load_from_df(df[['username', 'spotId','rating']], reader)
trainset = data.build_full_trainset()

In [38]:
# Item-based KNN (user_based=False) using pearson_baseline similarity, k=7.
# NOTE: `sim_options` is reassigned by the user-based cell later in the notebook.
sim_options = {'name':'pearson_baseline', 'user_based':False}
knnbaseline_algo = KNNBaseline(sim_options = sim_options, k= 7)

# Fit on the trainset built in the previous cell.
knnbaseline_algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fc4e9e5d950>

In [79]:
# Test the item-based model with 5-fold cross-validation (RMSE and MAE).
# NOTE(review): cross_validate appears to refit the estimator passed in on each
# fold; re-run the fit cell on the full trainset before predicting - confirm.
cross_validate(knnbaseline_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9211  0.9234  0.8961  0.9063  0.9279  0.9150  0.0119  
MAE (testset)     0.6456  0.6487  0.6329  0.6369  0.6509  0.6430  0.0069  
Fit time          0.26    0.27    0.29    0.33    0.29    0.29    0.02    
Test time         0.21    0.20    0.27    0.34    0.2

{'test_rmse': array([0.92114981, 0.92343981, 0.89613789, 0.90633915, 0.92786951]),
 'test_mae': array([0.64559792, 0.64868495, 0.63291115, 0.63692873, 0.65092887]),
 'fit_time': (0.2588491439819336,
  0.2668578624725342,
  0.29135894775390625,
  0.3265390396118164,
  0.29178786277770996),
 'test_time': (0.20958423614501953,
  0.20090699195861816,
  0.2684202194213867,
  0.3376922607421875,
  0.20946407318115234)}

## ทำนาย username 'zamanwho'

In [80]:
# user_rated = get_average('WillGrace2013')

In [81]:
# Earlier inline version of this cell, kept for reference; find_candidate() now
# performs the scoring and sorting.
# df_title = pd.read_csv('Data/travel attractions.csv', usecols = ['title','spotId'])
# candidate_knn = df_title.copy()

# candidate_knn['score'] = candidate_knn['spotId'].apply(lambda x: knnbaseline_algo.predict('nellielim',x).est)
# candidate_knn = candidate_knn.sort_values(by=['score'], ascending=False)
# candidate_knn = candidate_knn[~candidate_knn['title'].isin(user_rated.keys())]
# candidate_knn.head(20)
# Display the first 20 item-based KNN candidates for user 'zamanwho'.
find_candidate(knnbaseline_algo, 'zamanwho').head(20)

Unnamed: 0,spotId,title,score
31,32,King Power Mahanakhon,4.379607
29,30,Central World,4.365335
46,47,Wat Bowonniwet Vihara,4.189246
40,41,Museum of Contemporary Art (MOCA),4.173055
4,5,Wat Suthat,4.10153
26,27,Art in Paradise Bangkok,4.097917
48,49,IconSiam,4.093323
2,3,Temple of Dawn (Wat Arun),4.079638
5,6,Wat Benchamabophit (The Marble Temple),4.034987
10,11,Jim Thompson house,4.022323


## Evaluate model Item-based Collaborative filtering

In [82]:
# Results for user 'zamanwho'
# user_rated: titles this user rated in the test file (keys of get_average()).
user_rated = list(get_average('zamanwho').keys())
# candidate_knn: attraction titles ranked by the item-based KNN model.
candidate_knn = find_candidate(knnbaseline_algo, 'zamanwho')['title'].tolist()

# Average precision / recall at k = 1..10 for this single user.
find_apk(user_rated, candidate_knn, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,0.0,0.0
1,2.0,0.5,0.5
2,3.0,0.5,0.5
3,4.0,0.5,0.5
4,5.0,0.5,0.5
5,6.0,0.5,0.5
6,7.0,0.5,0.5
7,8.0,0.5,0.5
8,9.0,0.5,0.5
9,10.0,0.5,0.5


In [83]:
# Overall results (tested on data from 560 users who rated more than 10 times).
# NOTE(review): the SVD evaluation cell cites 8724 users with more than 3
# ratings instead - confirm which test population this run actually used.
MAP_MAR(knnbaseline_algo, 10)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.049763,0.045081
1,2.0,0.060617,0.064211
2,3.0,0.067592,0.081841
3,4.0,0.072971,0.100862
4,5.0,0.077531,0.12237
5,6.0,0.081402,0.143471
6,7.0,0.085023,0.167184
7,8.0,0.088657,0.194654
8,9.0,0.091499,0.220782
9,10.0,0.093807,0.242739


### **-------------------------------------------------------------------------------------------------------------------------**

# User-based recommendation

In [39]:
# User-based KNN (user_based=True) using pearson_baseline similarity, k=7.
# NOTE: this overwrites the `sim_options` used for the item-based model.
sim_options = {'name':'pearson_baseline', 'user_based':True}
knnbaseline_algo2 = KNNBaseline(sim_options = sim_options, k= 7)

# Fit on the same trainset as the item-based model.
knnbaseline_algo2.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fc4e9e5d150>

In [85]:
# Test the user-based model with 5-fold cross-validation (RMSE and MAE).
# NOTE(review): cross_validate appears to refit the estimator passed in on each
# fold; re-run the fit cell on the full trainset before predicting - confirm.
cross_validate(knnbaseline_algo2, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8747  0.8856  0.8710  0.8824  0.8701  0.8768  0.0062  
MAE (testset)     0.6596  0.6636  0.6608  0.6656  0.6554  0.6610  0.0035  
Fit time          199.03  202.25  206.83  204.74  203.96  203.36  2.62    
Test time         38.15   38.21   36.69   38.56   39.

{'test_rmse': array([0.87467658, 0.88562019, 0.87095408, 0.88238997, 0.87014155]),
 'test_mae': array([0.65959589, 0.66358699, 0.6607739 , 0.66558238, 0.65535954]),
 'fit_time': (199.02716183662415,
  202.24796199798584,
  206.83022117614746,
  204.7435760498047,
  203.96393489837646),
 'test_time': (38.154205322265625,
  38.208712100982666,
  36.689704179763794,
  38.55995202064514,
  39.236464738845825)}

In [86]:
# Results for user 'zamanwho'
# user_rated: titles this user rated in the test file (keys of get_average()).
user_rated = list(get_average('zamanwho').keys())
# candidate_knn2: attraction titles ranked by the user-based KNN model.
candidate_knn2 = find_candidate(knnbaseline_algo2, 'zamanwho')['title'].tolist()
# Average precision / recall at k = 1..10 for this single user.
find_apk(user_rated, candidate_knn2, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,0.0,0.0
1,2.0,0.0,0.0
2,3.0,0.0,0.0
3,4.0,0.0,0.0
4,5.0,0.0,0.0
5,6.0,0.0,0.0
6,7.0,0.0,0.0
7,8.0,0.0,0.0
8,9.0,0.0,0.0
9,10.0,0.0,0.0


In [87]:
# Full candidate frame from the user-based model for 'zamanwho'.
# NOTE: this rebinds `candidate_knn`, which the item-based evaluation cell set
# to a plain list of titles; here it becomes a DataFrame.
candidate_knn = find_candidate(knnbaseline_algo2, 'zamanwho')
# Display the first 20 rows.
candidate_knn[:20]

Unnamed: 0,spotId,title,score
40,41,Museum of Contemporary Art (MOCA),4.870617
26,27,Art in Paradise Bangkok,4.6075
16,17,Train Night Market Ratchada,4.462611
4,5,Wat Suthat,4.455967
10,11,Jim Thompson house,4.448285
3,4,Temple of the Golden Buddha (Wat Traimit),4.423952
28,29,Siriraj Medical Museum,4.395518
6,7,The National Museum & Wang Na Palace,4.304181
31,32,King Power Mahanakhon,4.299068
46,47,Wat Bowonniwet Vihara,4.291891


In [88]:
# Overall results (tested on data from 560 users who rated more than 10 times).
# NOTE(review): the SVD evaluation cell cites 8724 users with more than 3
# ratings instead - confirm which test population this run actually used.
MAP_MAR(knnbaseline_algo2, 10)

# The figures below appear to come from an earlier run (percent scale); they do
# not match the table this cell currently displays.
# Avg. precision @ 3 is 10.024154589371978
# Avg. precision @ 5 is 9.251207729468598
# Avg. precision @ 7 is 8.661145617667357
# Avg. precision @ 9 is 8.588298443370908
# Mean average precision is 9.131201594969712 

# Avg. Recall @ 3 is 10.907857909669502
# Avg. Recall @ 5 is 17.043267387470284
# Avg. Recall @ 7 is 22.425715052526648
# Avg. Recall @ 9 is 28.753354804079436
# Mean average recall is 19.78254878843647 

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.058192,0.051701
1,2.0,0.072798,0.076865
2,3.0,0.080437,0.097159
3,4.0,0.08673,0.120441
4,5.0,0.092303,0.145733
5,6.0,0.097442,0.175328
6,7.0,0.101204,0.201691
7,8.0,0.10469,0.229032
8,9.0,0.107656,0.255241
9,10.0,0.110829,0.285443


# Hybrid model

In [40]:
# def hybrid(username):
#     # เตรียมช้อมูล
#     # เลือกเฉพาะ Columns ที่จะใช้
#     df_subset = df.loc[:, ['username', 'rating', 'title', 'spotId']]
#     result = travel.loc[:, ['spotId', 'title']]
  
#     # Content-based result
#     # หา Dict ของสถานที่ที่ username นี้ไปใน Testing set
#     user_rated = get_average(username)
#     # ทำเป็น Series
#     my_ratings = pd.Series(user_rated)
#     my_ratings.index
#     # หาผลลัพธ์ของ Content-based output = dataFrame (spotId, title, score)
#     candidate_cb = content_based(my_ratings)
    
#     # Weighted โมเดลทั้งผลของ svd และ item-based collaborative filtering
#     result['est'] = result['spotId'].apply(lambda x: 0.5*knnbaseline_algo2.predict(username, x).est +
#                                             0.5*svd_algo.predict(username, x).est)
#     # เอาผลลัพธ์ของ Content-based มารวมกับ Collaborative filtering
#     result = result.merge(candidate_cb, on = 'spotId')
# #     print(result.columns)
#     result = result.drop('title_y', axis =1).rename(columns={"title_x": "title"})
    
#     # Weight โมเดลรวมผลของ Content-based เป็น Final score
#     result['Final score'] = (result['est']) * (result['score']**2)
#     result['model'] = 'CB + SVD + CF'
#     recommend_list = result[['spotId','title', 'Final score', 'model']].sort_values(by = 'Final score', ascending = False)
    
#     return recommend_list

In [135]:
def hybrid(username):
    """Rank attractions for one user with a weighted CF + content-based score.

    The collaborative-filtering estimate blends three fitted models
    (0.2 item-based KNN + 0.5 user-based KNN + 0.3 SVD). The final score adds
    8x the content-based score to that estimate.

    Relies on notebook globals: ``travel``, ``get_average``, ``content_based``,
    ``knnbaseline_algo``, ``knnbaseline_algo2`` and ``svd_algo``.

    Args:
        username: user id passed to ``get_average`` and to each model's
            ``predict``.

    Returns:
        DataFrame with columns ``spotId``, ``title``, ``Final score`` and
        ``model``, sorted by ``Final score`` in descending order.
    """
    # Explicit copy so adding the 'est' column can never write into `travel`.
    result = travel.loc[:, ['spotId', 'title']].copy()

    # Content-based scores seeded with the user's averaged ratings.
    # NOTE(review): get_average() reads the *test* file, and MAP_MAR_Hybrid
    # scores predictions against those same titles - check for leakage.
    my_ratings = pd.Series(get_average(username))
    # Per the original comment, content_based returns (spotId, title, score).
    candidate_cb = content_based(my_ratings)

    def blended_estimate(spot_id):
        # Weighted blend of item-based KNN, user-based KNN and SVD estimates.
        return (0.2 * knnbaseline_algo.predict(username, spot_id).est
                + 0.5 * knnbaseline_algo2.predict(username, spot_id).est
                + 0.3 * svd_algo.predict(username, spot_id).est)

    result['est'] = result['spotId'].apply(blended_estimate)

    # Attach the content-based score; the merge duplicates 'title', so keep the
    # left-hand copy under its original name.
    result = result.merge(candidate_cb, on='spotId')
    result = result.drop('title_y', axis=1).rename(columns={"title_x": "title"})

    # Final score: CF estimate plus the content-based score weighted by 8.
    result['Final score'] = result['est'] + (result['score'] * 8)
    result['model'] = 'UCF + ICF + SVD + CF'
    recommend_list = result[['spotId', 'title', 'Final score', 'model']].sort_values(
        by='Final score', ascending=False)

    return recommend_list

In [15]:
def hybrid2(username):
    """Cascade hybrid (CB -> CF): re-rank the first 20 content-based rows by CF.

    Takes the first 20 rows returned by ``content_based`` and orders them by
    the mean of the item-based KNN and SVD estimates.

    Relies on notebook globals: ``get_average``, ``content_based``,
    ``knnbaseline_algo`` and ``svd_algo``.

    Args:
        username: user id passed to ``get_average`` and to each model's
            ``predict``.

    Returns:
        DataFrame with columns ``spotId``, ``title``, ``est`` and ``model``,
        sorted by ``est`` in descending order.
    """
    # Content-based scores seeded with the user's averaged ratings.
    # NOTE(review): get_average() reads the *test* file, and MAP_MAR_Hybrid
    # scores predictions against those same titles - check for leakage.
    my_ratings = pd.Series(get_average(username))
    # Per the original comment, content_based returns (spotId, title, score).
    candidate_cb = content_based(my_ratings)

    # Copy the slice: assigning a column to `candidate_cb[:20]` directly is what
    # raised the SettingWithCopyWarning shown in this notebook's output.
    # presumably content_based returns rows best-first - verify.
    result = candidate_cb[:20].copy()

    # Equal-weight blend of item-based KNN and SVD estimates.
    result['est'] = result['spotId'].apply(
        lambda x: 0.5 * knnbaseline_algo.predict(username, x).est
        + 0.5 * svd_algo.predict(username, x).est)

    # NOTE(review): this self-merge adds no new information (it only duplicates
    # title/score and resets the index); kept so the returned frame's index
    # matches earlier runs.
    result = result.merge(candidate_cb, on='spotId')
    result = result.drop('title_y', axis=1).rename(columns={"title_x": "title"})

    result['model'] = 'CB -> CF'
    recommend_list = result[['spotId', 'title', 'est', 'model']].sort_values(
        by='est', ascending=False)

    return recommend_list

In [16]:
# Meta-level [SVD + CF -> CB]
def hybrid3(username):
    """Cascade hybrid (CF -> CB): re-rank the top 30 CF picks by content score.

    Scores every attraction with the mean of the item-based KNN and SVD
    estimates, keeps the 30 highest, then orders those by the content-based
    score.

    Relies on notebook globals: ``travel``, ``get_average``, ``content_based``,
    ``knnbaseline_algo`` and ``svd_algo``.

    Args:
        username: user id passed to ``get_average`` and to each model's
            ``predict``.

    Returns:
        DataFrame with columns ``spotId``, ``title``, ``score`` and ``model``,
        sorted by ``score`` in descending order.
    """
    # Explicit copy so adding the 'est' column can never write into `travel`.
    result = travel.loc[:, ['spotId', 'title']].copy()

    # Equal-weight blend of item-based KNN and SVD estimates.
    result['est'] = result['spotId'].apply(
        lambda x: 0.5 * knnbaseline_algo.predict(username, x).est
        + 0.5 * svd_algo.predict(username, x).est)
    # Keep only the 30 attractions with the highest CF estimate.
    result = result.sort_values(by='est', ascending=False)[:30]

    # Content-based scores seeded with the user's averaged ratings.
    # NOTE(review): get_average() reads the *test* file, and MAP_MAR_Hybrid
    # scores predictions against those same titles - check for leakage.
    my_ratings = pd.Series(get_average(username))
    # Per the original comment, content_based returns (spotId, title, score).
    candidate_cb = content_based(my_ratings)

    # Attach the content-based score; keep the left-hand 'title' column.
    result = result.merge(candidate_cb, on='spotId')
    result = result.drop('title_y', axis=1).rename(columns={"title_x": "title"})

    result['model'] = 'CF -> CB'
    recommend_list = result[['spotId', 'title', 'score', 'model']].sort_values(
        by='score', ascending=False)

    return recommend_list

In [17]:
def hybrid4(username):
    """Rank attractions with a multiplicative CF x content-based score.

    The collaborative-filtering estimate blends three fitted models
    (0.2 item-based KNN + 0.5 user-based KNN + 0.3 SVD). The final score
    multiplies that estimate by the squared content-based score.

    Relies on notebook globals: ``travel``, ``get_average``, ``content_based``,
    ``knnbaseline_algo``, ``knnbaseline_algo2`` and ``svd_algo``.

    Args:
        username: user id passed to ``get_average`` and to each model's
            ``predict``.

    Returns:
        DataFrame with columns ``spotId``, ``title``, ``Final score`` and
        ``model``, sorted by ``Final score`` in descending order.
    """
    # Explicit copy so adding the 'est' column can never write into `travel`.
    result = travel.loc[:, ['spotId', 'title']].copy()

    # Content-based scores seeded with the user's averaged ratings.
    # NOTE(review): get_average() reads the *test* file, and MAP_MAR_Hybrid
    # scores predictions against those same titles - check for leakage.
    my_ratings = pd.Series(get_average(username))
    # Per the original comment, content_based returns (spotId, title, score).
    candidate_cb = content_based(my_ratings)

    def blended_estimate(spot_id):
        # Weighted blend of item-based KNN, user-based KNN and SVD estimates.
        return (0.2 * knnbaseline_algo.predict(username, spot_id).est
                + 0.5 * knnbaseline_algo2.predict(username, spot_id).est
                + 0.3 * svd_algo.predict(username, spot_id).est)

    result['est'] = result['spotId'].apply(blended_estimate)

    # Attach the content-based score; keep the left-hand 'title' column.
    result = result.merge(candidate_cb, on='spotId')
    result = result.drop('title_y', axis=1).rename(columns={"title_x": "title"})

    # Final score: CF estimate scaled by the squared content-based score.
    result['Final score'] = result['est'] * (result['score'] ** 2)
    result['model'] = 'UCF + ICF + SVD + CF'
    recommend_list = result[['spotId', 'title', 'Final score', 'model']].sort_values(
        by='Final score', ascending=False)

    return recommend_list

In [18]:
# For hybrid models.
# Test ratings; only the 'username' column is used by MAP_MAR_Hybrid below.
user_df = pd.read_csv('test_process_over2.csv')
def MAP_MAR_Hybrid(model, k):
    """Compute MAP@n and MAR@n for n = 1..k over every user in ``user_df``.

    Relies on notebook globals: ``user_df``, ``get_average``, ``mapk`` and
    ``mark``.

    Args:
        model: callable returning a DataFrame with a ``title`` column. It is
            first called with the username; if that raises, it is called with
            a ``pd.Series`` of the user's averaged ratings instead.
        k: largest cut-off to evaluate.

    Returns:
        DataFrame with columns ``@k``, ``Mean average precision`` and
        ``Mean average recall``, one row per cut-off.
    """
    all_user_rated = []
    all_predicted = []

    # Per the original comment, the test file is a 20% split restricted to
    # users with more than 2 ratings.
    for name in user_df['username'].unique():
        # Averaged ratings for the places this user rated in the test file.
        user_rated = get_average(name)
        try:
            predicted = model(name)['title'].tolist()
        except Exception:
            # Some models take the ratings Series rather than the username.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            predicted = model(pd.Series(user_rated))['title'].tolist()

        # Append both lists only after a prediction succeeded so they always
        # stay aligned (the original could append the actuals twice).
        all_user_rated.append(list(user_rated.keys()))
        all_predicted.append(predicted)

    # Collect rows and build the frame once; DataFrame.append no longer exists
    # in pandas 2.x.
    rows = []
    for num in range(1, k + 1):
        MAP = mapk(all_user_rated, all_predicted, num)
        MAR = mark(all_user_rated, all_predicted, num)
        # float(num) keeps '@k' displayed as 1.0, 2.0, ... as before.
        rows.append([float(num), MAP, MAR])

    result = pd.DataFrame(
        rows, columns=['@k', 'Mean average precision', 'Mean average recall'])

    return result

In [136]:
# Display the first 20 recommendations of the additive hybrid for 'zamanwho'.
hybrid('zamanwho')[:20]

Unnamed: 0,spotId,title,Final score,model
29,30,Central World,16.554689,UCF + ICF + SVD + CF
48,49,IconSiam,16.404023,UCF + ICF + SVD + CF
12,13,Terminal 21,16.27023,UCF + ICF + SVD + CF
20,21,Siam Paragon,16.255704,UCF + ICF + SVD + CF
22,23,Platinum Fashion Mall,16.209526,UCF + ICF + SVD + CF
44,45,MBK Center (Ma Boon Khrong Center),16.192384,UCF + ICF + SVD + CF
45,46,Pantip Plaza,15.857728,UCF + ICF + SVD + CF
47,48,Emporium Bangkok and EmQuartier Bangkok,15.467886,UCF + ICF + SVD + CF
31,32,King Power Mahanakhon,14.252636,UCF + ICF + SVD + CF
38,39,Flow House Bangkok,13.694751,UCF + ICF + SVD + CF


In [42]:
# Display the first 20 recommendations of the CB -> CF cascade for 'zamanwho'.
hybrid2('zamanwho')[:20]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,spotId,title,est,model
18,41,Museum of Contemporary Art (MOCA),4.343319,CB -> CF
8,32,King Power Mahanakhon,4.292828,CB -> CF
0,49,IconSiam,4.220425,CB -> CF
6,30,Central World,4.193304,CB -> CF
16,27,Art in Paradise Bangkok,4.176644,CB -> CF
1,21,Siam Paragon,4.097229,CB -> CF
5,13,Terminal 21,4.072833,CB -> CF
9,39,Flow House Bangkok,4.067313,CB -> CF
14,12,Lumpini park,4.012135,CB -> CF
4,23,Platinum Fashion Mall,3.956256,CB -> CF


In [43]:
# Display every recommendation of the CF -> CB cascade for 'zamanwho'.
hybrid3('zamanwho')

Unnamed: 0,spotId,title,score,model
29,45,MBK Center (Ma Boon Khrong Center),1.531126,CF -> CB
11,21,Siam Paragon,1.531126,CF -> CB
3,49,IconSiam,1.531126,CF -> CB
4,30,Central World,1.531126,CF -> CB
26,23,Platinum Fashion Mall,1.531126,CF -> CB
14,13,Terminal 21,1.531126,CF -> CB
28,48,Emporium Bangkok and EmQuartier Bangkok,1.469344,CF -> CB
1,32,King Power Mahanakhon,1.243723,CF -> CB
15,39,Flow House Bangkok,1.180739,CF -> CB
24,12,Lumpini park,0.313786,CF -> CB


In [44]:
# Display the first 20 recommendations of the multiplicative hybrid for 'zamanwho'.
hybrid4('zamanwho')[:20]

Unnamed: 0,spotId,title,Final score,model
29,30,Central World,10.094006,UCF + ICF + SVD + CF
48,49,IconSiam,9.740793,UCF + ICF + SVD + CF
12,13,Terminal 21,9.427136,UCF + ICF + SVD + CF
20,21,Siam Paragon,9.393082,UCF + ICF + SVD + CF
22,23,Platinum Fashion Mall,9.284826,UCF + ICF + SVD + CF
44,45,MBK Center (Ma Boon Khrong Center),9.244638,UCF + ICF + SVD + CF
45,46,Pantip Plaza,8.460088,UCF + ICF + SVD + CF
47,48,Emporium Bangkok and EmQuartier Bangkok,8.016552,UCF + ICF + SVD + CF
31,32,King Power Mahanakhon,6.655853,UCF + ICF + SVD + CF
38,39,Flow House Bangkok,5.923496,UCF + ICF + SVD + CF


In [47]:
# Results for user 'zamanwho'
# user_rated: titles this user rated in the test file (keys of get_average()).
user_rated = list(get_average('zamanwho').keys())
# candidate_hybrid: titles ranked by hybrid()'s final score.
candidate_hybrid = hybrid('zamanwho')['title'].tolist()

# Average precision / recall at k = 1..10 for this single user.
find_apk(user_rated, candidate_hybrid, 10)

Unnamed: 0,@k,Average precision,Average recall
0,1.0,1.0,0.5
1,2.0,1.0,0.5
2,3.0,1.0,0.5
3,4.0,1.0,0.5
4,5.0,0.7,0.75
5,6.0,0.7,0.75
6,7.0,0.7,0.75
7,8.0,0.7,0.75
8,9.0,0.7,0.75
9,10.0,0.7,0.75


In [137]:
# Overall results (tested on data from 560 users who rated more than 10 times) - new run.
# Author's tuning note, meaning unclear: "CB 0.6, CF 0.4 is best (svd) 0.5 0.5 0.7 0.3".
# Author's note: "use user-based". hybrid() actually blends item-based KNN,
# user-based KNN and SVD (weights 0.2 / 0.5 / 0.3).
MAP_MAR_Hybrid(hybrid, 10)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.59104,0.557178
1,2.0,0.676596,0.723852
2,3.0,0.704489,0.807396
3,4.0,0.718082,0.865052
4,5.0,0.72276,0.895085
5,6.0,0.726045,0.918824
6,7.0,0.730575,0.95814
7,8.0,0.728837,0.963616
8,9.0,0.728375,0.965219
9,10.0,0.728258,0.965856


In [99]:
# CB -> CF cascade: MAP / MAR at k = 1..10 over all test users.
MAP_MAR_Hybrid(hybrid2, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.096986,0.08938
1,2.0,0.115114,0.120606
2,3.0,0.131403,0.164326
3,4.0,0.145922,0.21479
4,5.0,0.158792,0.27319
5,6.0,0.169193,0.331521
6,7.0,0.178593,0.394078
7,8.0,0.186403,0.455405
8,9.0,0.193294,0.519552
9,10.0,0.19983,0.58826


In [100]:
# Hybrid CF -> CB cascade: MAP / MAR at k = 1..10 over all test users.
MAP_MAR_Hybrid(hybrid3, 10)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.450641,0.41639
1,2.0,0.48551,0.482575
2,3.0,0.506389,0.541753
3,4.0,0.519282,0.592652
4,5.0,0.524473,0.623259
5,6.0,0.529683,0.657355
6,7.0,0.533998,0.693329
7,8.0,0.532783,0.69615
8,9.0,0.532581,0.696715
9,10.0,0.532542,0.69689


In [101]:
# Author's note: "use item-based". hybrid4() actually blends item-based KNN,
# user-based KNN and SVD (weights 0.2 / 0.5 / 0.3).
MAP_MAR_Hybrid(hybrid4, 10)

# The figures below appear to come from an earlier run with 0.5 / 0.5 weights
# (percent scale); they do not match the table this cell currently displays.
# 0.5 0.5
# Avg. precision @ 3 is 33.85599999999999
# Avg. precision @ 5 is 27.2256
# Avg. precision @ 7 is 22.637714285714278
# Avg. precision @ 9 is 20.36266666666667
# Mean average precision is 26.020495238095233 

# Avg. Recall @ 3 is 54.94675555555555
# Avg. Recall @ 5 is 72.89586031746032
# Avg. Recall @ 7 is 83.8775873015873
# Avg. Recall @ 9 is 95.00730158730157
# Mean average recall is 76.68187619047619 

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.602817,0.56969
1,2.0,0.677693,0.715043
2,3.0,0.710089,0.811125
3,4.0,0.723617,0.868028
4,5.0,0.729048,0.901183
5,6.0,0.732656,0.927789
6,7.0,0.735805,0.958286
7,8.0,0.734381,0.963255
8,9.0,0.733803,0.965164
9,10.0,0.733643,0.965724


# Test Hybrid model weight!

In [57]:
# For hybrid models (weight-tuning variant that evaluates a few cut-offs only).
# Test ratings; only the 'username' column is used by MAP_MAR_Hybrid_test below.
user_df = pd.read_csv('test_process_over2.csv')
def MAP_MAR_Hybrid_test(model, ks=(1, 5, 10)):
    """Compute MAP@k and MAR@k at selected cut-offs over every user in ``user_df``.

    Relies on notebook globals: ``user_df``, ``get_average``, ``mapk`` and
    ``mark``.

    Args:
        model: callable returning a DataFrame with a ``title`` column. It is
            first called with the username; if that raises, it is called with
            a ``pd.Series`` of the user's averaged ratings instead.
        ks: cut-offs to evaluate; defaults to the original hard-coded
            ``1, 5, 10``.

    Returns:
        DataFrame with columns ``@k``, ``Mean average precision`` and
        ``Mean average recall``, one row per cut-off.
    """
    all_user_rated = []
    all_predicted = []

    # Per the original comment, the test file is a 20% split restricted to
    # users with more than 2 ratings.
    for name in user_df['username'].unique():
        # Averaged ratings for the places this user rated in the test file.
        user_rated = get_average(name)
        try:
            predicted = model(name)['title'].tolist()
        except Exception:
            # Some models take the ratings Series rather than the username.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            predicted = model(pd.Series(user_rated))['title'].tolist()

        # Append both lists only after a prediction succeeded so they always
        # stay aligned (the original could append the actuals twice).
        all_user_rated.append(list(user_rated.keys()))
        all_predicted.append(predicted)

    # Collect rows and build the frame once; DataFrame.append no longer exists
    # in pandas 2.x.
    rows = []
    for num in ks:
        MAP = mapk(all_user_rated, all_predicted, num)
        MAR = mark(all_user_rated, all_predicted, num)
        # float(num) keeps '@k' displayed as 1.0, 5.0, ... as before.
        rows.append([float(num), MAP, MAR])

    result = pd.DataFrame(
        rows, columns=['@k', 'Mean average precision', 'Mean average recall'])

    return result

In [131]:
def hybrid5(username):
    """Weight experiment: CF blend (with SVD++) plus 0.5x content-based score.

    The collaborative-filtering estimate blends three fitted models
    (0.2 item-based KNN + 0.5 user-based KNN + 0.3 SVD++). The final score
    adds 0.5x the content-based score to that estimate.

    Relies on notebook globals: ``travel``, ``get_average``, ``content_based``,
    ``knnbaseline_algo``, ``knnbaseline_algo2`` and ``svdpp_algo``.

    Args:
        username: user id passed to ``get_average`` and to each model's
            ``predict``.

    Returns:
        DataFrame with columns ``spotId``, ``title``, ``Final score`` and
        ``model``, sorted by ``Final score`` in descending order.
    """
    # Explicit copy so adding the 'est' column can never write into `travel`.
    result = travel.loc[:, ['spotId', 'title']].copy()

    # Content-based scores seeded with the user's averaged ratings.
    # NOTE(review): get_average() reads the *test* file, and the MAP/MAR
    # helpers score predictions against those same titles - check for leakage.
    my_ratings = pd.Series(get_average(username))
    # Per the original comment, content_based returns (spotId, title, score).
    candidate_cb = content_based(my_ratings)

    def blended_estimate(spot_id):
        # Weighted blend of item-based KNN, user-based KNN and SVD++ estimates.
        return (0.2 * knnbaseline_algo.predict(username, spot_id).est
                + 0.5 * knnbaseline_algo2.predict(username, spot_id).est
                + 0.3 * svdpp_algo.predict(username, spot_id).est)

    result['est'] = result['spotId'].apply(blended_estimate)

    # Attach the content-based score; keep the left-hand 'title' column.
    result = result.merge(candidate_cb, on='spotId')
    result = result.drop('title_y', axis=1).rename(columns={"title_x": "title"})

    # Final score: CF estimate plus the content-based score weighted by 0.5.
    result['Final score'] = result['est'] + (result['score'] * 0.5)
    # NOTE(review): the label says SVD although this variant uses svdpp_algo;
    # left unchanged in case downstream code matches on it.
    result['model'] = 'UCF + ICF + SVD + CF'
    recommend_list = result[['spotId', 'title', 'Final score', 'model']].sort_values(
        by='Final score', ascending=False)

    return recommend_list

In [132]:
# Evaluate hybrid5 (content-based weight 0.5) at k = 1, 5, 10.
MAP_MAR_Hybrid_test(hybrid5)


# The table below appears to be from an earlier run with different weights; it
# does not match the table this cell currently displays.
# @k  	    Mean average precision	Mean average recall
# 0	1.0  	0.590809	             0.556947
# 1	5.0     0.721265	             0.895263
# 2	10.0	0.726477	             0.965856

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.237155,0.21876
1,5.0,0.377301,0.576974
2,10.0,0.394949,0.733388


In [90]:
# MAP_MAR_Hybrid_test(hybrid4)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.590809,0.557082
1,5.0,0.722553,0.895104
2,10.0,0.728094,0.965813


In [133]:
def hybrid6(username):
    """Weight experiment: CF blend (with SVD++) plus 2x content-based score.

    The collaborative-filtering estimate blends three fitted models
    (0.2 item-based KNN + 0.5 user-based KNN + 0.3 SVD++). The final score
    adds 2x the content-based score to that estimate.

    Relies on notebook globals: ``travel``, ``get_average``, ``content_based``,
    ``knnbaseline_algo``, ``knnbaseline_algo2`` and ``svdpp_algo``.

    Args:
        username: user id passed to ``get_average`` and to each model's
            ``predict``.

    Returns:
        DataFrame with columns ``spotId``, ``title``, ``Final score`` and
        ``model``, sorted by ``Final score`` in descending order.
    """
    # Explicit copy so adding the 'est' column can never write into `travel`.
    result = travel.loc[:, ['spotId', 'title']].copy()

    # Content-based scores seeded with the user's averaged ratings.
    # NOTE(review): get_average() reads the *test* file, and the MAP/MAR
    # helpers score predictions against those same titles - check for leakage.
    my_ratings = pd.Series(get_average(username))
    # Per the original comment, content_based returns (spotId, title, score).
    candidate_cb = content_based(my_ratings)

    def blended_estimate(spot_id):
        # Weighted blend of item-based KNN, user-based KNN and SVD++ estimates.
        return (0.2 * knnbaseline_algo.predict(username, spot_id).est
                + 0.5 * knnbaseline_algo2.predict(username, spot_id).est
                + 0.3 * svdpp_algo.predict(username, spot_id).est)

    result['est'] = result['spotId'].apply(blended_estimate)

    # Attach the content-based score; keep the left-hand 'title' column.
    result = result.merge(candidate_cb, on='spotId')
    result = result.drop('title_y', axis=1).rename(columns={"title_x": "title"})

    # Final score: CF estimate plus the content-based score weighted by 2.
    result['Final score'] = result['est'] + (result['score'] * 2)
    # NOTE(review): the label says SVD although this variant uses svdpp_algo;
    # left unchanged in case downstream code matches on it.
    result['model'] = 'UCF + ICF + SVD + CF'
    recommend_list = result[['spotId', 'title', 'Final score', 'model']].sort_values(
        by='Final score', ascending=False)

    return recommend_list

In [134]:
# Evaluate hybrid6 (content-based weight 2) at k = 1, 5, 10.
MAP_MAR_Hybrid_test(hybrid6)

Unnamed: 0,@k,Mean average precision,Mean average recall
0,1.0,0.540931,0.50906
1,5.0,0.68019,0.882412
2,10.0,0.686456,0.96297
