# Sistem Rekomendasi

In [1]:
import pandas as pd

In [10]:
# Load the hotel catalogue from the Excel workbook next to this notebook.
df = pd.read_excel(io='data_hotel.xlsx')

In [11]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,hotel_id,hotel_name,hotel_description,hotel_province,hotel_city,hotel_address,price_per_night
0,h0001,Midtown Residence Marvell City Surabaya,,Jawa Timur,Surabaya,Jalan Ngagel Raya No 123,527866.666667
1,h0002,favehotel Graha Agung Surabaya,Sebuah Hotel Mewah di Surabaya Persembahan dar...,Jawa Timur,Surabaya,Jl. Mayjen Yono Soewoyo Pakuwon Indah Square A...,442860.0
2,h0003,The Sun Hotel Sidoarjo,Hotel Bintang 3 Pertama dan Satu-satunya di Si...,Jawa Timur,Surabaya,"Jl. Pahlawan No.1,Sidokumpul, Sidoarjo, Suraba...",305000.0
3,h0004,Grand Surabaya Hotel,Penginapan Yang Tenang Dan Nyaman Di Surabaya.,Jawa Timur,Surabaya,"Jl. Pemuda 19-21, Surabaya, Indonesia",324999.333333
4,h0005,The WIN Hotel Surabaya,WIN Hotel adalah hotel smart bintang 3 yang me...,Jawa Timur,Surabaya,"Jl. Embong Tanjung 46 - 48 Surabaya, Jawa Timu...",310947.25


In [4]:
# Report the (rows, columns) dimensions of df.
df.shape

(1199, 7)

In [12]:
# Count the missing values in each column.
df.isna().sum()

hotel_id               0
hotel_name             0
hotel_description    101
hotel_province         0
hotel_city             0
hotel_address          0
price_per_night        6
dtype: int64

In [13]:
# Keep only hotels that have a description, since the similarity model is
# built from that text. .copy() makes the result an independent frame so the
# column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['hotel_description'].notnull()].copy()

In [14]:
# Re-check the dimensions after rows without a description were filtered out.
df.shape

(1098, 7)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import re
import random

In [16]:
# Raw strings keep the regex backslashes literal; the previous non-raw literal
# relied on invalid string escapes (\[ \] \|), which emit a warning on modern
# Python. The compiled patterns are unchanged.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')   # punctuation that clean_text replaces with a space
clean_symbol = re.compile(r'[^0-9a-z #+_]')    # every other character outside this whitelist is deleted

# Stop-word list and stemmer provided by Sastrawi.
sastrawi = StopWordRemoverFactory()
stopword = sastrawi.get_stop_words()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [17]:
def clean_text(text):
    """Normalize a description into a space-separated string of tokens.

    Steps, in order: lowercase the text, replace characters matched by
    ``clean_spcl`` with a space, delete characters matched by
    ``clean_symbol``, stem the result with the module-level ``stemmer``,
    and drop every token found in ``stopword``.

    Args:
        text: The raw description string.

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    lowered = text.lower()
    spaced = clean_spcl.sub(' ', lowered)
    stripped = clean_symbol.sub('', spaced)
    stemmed = stemmer.stem(stripped)

    kept_tokens = [token for token in stemmed.split() if token not in stopword]
    return ' '.join(kept_tokens)

In [18]:
# Run every description through clean_text and store the result as 'desc'.
df['desc'] = df['hotel_description'].map(clean_text)

In [19]:
# Show the first five rows to verify the new cleaned 'desc' column.
df.head(n=5)

Unnamed: 0,hotel_id,hotel_name,hotel_description,hotel_province,hotel_city,hotel_address,price_per_night,desc
1,h0002,favehotel Graha Agung Surabaya,Sebuah Hotel Mewah di Surabaya Persembahan dar...,Jawa Timur,Surabaya,Jl. Mayjen Yono Soewoyo Pakuwon Indah Square A...,442860.0,buah hotel mewah surabaya sembah archipelago i...
2,h0003,The Sun Hotel Sidoarjo,Hotel Bintang 3 Pertama dan Satu-satunya di Si...,Jawa Timur,Surabaya,"Jl. Pahlawan No.1,Sidokumpul, Sidoarjo, Suraba...",305000.0,hotel bintang 3 pertama satusatunya sidoarjo
3,h0004,Grand Surabaya Hotel,Penginapan Yang Tenang Dan Nyaman Di Surabaya.,Jawa Timur,Surabaya,"Jl. Pemuda 19-21, Surabaya, Indonesia",324999.333333,inap tenang nyaman surabaya
4,h0005,The WIN Hotel Surabaya,WIN Hotel adalah hotel smart bintang 3 yang me...,Jawa Timur,Surabaya,"Jl. Embong Tanjung 46 - 48 Surabaya, Jawa Timu...",310947.25,win hotel hotel smart bintang 3 tawar layan tu...
5,h0006,POP! Hotel Stasiun Kota Surabaya,Sebuah Hotel Modern yang Terjangkau di Jawa Timur,Jawa Timur,Surabaya,"Jl. Waspada No.58, Surabaya, Jawa Timur, Indon...",276333.333333,buah hotel modern jangkau jawa timur


In [20]:
# Use the hotel name as the row index so hotels can be looked up by name.
df = df.set_index('hotel_name')

In [21]:
# Confirm that hotel_name is now the index.
df.head(n=5)

Unnamed: 0_level_0,hotel_id,hotel_description,hotel_province,hotel_city,hotel_address,price_per_night,desc
hotel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
favehotel Graha Agung Surabaya,h0002,Sebuah Hotel Mewah di Surabaya Persembahan dar...,Jawa Timur,Surabaya,Jl. Mayjen Yono Soewoyo Pakuwon Indah Square A...,442860.0,buah hotel mewah surabaya sembah archipelago i...
The Sun Hotel Sidoarjo,h0003,Hotel Bintang 3 Pertama dan Satu-satunya di Si...,Jawa Timur,Surabaya,"Jl. Pahlawan No.1,Sidokumpul, Sidoarjo, Suraba...",305000.0,hotel bintang 3 pertama satusatunya sidoarjo
Grand Surabaya Hotel,h0004,Penginapan Yang Tenang Dan Nyaman Di Surabaya.,Jawa Timur,Surabaya,"Jl. Pemuda 19-21, Surabaya, Indonesia",324999.333333,inap tenang nyaman surabaya
The WIN Hotel Surabaya,h0005,WIN Hotel adalah hotel smart bintang 3 yang me...,Jawa Timur,Surabaya,"Jl. Embong Tanjung 46 - 48 Surabaya, Jawa Timu...",310947.25,win hotel hotel smart bintang 3 tawar layan tu...
POP! Hotel Stasiun Kota Surabaya,h0006,Sebuah Hotel Modern yang Terjangkau di Jawa Timur,Jawa Timur,Surabaya,"Jl. Waspada No.58, Surabaya, Jawa Timur, Indon...",276333.333333,buah hotel modern jangkau jawa timur


In [23]:
# Vectorize the cleaned descriptions with TF-IDF over word 1- to 3-grams.
# min_df=1 (keep every term seen in at least one document) selects the same
# vocabulary as the former min_df=0, which recent scikit-learn releases reject
# during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1)
tfidf = tf.fit_transform(df['desc'])

# Pairwise cosine similarity between all hotels (Y defaults to X).
cos_sim = cosine_similarity(tfidf)

In [24]:
# Positional lookup: integer row position -> hotel name, in the same order as
# the rows of the similarity matrix.
indices = df.index.to_series().reset_index(drop=True)

In [25]:
# Inspect the position -> hotel-name lookup Series.
indices

0            favehotel Graha Agung Surabaya
1                    The Sun Hotel Sidoarjo
2                      Grand Surabaya Hotel
3                    The WIN Hotel Surabaya
4          POP! Hotel Stasiun Kota Surabaya
                       ...                 
1093                 Restu Hotel Yogyakarta
1094                  Oryza Hotel Malioboro
1095            Hotel Asri Graha Yogyakarta
1096    Tasneem Convention Hotel Yogyakarta
1097          Pesonna Hotel Tugu Yogyakarta
Name: hotel_name, Length: 1098, dtype: object

In [31]:
def rekomendasi(hotel, top=10):
    """Return the hotels most similar to ``hotel`` according to ``cos_sim``.

    Reads the module-level ``indices`` (position -> hotel name) and
    ``cos_sim`` (pairwise similarity matrix), so it always reflects whichever
    similarity matrix was computed most recently.

    Args:
        hotel: Name of the reference hotel, exactly as stored in ``indices``.
        top: Number of recommendations wanted. ``top + 1`` entries are
            returned because the reference hotel itself (similarity ~1.0)
            is part of the ranking.

    Returns:
        A list of ``"<hotel name> - <score>"`` strings ordered by descending
        similarity.

    Raises:
        IndexError: If ``hotel`` does not occur in ``indices``.
    """
    matches = indices[indices == hotel].index
    if len(matches) == 0:
        # Same exception type as before, but with a message naming the culprit
        # instead of "index 0 is out of bounds".
        raise IndexError(f"hotel {hotel!r} not found in indices")
    idx = matches[0]  # first match wins when a name occurs more than once

    score = pd.Series(cos_sim[idx]).sort_values(ascending=False)
    top_index = score.iloc[:top + 1].index

    # Labels of `score` are row positions, so they map straight onto `indices`
    # without rebuilding list(df.index) on every iteration.
    return [f"{indices.iloc[i]} - {str(score.loc[i])}" for i in top_index]

In [32]:
# Five description-based recommendations for a sample hotel.
rekomendasi(hotel='Gunawangsa Manyar Hotel', top=5)

['Gunawangsa Manyar Hotel - 1.0000000000000002',
 'Midtown Hotel Surabaya - 0.17992312860891296',
 'Aria Gajayana Hotel - 0.14461427726403756',
 'GP Mega Kuningan Hotel - 0.12367390420720603',
 'Le Polonia Medan managed by Topotels - 0.12049561758006845',
 'Swiss-Belinn Manyar Surabaya - 0.11731929279752672']

## Peningkatan Sistem Rekomendasi

In [33]:
# Bucket the nightly price into three equal-width ranges labelled low/medium/high.
df['price_bin'] = pd.cut(
    x=df['price_per_night'],
    bins=3,
    labels=['low', 'medium', 'high'],
)

In [34]:
# Check the newly assigned price_bin labels.
df.head(n=5)

Unnamed: 0_level_0,hotel_id,hotel_description,hotel_province,hotel_city,hotel_address,price_per_night,desc,price_bin
hotel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
favehotel Graha Agung Surabaya,h0002,Sebuah Hotel Mewah di Surabaya Persembahan dar...,Jawa Timur,Surabaya,Jl. Mayjen Yono Soewoyo Pakuwon Indah Square A...,442860.0,buah hotel mewah surabaya sembah archipelago i...,low
The Sun Hotel Sidoarjo,h0003,Hotel Bintang 3 Pertama dan Satu-satunya di Si...,Jawa Timur,Surabaya,"Jl. Pahlawan No.1,Sidokumpul, Sidoarjo, Suraba...",305000.0,hotel bintang 3 pertama satusatunya sidoarjo,low
Grand Surabaya Hotel,h0004,Penginapan Yang Tenang Dan Nyaman Di Surabaya.,Jawa Timur,Surabaya,"Jl. Pemuda 19-21, Surabaya, Indonesia",324999.333333,inap tenang nyaman surabaya,low
The WIN Hotel Surabaya,h0005,WIN Hotel adalah hotel smart bintang 3 yang me...,Jawa Timur,Surabaya,"Jl. Embong Tanjung 46 - 48 Surabaya, Jawa Timu...",310947.25,win hotel hotel smart bintang 3 tawar layan tu...,low
POP! Hotel Stasiun Kota Surabaya,h0006,Sebuah Hotel Modern yang Terjangkau di Jawa Timur,Jawa Timur,Surabaya,"Jl. Waspada No.58, Surabaya, Jawa Timur, Indon...",276333.333333,buah hotel modern jangkau jawa timur,low


In [35]:
# Combine the cleaned description, province, city and price bucket into one
# text document per hotel. Missing values (e.g. a NaN price_bin for a hotel
# without a price) are skipped; a plain ' '.join would raise TypeError on them.
df['metadata'] = df[['desc', 'hotel_province', 'hotel_city', 'price_bin']].apply(
    lambda row: ' '.join(str(value) for value in row if pd.notna(value)),
    axis=1,
)

In [36]:
# Preview the combined text that feeds the metadata-based similarity.
df['metadata']

hotel_name
favehotel Graha Agung Surabaya         buah hotel mewah surabaya sembah archipelago i...
The Sun Hotel Sidoarjo                 hotel bintang 3 pertama satusatunya sidoarjo J...
Grand Surabaya Hotel                   inap tenang nyaman surabaya Jawa Timur Surabay...
The WIN Hotel Surabaya                 win hotel hotel smart bintang 3 tawar layan tu...
POP! Hotel Stasiun Kota Surabaya       buah hotel modern jangkau jawa timur Jawa Timu...
                                                             ...                        
Restu Hotel Yogyakarta                 akomodasi nyaman modern yogyakarta akomodasi t...
Oryza Hotel Malioboro                  ideal accommodation options in the malioboro a...
Hotel Asri Graha Yogyakarta            pilih akomodasi ideal pasang backpacker libur ...
Tasneem Convention Hotel Yogyakarta    pilih akomodasi ideal pasang backpacker libur ...
Pesonna Hotel Tugu Yogyakarta          inap tenang nyaman dekat tugu jogja pesonna tu...
Name: meta

In [37]:
# Refit the same vectorizer on the combined metadata and replace the global
# similarity matrix that rekomendasi() reads.
tfidf = tf.fit_transform(raw_documents=df['metadata'])
cos_sim = cosine_similarity(tfidf)

In [39]:
# Ten recommendations for the same hotel, now using the metadata-based similarity.
rekomendasi(hotel='Gunawangsa Manyar Hotel', top=10)

['Gunawangsa Manyar Hotel - 1.0000000000000002',
 'Midtown Hotel Surabaya - 0.36231764515648446',
 'Deka Hotel - 0.3251940293569784',
 'Grand Surabaya Hotel - 0.3251940293569784',
 'Hersya Front One Inn - 0.3251940293569784',
 'Wyndham Surabaya - 0.3251940293569784',
 'Continent My Tower Hotel, Surabaya - Rungkut - 0.3251940293569784',
 'Namira Syariah Surabaya Hotel - 0.3251940293569784',
 'Narita Classic Hotel Surabaya - 0.3138966605585867',
 'Premier Place Surabaya Airport - 0.3138966605585867',
 'Quest Hotel Darmo - Surabaya by ASTON - 0.3138966605585867']