In [4]:

# In data analysis, cosine similarity is a measure of similarity between two non-zero vectors defined in an inner 
# product space. Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product 
# of the vectors divided by the product of their lengths. It follows that the cosine similarity does not depend 
# on the magnitudes of the vectors, but only on their angle. The cosine similarity always belongs to the 
# interval [−1 , 1].

# In this experiment, we will clean up names of hotels, feed these records into a model, train the model to
# find hotels with similar names, as well as dissimilar names, and finally enter the nams of a hotel and see 
# how similar/dissimilar it is, compared to the top 5 other hotel names.

import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random
import re, nltk, spacy, gensim
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import numpy as np

pd.set_option('display.max_columns', 50)


df = pd.read_csv('C:\\seattle_hotels.csv', encoding = "ISO-8859-1")
df.head()


  "class": algorithms.Blowfish,


Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue,?Seattle,?Washington?98101?USA",Situated amid incredible shopping and iconic a...


In [5]:

# Clean text befor feeding into model 
df['word_count'] = df['name'].apply(lambda x: len(str(x).split()))
desc_lengths = list(df['word_count'])
print("Number of descriptions:",len(desc_lengths),
      "\nAverage word count", np.average(desc_lengths),
      "\nMinimum word count", min(desc_lengths),
      "\nMaximum word count", max(desc_lengths))


REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

df['desc_clean'] = df['name'].apply(clean_text)
df.head()


Number of descriptions: 152 
Average word count 4.467105263157895 
Minimum word count 2 
Maximum word count 10


  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')


Unnamed: 0,name,address,desc,word_count,desc_clean
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the...",4,hilton garden seattle downtown
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat...",3,sheraton grand seattle
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ...",4,crowne plaza seattle downtown
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...,4,kimpton hotel monaco seattle
4,The Westin Seattle,"1900 5th Avenue,?Seattle,?Washington?98101?USA",Situated amid incredible shopping and iconic a...,3,westin seattle


In [6]:

# find top 5 most similar hotel names...
df.set_index('name', inplace = True)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['desc_clean'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df.index)

def most_similar(name, cosine_similarities = cosine_similarities):
    recommended_hotels = []
    # gettin the index of the hotel that matches the name
    idx = indices[indices == name].index[0]
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)
    # getting the indexes of the 10 most similar hotels except itself
    top_5_indexes = list(score_series.iloc[1:6].index)
    # populating the list with the names of the top 10 matching hotels
    for i in top_5_indexes:
        recommended_hotels.append(list(df.index)[i])
    return recommended_hotels

most_similar('Hyatt At Olive 8')


['Hyatt House Seattle',
 'Hyatt Regency Seattle',
 'Hyatt Place Seattle',
 'Gand Hyatt Seattle',
 'Hyatt Regency Lake Washington At SeattleS Southport']

In [7]:

# find top 5 least similar hotel names...
def least_similar(name, cosine_similarities = cosine_similarities):
    recommended_hotels = []
    # gettin the index of the hotel that matches the name
    idx = indices[indices == name].index[0]
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = True)
    # getting the indexes of the 10 most similar hotels except itself
    top_5_indexes = list(score_series.iloc[1:6].index)
    # populating the list with the names of the top 10 matching hotels
    for i in top_5_indexes:
        recommended_hotels.append(list(df.index)[i])
    return recommended_hotels

least_similar('Hyatt At Olive 8')


['Emerald Motel',
 'Stay Alfred on 4th Avenue',
 'Silver Cloud Hotel - Seattle Stadium',
 'Renaissance Seattle Hotel',
 'The Baroness Hotel']

In [None]:

# data:
# https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Seattle_Hotels.csv
