## EDA and Modelling Based on Reviews

In this section, we will investigate and dive deeper into the data based on the hotel reviews and geolocation.

In [1]:
#import required libraries
import pandas as pd
import numpy as np
from langdetect import detect
from sklearn.feature_extraction import stop_words
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import folium
from folium import plugins
import ipywidgets
import geocoder
import geopy
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import reverse_geocode
from sklearn.feature_extraction.text import CountVectorizer
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
#installing stylecloud library
#!pip install stylecloud 
import stylecloud



In [2]:
# Load the hotel review DataFrame from the pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
new_df=pd.read_pickle('../data/review.pkl')

In [3]:
# Preview the first rows to check which columns were loaded.
new_df.head()

Unnamed: 0,hotel_name,negative_review,positive_review,lat_x,lng_x,hotel_address,tags,lat_y,lng_y,location,country,city
0,11 Cadogan Gardens,Thought the prise of drinks at the bar a litt...,We were particularly impressed by the very wa...,51.493616,-0.159235,11 Cadogan Gardens Sloane Square Kensington an...,"[' Leisure trip ', ' Couple ', ' Superior Quee...",51.493616,-0.159235,"[{'country_code': 'GB', 'city': 'Chelsea', 'co...",United Kingdom,Chelsea
1,1K Hotel,Air conditioning in room didn t work and desp...,Location good close to le Marais and 3e arron...,48.863932,2.365874,13 Boulevard Du Temple 3rd arr 75003 Paris France,"[' Leisure trip ', ' Couple ', ' Superior M Do...",48.863932,2.365874,"[{'country_code': 'FR', 'city': 'Paris', 'coun...",France,Paris
2,25hours Hotel beim MuseumsQuartier,Breakfast not included and buffet really expe...,Cool vintage style in the middle of the museu...,48.206474,16.35463,Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,"[' Leisure trip ', ' Solo traveler ', ' Standa...",48.206474,16.35463,"[{'country_code': 'AT', 'city': 'Vienna', 'cou...",Austria,Vienna
3,41,"There wasn t a thing that we didn t like , No...",Its central proximity close to all services a...,51.498147,-0.143649,41 Buckingham Palace Road Westminster Borough ...,"[' Leisure trip ', ' Couple ', ' Executive Kin...",51.498147,-0.143649,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London
4,45 Park Lane Dorchester Collection,More kinds of fruit juice will make the mini ...,Everything here are almost perfect the staffs...,51.506371,-0.151536,45 Park Lane Westminster Borough London W1K 1P...,"[' Leisure trip ', ' Solo traveler ', ' Execut...",51.506371,-0.151536,"[{'country_code': 'GB', 'city': 'West End of L...",United Kingdom,West End of London


In [4]:
# Subset the columns needed for the review analysis.
# .copy() makes hotel_review an independent frame: the following cells add
# columns to it, and assigning to a plain slice of new_df raises pandas'
# SettingWithCopyWarning.
hotel_review=new_df[['hotel_name','positive_review','negative_review','city','lat_x','lng_x']].copy()

In [5]:
# Join the positive and negative review text into one document per hotel.
# The explicit space keeps the last word of the positive text from being fused
# with the first word of the negative text when neither side has padding.
hotel_review['review_text'] = hotel_review['positive_review'].astype(str) + ' ' + hotel_review['negative_review'].astype(str)


In [6]:
# Detect the language of every combined review text with langdetect.
hotel_review['lang']=hotel_review['review_text'].map(detect)

In [7]:
# Count how many review documents were detected for each language.
hotel_review['lang'].value_counts()

en    1474
Name: lang, dtype: int64

In [8]:
# English stop words from scikit-learn's built-in list (not NLTK), kept as a set.
# NOTE(review): newer scikit-learn releases may no longer expose the
# `sklearn.feature_extraction.stop_words` module imported above; confirm
# against the installed version.
stops = set(stop_words.ENGLISH_STOP_WORDS)


## Feature Engineering and Text Visualisation

In the following section, we are interested in the most frequent words (and n-grams) in the reviews, and in how that ranking changes once stop words are removed. The visualisations below are plotted to help understand the data set.

In [9]:
def get_top_n_words(corpus, n=None,y=None):
    """Return the most frequent n-grams in ``corpus``; stop words are kept.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose terms are counted.
    n : int, optional
        Number of ``(term, count)`` pairs to return. ``None`` returns all.
    y : tuple of (int, int), optional
        ``ngram_range`` handed to ``CountVectorizer``. ``None`` falls back to
        unigrams, ``(1, 1)``.

    Returns
    -------
    list of (str, count)
        Terms with their corpus-wide count, sorted by descending count.
    """
    # CountVectorizer cannot work with ngram_range=None, so the default has to
    # be turned into a real range before it is passed on.
    ngram_range = (1, 1) if y is None else y
    vec = CountVectorizer(ngram_range=ngram_range)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total count of every vocabulary term.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


In [10]:
def get_top_n_words_with_stop_words(corpus, n=None,y=None):
    """Return the most frequent n-grams in ``corpus`` after removing stop words.

    Despite its name, this function filters the stop words in the module-level
    ``stops`` collection out of the vocabulary before counting.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose terms are counted.
    n : int, optional
        Number of ``(term, count)`` pairs to return. ``None`` returns all.
    y : tuple of (int, int), optional
        ``ngram_range`` handed to ``CountVectorizer``. ``None`` falls back to
        unigrams, ``(1, 1)``.

    Returns
    -------
    list of (str, count)
        Terms with their corpus-wide count, sorted by descending count.
    """
    # CountVectorizer cannot work with ngram_range=None, so the default has to
    # be turned into a real range before it is passed on.
    ngram_range = (1, 1) if y is None else y
    # A list is accepted for stop_words by every scikit-learn version, whereas
    # newer releases reject a set during parameter validation.
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=list(stops))
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total count of every vocabulary term.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


In [11]:
# Number of whitespace-separated tokens in every combined review text.
hotel_review['word_count'] = hotel_review['review_text'].astype(str).str.split().str.len()
desc_lengths = hotel_review['word_count'].tolist()
print("Number of descriptions:",len(desc_lengths),
      "\nAverage word count", np.average(desc_lengths),
      "\nMinimum word count", min(desc_lengths),
      "\nMaximum word count", max(desc_lengths))


Number of descriptions: 1474 
Average word count 12143.495251017639 
Minimum word count 183 
Maximum word count 172330


In [12]:
# Histogram of the per-hotel word counts of the combined review text.
# NOTE(review): the title says "Hotel Description", but the plotted values are
# the word counts of `review_text`.
hotel_review['word_count'].iplot(
    kind='hist',
    bins = 100,
    linecolor='black',
    xTitle='Word Count',
    yTitle='Count',
    title='Word Count Distribution in Hotel Description')

## Unigram Text Visualisation 

### Before Removing Stopwords

In [13]:
# Top 20 unigrams with stop words still included, shown as a horizontal bar chart.
common_words = get_top_n_words(hotel_review['review_text'], n=20, y=(1, 1))
df1 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df1.groupby('review_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='barh', yTitle='Count', linecolor='black',
           title='Top 20 words in hotel description before removing stop words')
)


### After Removing Stopwords

In [14]:
# Top 20 unigrams once stop words are filtered out, shown as a horizontal bar chart.
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], n=20, y=(1, 1))
df2 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df2.groupby('review_text')['count']
    .sum()
    .sort_values()
    .iplot(kind='barh', yTitle='Count', linecolor='black',
           title='Top 20 words in hotel description after removing stop words')
)


## Bi-gram Text Visualisation
### Before Removing Stopwords

In [15]:
# Top 20 bigrams with stop words still included, most frequent first.
common_words = get_top_n_words(hotel_review['review_text'], n=20, y=(2, 2))
df3 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df3.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(kind='bar', yTitle='Count', linecolor='black',
           title='Top 20 bigrams in hotel description before removing stop words')
)


### After Removing Stopwords

In [16]:
# Top 20 bigrams once stop words are filtered out, most frequent first.
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], n=20, y=(2, 2))
df4 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df4.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(kind='bar', yTitle='Count', linecolor='black',
           title='Top 20 bigrams in hotel description After removing stop words')
)

## Tri-gram Text Visualisation

### Before removing stopwords

In [17]:
# Top 20 trigrams with stop words still included, most frequent first.
common_words = get_top_n_words(hotel_review['review_text'], n=20, y=(3, 3))
df5 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df5.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(kind='bar', yTitle='Count', linecolor='black',
           title='Top 20 trigrams in hotel description before removing stop words')
)


### After removing stopwords

In [18]:

# Top 20 trigrams once stop words are filtered out, most frequent first.
common_words = get_top_n_words_with_stop_words(hotel_review['review_text'], n=20, y=(3, 3))
df6 = pd.DataFrame(common_words, columns=['review_text', 'count'])
(
    df6.groupby('review_text')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(kind='bar', yTitle='Count', linecolor='black',
           title='Top 20 trigrams in hotel description after removing stop words')
)


# Modelling

Based on the visualisations, bi-grams and tri-grams appear to carry more information for the modelling process. They give more context about the reviews than single words, which makes them better suited as features for the recommender.

In [19]:
# Characters that clean_text replaces with a space. Written as a raw string so
# the regex escapes \[ \] \| are not read as (invalid) Python string escapes.
replace_space = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_'.
symbol = re.compile('[^0-9a-z #+_]')
stopwordset = stops

def clean_text(text):
    """Normalise a review text for vectorisation.

    Steps: lowercase, replace the characters matched by ``replace_space`` with
    a space, collapse every run of whitespace to a single space, delete the
    characters matched by ``symbol``, then drop the words found in
    ``stopwordset``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    text = text.lower()
    text = replace_space.sub(' ', text)
    # Turn tabs, newlines and other whitespace into plain spaces first.
    # `symbol` only keeps the space character, so any other whitespace would
    # be deleted and the neighbouring words would be fused together.
    text = ' '.join(text.split())
    text = symbol.sub('', text)
    return ' '.join(word for word in text.split() if word not in stopwordset)
    


In [20]:
# Clean every combined review text and keep the result in a new column.
hotel_review['review_text_clean'] = hotel_review['review_text'].map(clean_text)


The sklearn library offers several ways to compute cosine similarities between documents. Here we use `TfidfVectorizer` from `sklearn.feature_extraction.text` to vectorise the words; it computes TF-IDF weights and, by default, scales every document vector to unit length. Because the vectors have unit length, their dot product already equals their cosine similarity, so we can use `linear_kernel` (a plain dot product) and get the same result as cosine similarity, only faster.

In [21]:
# Transform the cleaned reviews into a sparse TF-IDF matrix of bigrams and trigrams.
# min_df=1 keeps every term, which is what the previous min_df=0 did, but newer
# scikit-learn releases reject an integer min_df below 1. stop_words is passed
# as a list for the same reason: a set fails parameter validation there.
tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=1, stop_words=list(stops))
tfidf_matrix = tf.fit_transform(hotel_review['review_text_clean'])

In [22]:
# Pairwise similarity between all review documents: entry [i, j] is the dot
# product of the TF-IDF vectors of rows i and j of tfidf_matrix.

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


In [23]:
# function to recommend the hotels of a city whose reviews are most similar to a given hotel
def new_recommendations(name,city, cosine_similarities):
    """Rank the hotels in ``city`` by review similarity to the hotel ``name``.

    Reads the module-level ``hotel_review`` DataFrame (columns ``hotel_name``,
    ``city``, ``lat_x``, ``lng_x``). Row/column ``i`` of ``cosine_similarities``
    must describe the ``i``-th row of ``hotel_review``; rows are matched by
    position, so the DataFrame index does not have to be ``0..n-1``.

    Parameters
    ----------
    name : str
        Hotel to compare against. It does not have to be located in ``city``.
    city : str
        Only hotels whose ``city`` value equals this are recommended.
    cosine_similarities : 2-D array-like
        Pairwise similarity matrix of the hotels.

    Returns
    -------
    dict or None
        ``None`` (after printing a message) when no hotel in ``city`` can be
        recommended. Otherwise a dict with keys ``1..k`` (``k <= 10``) in which
        every key maps to the same dict ``{hotel_name: [lat, lng]}``, ordered
        from most to least similar. ``get_hotel_fn_pin`` reads this shape.

    Raises
    ------
    IndexError
        If no hotel called ``name`` exists in ``hotel_review``.
    """
    names = hotel_review['hotel_name'].to_numpy()
    matches = np.flatnonzero(names == name)
    if matches.size == 0:
        raise IndexError(f"No hotel named {name!r} found in hotel_review")
    query_pos = matches[0]

    # Similarity of the reference hotel to every hotel, addressed by position.
    scores = np.asarray(cosine_similarities[query_pos]).ravel()
    in_city = (hotel_review['city'] == city).to_numpy()
    latitudes = hotel_review['lat_x'].to_numpy()
    longitudes = hotel_review['lng_x'].to_numpy()

    # Walk the hotels from most to least similar and keep the first ten that
    # are in the requested city. The reference hotel itself is skipped: it is
    # always its own best match and is not a useful recommendation.
    output = {}
    for pos in np.argsort(-scores, kind='stable'):
        hotel = names[pos]
        if not in_city[pos] or hotel == name or hotel in output:
            continue
        output[hotel] = [float(latitudes[pos]), float(longitudes[pos])]
        if len(output) == 10:
            break

    if not output:
        print("There are no hotels of similar hotel")
        return None

    # Every rank maps to the full ordered dict; get_hotel_fn_pin picks the
    # rank-th entry from it.
    return {rank: output for rank in range(1, len(output) + 1)}

In [24]:
#function to generate a folium map of a city with the recommended hotels pinned
def get_hotel_fn_pin(mydict,city):
    """Build a folium map of ``city`` with one numbered marker per hotel.

    Parameters
    ----------
    mydict : dict or None
        Mapping in the shape returned by ``new_recommendations``: keys
        ``1..k``, each value a dict ``{hotel_name: [lat, lng]}`` whose ``i``-th
        item is the hotel ranked ``i``. ``None`` or an empty dict gives a map
        without markers.
    city : str
        Place name geocoded with ``geocoder.osm`` to centre the map.

    Returns
    -------
    folium.Map
        Map centred on the city, or on the mean hotel position when the city
        cannot be geocoded.

    Raises
    ------
    ValueError
        If the city cannot be geocoded and there are no hotel coordinates to
        fall back on.
    """
    # new_recommendations returns None when it has nothing to recommend.
    ranked = mydict or {}

    # (rank, hotel name, [lat, lng]) for every marker; the rank-th item of the
    # dict stored under `rank` is the hotel with that rank.
    pins = []
    for rank in range(1, len(ranked) + 1):
        hotel, coords = list(ranked[rank].items())[rank - 1]
        pins.append((rank, hotel, coords))

    loc2 = geocoder.osm(city)
    center = [loc2.lat, loc2.lng]
    if None in center:
        # Geocoding failed: centre on the hotels instead.
        if not pins:
            raise ValueError(f"Could not geocode {city!r} and there are no hotels to centre the map on")
        center = [sum(coords[0] for _, _, coords in pins) / len(pins),
                  sum(coords[1] for _, _, coords in pins) / len(pins)]

    # map
    main_map = folium.Map(location=center, zoom_start=13)
    folium.raster_layers.TileLayer('Open Street Map').add_to(main_map)

    for rank, hotel, coords in pins:
        marker_icon = plugins.BeautifyIcon(number=rank,
                                           icon='bus',
                                           border_color='blue',
                                           border_width=0.5,
                                           text_color='red',
                                           inner_icon_style='margin-top:0px;')
        folium.Marker(location=coords, tooltip=hotel, popup=hotel,
                      icon=marker_icon).add_to(main_map)

    return main_map


In [25]:
# Save the map of recommended Paris hotels as an HTML file in the image folder.
output_dir = '../image'
# Create the folder first so that saving does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)
get_hotel_fn_pin(new_recommendations('The Belgrave Hotel','Paris',cosine_similarities),'Paris').save(os.path.join(output_dir, 'reviews.html'))


In [26]:
# Build and display the map with the recommended Vienna hotels pinned on it.
get_hotel_fn_pin(new_recommendations('The Belgrave Hotel','Vienna',cosine_similarities),'Vienna')