### Section 1: Loading libraries

In [None]:
# Loading libraries
import pandas as pd

# for tf, df, tf-idf
from sklearn.feature_extraction import text

# for lemmatization
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

# for co-occurrence normalization (Jaccard similarity)
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist, jaccard

# for displaying dataframes inline
from IPython.display import display_html 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Section 2: Loading and cleaning data

In [None]:
# Quarantine Hotel - Travel Safely During COVID-19 Note Section
quarantine_covid = pd.read_csv("../output/COVID_Msg_Quarantine.csv", encoding="utf-8")

# A section counts as present only when the cell is neither missing nor the
# literal text "None". Recent pandas releases list "None" among read_csv's
# default NaN markers, so the placeholder may arrive as NaN instead of a string;
# a bare `!= "None"` comparison would then keep every row, including empty ones.
expect_col = quarantine_covid["travel safe expect"]
note_col = quarantine_covid["travel safe note"]
has_travel_safe = expect_col.notna() & (expect_col != "None")
has_note = note_col.notna() & (note_col != "None")

no_of_hotel = quarantine_covid["hotel name"].count()
print("Number of Hotels: ", no_of_hotel)
no_with_travelSafe = expect_col[has_travel_safe].count()
print("Number of hotels with Travel Safely During COVID-19: ", no_with_travelSafe, "(", round(no_with_travelSafe/no_of_hotel*100), "%)")
no_with_note = note_col[has_note].count()
print("Number of hotels with Travel Safely During COVID-19 Note Section: ", no_with_note, "(", round(no_with_note/no_of_hotel*100), "%)")

# Keep hotels that have a note, index them by hotel name, and keep one row per
# distinct note text (drop_duplicates compares column values only, not the index).
quarantine_covid_note = (
    quarantine_covid[has_note]
    .set_index("hotel name")[["travel safe note"]]
    .drop_duplicates(keep='last')
)
no_unique_note = quarantine_covid_note["travel safe note"].count()
print("Number of Unique Note text: ", no_unique_note, "(", round(no_unique_note/no_of_hotel*100), "%)")

Number of Hotels:  96
Number of hotels with Travel Safely During COVID-19:  61 ( 64 %)
Number of hotels with Travel Safely During COVID-19 Note Section:  39 ( 41 %)
Number of Unique Note text:  29 ( 30 %)


In [None]:
# Non-Quarantine Hotel - Travel Safely During COVID-19 Note Section
non_quarantine_covid = pd.read_csv("../output/COVID_Msg_NonQuarantine.csv", encoding="utf-8")

# A section counts as present only when the cell is neither missing nor the
# literal text "None". Recent pandas releases list "None" among read_csv's
# default NaN markers, so the placeholder may arrive as NaN instead of a string;
# a bare `!= "None"` comparison would then keep every row, including empty ones.
expect_col = non_quarantine_covid["travel safe expect"]
note_col = non_quarantine_covid["travel safe note"]
has_travel_safe = expect_col.notna() & (expect_col != "None")
has_note = note_col.notna() & (note_col != "None")

no_of_hotel = non_quarantine_covid["hotel name"].count()
print("Number of Hotels: ", no_of_hotel)
no_with_travelSafe = expect_col[has_travel_safe].count()
print("Number of hotels with Travel Safely During COVID-19: ", no_with_travelSafe, "(", round(no_with_travelSafe/no_of_hotel*100), "%)")
no_with_note = note_col[has_note].count()
print("Number of hotels with Travel Safely During COVID-19 Note Section: ", no_with_note, "(", round(no_with_note/no_of_hotel*100), "%)")

# Keep hotels that have a note, index them by hotel name, and keep one row per
# distinct note text (drop_duplicates compares column values only, not the index).
non_quarantine_covid_note = (
    non_quarantine_covid[has_note]
    .set_index("hotel name")[["travel safe note"]]
    .drop_duplicates(keep='last')
)
no_unique_note = non_quarantine_covid_note["travel safe note"].count()
print("Number of Unique Note text: ", no_unique_note, "(", round(no_unique_note/no_of_hotel*100), "%)")

Number of Hotels:  96
Number of hotels with Travel Safely During COVID-19:  46 ( 48 %)
Number of hotels with Travel Safely During COVID-19 Note Section:  27 ( 28 %)
Number of Unique Note text:  19 ( 20 %)


In [None]:
# Quarantine Hotel - Customer Review
quarantine_scrapped_data = pd.read_excel("../data/data_Quarantine.xlsx", "data")
# Work on an explicit copy so the column edits below never touch the scraped frame.
quarantine_customer_review = quarantine_scrapped_data.loc[:,["hotel ID","customer review Title","customer review text", "hotel reply yes or no"]].copy()
# Titles are cast to the nullable string dtype before being concatenated with the text.
quarantine_customer_review["customer review Title"] = pd.Series(quarantine_customer_review["customer review Title"], dtype="string")
quarantine_customer_review["customer review"] = quarantine_customer_review["customer review Title"] + " " + quarantine_customer_review["customer review text"]
quarantine_customer_review = quarantine_customer_review.drop(columns=["customer review Title","customer review text"])
# Drop rows whose combined review is missing, as the non-quarantine cell does:
# a missing value is not a valid document for the vectorizers used later.
quarantine_customer_review = quarantine_customer_review[quarantine_customer_review["customer review"].notna()].copy()
quarantine_customer_review["Is Quarantine Hotel"] = 1
no_of_reviews_q = quarantine_customer_review["customer review"].count()
print("Number of Reviews: ",no_of_reviews_q)
non_duplicate_review_q = quarantine_customer_review[["customer review"]].drop_duplicates(keep='last')
print("Number of Unique Review: ",non_duplicate_review_q["customer review"].count())

Number of Reviews:  1658
Number of Unique Review:  1658


In [None]:
# Non-Quarantine Hotel - Customer Review
non_quarantine_scrapped_data = pd.read_excel("../data/data_NonQuarantine.xlsx", "data")
# Work on an explicit copy so the column edits below never touch the scraped frame.
non_quarantine_customer_review = non_quarantine_scrapped_data.loc[:,["hotel ID","customer review Title","customer review text", "hotel reply yes or no"]].copy()
# Titles are cast to the nullable string dtype before being concatenated with the text.
non_quarantine_customer_review["customer review Title"] = pd.Series(non_quarantine_customer_review["customer review Title"], dtype="string")
non_quarantine_customer_review["customer review"] = non_quarantine_customer_review["customer review Title"] + " " + non_quarantine_customer_review["customer review text"]
non_quarantine_customer_review = non_quarantine_customer_review.drop(columns=["customer review Title","customer review text"])
# Keep only rows that have a combined review; .copy() makes the filtered result
# an independent frame before a new column is added to it.
non_quarantine_customer_review = non_quarantine_customer_review[non_quarantine_customer_review["customer review"].notna()].copy()
non_quarantine_customer_review["Is Quarantine Hotel"] = 0
no_of_reviews_nq = non_quarantine_customer_review["customer review"].count()
print("Number of Reviews: ",no_of_reviews_nq)
non_duplicate_review_nq = non_quarantine_customer_review[["customer review"]].drop_duplicates(keep='last')
print("Number of Unique Review: ",non_duplicate_review_nq["customer review"].count())

Number of Reviews:  1568
Number of Unique Review:  1568


In [None]:
# Quarantine Hotel - Hotel Reply
# Keep only rows that carry a reply text; .copy() makes the selection an
# independent frame before the flag column is added to it.
quarantine_hotel_reply = quarantine_scrapped_data.loc[
    quarantine_scrapped_data["hotel reply text"].notna(),
    ["hotel ID", "hotel reply text"],
].copy()
quarantine_hotel_reply["Is Quarantine Hotel"] = 1
no_of_reply = quarantine_hotel_reply["hotel reply text"].count()
# Reply rate is relative to the number of quarantine-hotel reviews counted earlier.
print("Number of Reply: ",no_of_reply, "(", round(no_of_reply/no_of_reviews_q*100) ,"%)")
quarantine_hotel_reply_unique = quarantine_hotel_reply[["hotel reply text"]].drop_duplicates(keep='last')
no_unique_reply = quarantine_hotel_reply_unique["hotel reply text"].count()
print("Number of Unique Reply: ", no_unique_reply, "(", round(no_unique_reply/no_of_reply*100), "%)")

Number of Reply:  1431 ( 86 %)
Number of Unique Reply:  1370 ( 96 %)


In [None]:
# Non-Quarantine Hotel - Hotel Reply
# Keep only rows that carry a reply text; .copy() makes the selection an
# independent frame before the flag column is added to it.
non_quarantine_hotel_reply = non_quarantine_scrapped_data.loc[
    non_quarantine_scrapped_data["hotel reply text"].notna(),
    ["hotel ID", "hotel reply text"],
].copy()
non_quarantine_hotel_reply["Is Quarantine Hotel"] = 0
no_of_reply = non_quarantine_hotel_reply["hotel reply text"].count()
# Reply rate is relative to the number of non-quarantine reviews counted earlier.
print("Number of Reply: ",no_of_reply, "(", round(no_of_reply/no_of_reviews_nq*100) ,"%)")
non_quarantine_hotel_reply_unique = non_quarantine_hotel_reply[["hotel reply text"]].drop_duplicates(keep='last')
no_unique_reply = non_quarantine_hotel_reply_unique["hotel reply text"].count()
print("Number of Unique Reply: ", no_unique_reply, "(", round(no_unique_reply/no_of_reply*100), "%)")

Number of Reply:  1351 ( 86 %)
Number of Unique Reply:  1185 ( 88 %)


### Section 3: Define functions for N-gram Term Frequency, TF-IDF, and Co-occurrence Matrix

In [None]:
# Initiate analyzer for CountVectorizer
# NOTE(review): count_analyzer / tfidf_analyzer are not referenced by any cell
# shown here; kept in case other cells rely on them.
count_analyzer = text.CountVectorizer().build_analyzer()
# Initiate analyzer for TfidfVectorizer
tfidf_analyzer = text.TfidfVectorizer().build_analyzer()

lemmatizer = WordNetLemmatizer()


def _lemmatize_ngram(ngram):
    """Lemmatize every word of one analyzer output item.

    The word analyzer emits n-grams as single space-joined strings
    (e.g. "public areas"). Passing such a string to the lemmatizer as a whole
    leaves it unchanged, so each word is lemmatized separately and the n-gram
    is re-joined. For unigrams this is the same as lemmatizing the token.
    """
    return " ".join(lemmatizer.lemmatize(word) for word in ngram.split(" "))


def _lemmatizing(analyzer):
    """Wrap `analyzer` so that every n-gram it yields is lemmatized word by word."""
    def analyze(doc):
        return [_lemmatize_ngram(ngram) for ngram in analyzer(doc)]
    return analyze


# Set up classes to override the analyzer setting for both vectorizers.
# Stop-word removal and n-gram building are still done by the parent analyzer;
# lemmatization is applied to its output.
class StemmedCountVectorizer(text.CountVectorizer):
    """CountVectorizer whose analyzer output is lemmatized with WordNet."""

    def build_analyzer(self):
        return _lemmatizing(super().build_analyzer())


class StemmedTfidfVectorizer(text.TfidfVectorizer):
    """TfidfVectorizer whose analyzer output is lemmatized with WordNet."""

    def build_analyzer(self):
        return _lemmatizing(super().build_analyzer())

# Set up custom stop words: domain terms that appear in almost every document.
# The vectorizers filter stop words on the raw tokens (before lemmatization),
# which is why plural forms are listed explicitly.
stop_word_to_add = ["hong", "kong", "hotel", "hotels", "quarantine", "quarantines", "stay", "hk", "week", "day", "days", "14", "21"]
# scikit-learn documents `stop_words` as 'english', a list, or None. Releases that
# validate constructor parameters reject the frozenset returned by `.union()`,
# so build a list (sorted to keep it reproducible between runs).
custom_stop_word = sorted(text.ENGLISH_STOP_WORDS.union(stop_word_to_add))

In [None]:
# N-gram term frequency
def term_frequency(onlinetext, maxword, ngramsize):
    """Count how often each n-gram occurs across all documents.

    Parameters
    ----------
    onlinetext : iterable of str
        The documents to analyse.
    maxword : int
        Passed to the vectorizer as ``max_features``.
    ngramsize : tuple of (int, int)
        Passed to the vectorizer as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Indexed by n-gram, with one "Term Frequency" column holding the summed
        counts, sorted in descending order.
    """
    vectorizer = StemmedCountVectorizer(
        stop_words=custom_stop_word,
        max_features=maxword,
        ngram_range=ngramsize,
        strip_accents="ascii",
    )
    doc_term = vectorizer.fit_transform(onlinetext)
    counts = pd.DataFrame(doc_term.toarray(), columns=vectorizer.get_feature_names_out())
    # Column totals = corpus-wide count of each n-gram.
    totals = counts.sum().to_frame(name="Term Frequency")
    return totals.sort_values(by=["Term Frequency"], ascending=False)

In [None]:
# N-gram document frequency (normalized: an n-gram counts once per document,
# however many times it appears in that document)
def document_frequency(onlinetext, maxword, ngramsize):
    """Compute the percentage of documents that contain each n-gram.

    Parameters
    ----------
    onlinetext : iterable of str
        The documents to analyse.
    maxword : int
        Passed to the vectorizer as ``max_features``.
    ngramsize : tuple of (int, int)
        Passed to the vectorizer as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Indexed by n-gram, with one "Document Frequency" column. Rows are sorted
        by the numeric percentage (descending) and the values are then formatted
        as strings with one decimal place.
    """
    vectorizer = StemmedCountVectorizer(
        stop_words=custom_stop_word,
        max_features=maxword,
        ngram_range=ngramsize,
        binary=True,  # presence/absence per document instead of raw counts
        strip_accents="ascii",
    )
    doc_term = vectorizer.fit_transform(onlinetext)
    presence = pd.DataFrame(doc_term.toarray(), columns=vectorizer.get_feature_names_out())
    # Mean of a 0/1 column = share of documents containing the n-gram.
    share = (presence.mean() * 100).to_frame(name="Document Frequency")
    share = share.sort_values(by=["Document Frequency"], ascending=False)
    # Format only after sorting so the ordering is numeric, not lexical.
    share["Document Frequency"] = share["Document Frequency"].map('{:.1f}'.format)
    return share

In [None]:
# N-gram TF-IDF
def tf_idf(onlinetext, maxword, ngramsize):
    """Sum the TF-IDF weight of each n-gram over all documents.

    Parameters
    ----------
    onlinetext : iterable of str
        The documents to analyse.
    maxword : int
        Passed to the vectorizer as ``max_features``.
    ngramsize : tuple of (int, int)
        Passed to the vectorizer as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Indexed by n-gram, with one "TF-IDF" column. Rows are sorted by the
        numeric total (descending) and the values are then formatted as strings
        with two decimal places.
    """
    vectorizer = StemmedTfidfVectorizer(
        stop_words=custom_stop_word,
        max_features=maxword,
        ngram_range=ngramsize,
        strip_accents="ascii",
    )
    doc_term = vectorizer.fit_transform(onlinetext)
    weights = pd.DataFrame(doc_term.toarray(), columns=vectorizer.get_feature_names_out())
    totals = weights.sum().to_frame(name="TF-IDF")
    totals = totals.sort_values(by=["TF-IDF"], ascending=False)
    # Format only after sorting so the ordering is numeric, not lexical.
    totals["TF-IDF"] = totals["TF-IDF"].map('{:.2f}'.format)
    return totals

In [None]:
# Display TF/DF/TF-IDF
def text_results(dataFrame, ngram, maxword=20):
    """Display the TF, DF and TF-IDF tables for one n-gram size in a single output.

    Parameters
    ----------
    dataFrame : iterable of str
        The documents to analyse (the callers pass a text column).
    ngram : int
        N-gram size; used as both bounds of the n-gram range.
    maxword : int, default 20
        Number of n-grams kept in each table (previously hard-coded to 20).
    """
    ngram_range = (ngram, ngram)
    tables = [
        term_frequency(dataFrame, maxword, ngram_range),
        document_frequency(dataFrame, maxword, ngram_range),
        tf_idf(dataFrame, maxword, ngram_range),
    ]
    # Inline display style lets the three tables share one HTML output.
    html = "".join(
        table.style.set_table_attributes("style='display:inline'")._repr_html_()
        for table in tables
    )
    display_html(html, raw=True)

In [None]:
# N-gram co-occurrence matrix, normalized by Jaccard similarity
def ngram_cooccur_matrix(onlinetext, maxword, ngramsize, outputname):
    """Write a term-by-term Jaccard similarity matrix to ``outputname + ".csv"``.

    Each n-gram is represented by its presence/absence across the documents;
    the similarity of two n-grams is 1 minus the Jaccard distance between those
    vectors. Returns nothing; the matrix is only written to disk.

    Parameters
    ----------
    onlinetext : iterable of str
        The documents to analyse.
    maxword : int
        Passed to the vectorizer as ``max_features``.
    ngramsize : tuple of (int, int)
        Passed to the vectorizer as ``ngram_range``.
    outputname : str
        Output path without the ".csv" extension.
    """
    vectorizer = StemmedCountVectorizer(
        stop_words=custom_stop_word,
        max_features=maxword,
        ngram_range=ngramsize,
        binary=True,
        strip_accents="ascii",
    )
    doc_term = vectorizer.fit_transform(onlinetext)
    terms = vectorizer.get_feature_names_out()
    # Transpose so that each row describes one n-gram across all documents.
    term_doc = pd.DataFrame(doc_term.T.toarray())
    pairwise_similarity = 1 - pdist(term_doc, 'jaccard')
    # NOTE: squareform leaves the diagonal at 0, so an n-gram's similarity with
    # itself is written as 0 rather than 1.
    similarity = pd.DataFrame(squareform(pairwise_similarity), index=terms, columns=terms)
    similarity.to_csv(outputname + ".csv", sep=",")

### Section 4: Analysis

##### Section 4.1a: Quarantine Hotel - Travel Safely during COVID-19 Note Section

In [None]:
# Unigram TF / DF / TF-IDF tables for the quarantine-hotel "Travel Safely" notes
text_results(dataFrame=quarantine_covid_note["travel safe note"], ngram=1)

Unnamed: 0,Term Frequency
guest,46
health,24
area,22
public,21
measure,21
air,19
clean,17
safety,17
room,17
cleaning,14

Unnamed: 0,Document Frequency
guest,69.0
measure,62.1
health,58.6
safety,41.4
area,37.9
travel,37.9
clean,37.9
public,37.9
cleaning,34.5
temperature,34.5

Unnamed: 0,TF-IDF
guest,7.3
health,4.97
measure,4.81
safety,4.19
government,3.67
clean,3.34
hygiene,3.31
public,3.31
area,3.27
air,3.13


In [None]:
# Bigram TF / DF / TF-IDF tables for the quarantine-hotel "Travel Safely" notes
text_results(dataFrame=quarantine_covid_note["travel safe note"], ngram=2)

Unnamed: 0,Term Frequency
public areas,11
health safety,8
covid 19,7
body temperature,5
required wear,5
guests visitors,5
precautionary measures,5
ai thermal,4
independent pipe,4
thermal scanner,4

Unnamed: 0,Document Frequency
public areas,27.6
health safety,24.1
covid 19,20.7
precautionary measures,17.2
ai thermal,13.8
required wear,13.8
disinfectant cleaner,13.8
https www,13.8
increased cleaning,13.8
visit https,13.8

Unnamed: 0,TF-IDF
covid 19,4.89
health safety,4.74
public areas,4.48
visit https,2.51
https www,2.51
precautionary measures,1.97
guests visitors,1.49
increased cleaning,1.3
required wear,1.24
body temperature,1.16


##### Section 4.1b: Non-Quarantine Hotel - Travel Safely during COVID-19 Note Section

In [None]:
# Unigram TF / DF / TF-IDF tables for the non-quarantine-hotel "Travel Safely" notes
text_results(dataFrame=non_quarantine_covid_note["travel safe note"], ngram=1)

Unnamed: 0,Term Frequency
safety,22
measure,21
guest,21
19,10
covid,10
health,10
com,8
cleaning,8
service,8
http,7

Unnamed: 0,Document Frequency
guest,68.4
measure,68.4
safety,63.2
19,42.1
com,42.1
covid,42.1
priority,36.8
http,36.8
visit,31.6
service,31.6

Unnamed: 0,TF-IDF
safety,3.6
guest,3.48
measure,3.33
19,2.48
covid,2.48
precautionary,2.36
service,2.28
health,2.18
public,2.1
cleaning,1.86


In [None]:
# Bigram TF / DF / TF-IDF tables for the non-quarantine-hotel "Travel Safely" notes
text_results(dataFrame=non_quarantine_covid_note["travel safe note"], ngram=2)

Unnamed: 0,Term Frequency
covid 19,10
precautionary measures,7
health safety,6
com en,4
hygiene measures,4
public spaces,3
team members,3
food safety,3
guests employees,3
https www,3

Unnamed: 0,Document Frequency
covid 19,42.1
health safety,31.6
precautionary measures,26.3
com en,21.1
safety guests,15.8
guests employees,15.8
https www,15.8
coronavirus covid,15.8
team members,10.5
safety wellbeing,10.5

Unnamed: 0,TF-IDF
covid 19,2.8
precautionary measures,2.51
health safety,2.05
public spaces,1.67
guests employees,1.66
com en,1.33
safety measures,1.27
coronavirus covid,1.27
https www,1.22
food safety,1.15


##### Section 4.2a: Quarantine Hotel - Customer Review

In [None]:
# Unigram TF / DF / TF-IDF tables for quarantine-hotel customer reviews
text_results(dataFrame=quarantine_customer_review["customer review"], ngram=1)

Unnamed: 0,Term Frequency
room,3141
food,1587
staff,1456
service,1212
good,1123
time,930
meal,722
great,682
clean,593
view,590

Unnamed: 0,Document Frequency
room,77.9
staff,58.5
food,53.6
service,48.2
good,43.1
time,34.3
clean,31.2
great,30.0
view,27.7
meal,25.6

Unnamed: 0,TF-IDF
room,452.12
food,301.9
staff,294.42
service,268.28
good,246.59
time,200.09
great,190.87
meal,168.0
clean,167.2
view,156.71


In [None]:
# Bigram TF / DF / TF-IDF tables for quarantine-hotel customer reviews
text_results(dataFrame=quarantine_customer_review["customer review"], ngram=2)

Unnamed: 0,Term Frequency
highly recommend,164
room clean,150
dorsett mongkok,122
mira moon,112
food delivery,110
room spacious,99
customer service,99
room service,90
staff friendly,83
view room,78

Unnamed: 0,Document Frequency
highly recommend,9.6
room clean,8.9
food delivery,6.3
room spacious,5.9
dorsett mongkok,5.4
staff friendly,5.0
customer service,4.7
room service,4.5
view room,4.5
friendly helpful,4.2

Unnamed: 0,TF-IDF
highly recommend,110.05
room clean,107.76
food delivery,77.09
dorsett mongkok,70.13
room spacious,70.0
room service,62.46
customer service,62.39
staff friendly,58.82
good service,54.36
mira moon,53.61


In [None]:
# Generate the unigram co-occurrence matrix (50 features) for network analysis
ngram_cooccur_matrix(
    onlinetext=quarantine_customer_review["customer review"],
    maxword=50,
    ngramsize=(1, 1),
    outputname="../output/Cooccurrence_Matrix_Quarantine",
)

##### Section 4.2b: Non-Quarantine Hotel - Customer Review

In [None]:
# Unigram TF / DF / TF-IDF tables for non-quarantine-hotel customer reviews
text_results(dataFrame=non_quarantine_customer_review["customer review"], ngram=1)

Unnamed: 0,Term Frequency
room,2241
staff,1319
service,1084
good,960
great,752
time,731
nice,704
staycation,592
view,589
check,532

Unnamed: 0,Document Frequency
room,69.0
staff,56.2
service,45.5
good,38.2
nice,32.3
great,32.0
time,31.6
staycation,27.6
view,27.4
clean,24.7

Unnamed: 0,TF-IDF
room,374.1
staff,264.66
service,252.03
good,233.71
great,202.4
nice,190.2
time,180.19
staycation,171.01
view,154.55
check,140.73


In [None]:
# Bigram TF / DF / TF-IDF tables for non-quarantine-hotel customer reviews
text_results(dataFrame=non_quarantine_customer_review["customer review"], ngram=2)

Unnamed: 0,Term Frequency
regal kowloon,134
park lane,133
room clean,114
view room,96
staff friendly,95
swimming pool,92
room spacious,81
harbour view,80
highly recommend,76
excellent service,76

Unnamed: 0,Document Frequency
room clean,7.3
regal kowloon,6.1
park lane,6.0
staff friendly,6.0
view room,5.5
room spacious,5.2
swimming pool,5.0
highly recommend,4.7
friendly helpful,4.4
harbour view,4.4

Unnamed: 0,TF-IDF
room clean,86.74
park lane,75.92
regal kowloon,70.47
staff friendly,67.78
swimming pool,61.7
room spacious,58.65
highly recommend,55.32
view room,54.94
excellent service,52.22
harbour view,51.68


In [None]:
# Generate the unigram co-occurrence matrix (50 features) for network analysis
ngram_cooccur_matrix(
    onlinetext=non_quarantine_customer_review["customer review"],
    maxword=50,
    ngramsize=(1, 1),
    outputname="../output/Cooccurrence_Matrix_NonQuarantine",
)

##### Section 4.3a: Quarantine Hotel - Hotel Reply

In [None]:
# Unigram TF / DF / TF-IDF tables for quarantine-hotel replies.
# Uses the de-duplicated replies, like the bigram cell and both non-quarantine
# reply cells, so repeated template replies are not counted more than once.
text_results(quarantine_hotel_reply_unique["hotel reply text"], 1)

Unnamed: 0,Term Frequency
thank,1809
guest,1527
dear,1255
time,1156
experience,1113
service,920
forward,811
look,805
future,793
comment,783

Unnamed: 0,Document Frequency
dear,87.7
thank,83.8
guest,64.7
experience,58.2
forward,56.2
look,55.3
time,53.6
future,51.9
regard,51.6
service,49.5

Unnamed: 0,TF-IDF
thank,325.7
guest,299.01
time,273.38
experience,249.29
service,226.22
dear,220.61
comment,216.99
team,208.59
future,203.37
regard,200.39


In [None]:
# Bigram TF / DF / TF-IDF tables for the de-duplicated quarantine-hotel replies
text_results(dataFrame=quarantine_hotel_reply_unique["hotel reply text"], ngram=2)

Unnamed: 0,Term Frequency
look forward,733
forward welcoming,533
taking time,331
time share,314
best regards,301
dorsett mongkok,285
near future,284
guest thank,275
warm regards,237
thank choosing,226

Unnamed: 0,Document Frequency
look forward,53.4
forward welcoming,38.9
taking time,24.2
time share,22.9
best regards,22.0
near future,20.7
guest thank,20.1
warm regards,17.3
thank choosing,16.2
dear valued,16.2

Unnamed: 0,TF-IDF
look forward,261.28
forward welcoming,210.94
near future,154.64
taking time,145.87
time share,140.63
best regards,132.79
warm regards,129.82
dorsett mongkok,114.73
delighted hear,114.72
thank choosing,112.54


##### Section 4.3b: Non-Quarantine Hotel - Hotel Reply

In [None]:
# Unigram TF / DF / TF-IDF tables for the de-duplicated non-quarantine-hotel replies
text_results(dataFrame=non_quarantine_hotel_reply_unique["hotel reply text"], ngram=1)

Unnamed: 0,Term Frequency
dear,1154
thank,1138
forward,989
manager,960
look,839
review,755
experience,718
welcoming,657
service,626
guest,602

Unnamed: 0,Document Frequency
dear,97.4
forward,83.3
manager,79.4
thank,77.1
look,69.6
review,58.5
welcoming,55.3
experience,51.0
future,45.5
service,43.0

Unnamed: 0,TF-IDF
thank,249.62
dear,216.4
review,215.51
manager,214.17
forward,213.01
look,205.51
experience,189.15
welcoming,182.04
service,180.4
guest,170.6


In [None]:
# Bigram TF / DF / TF-IDF tables for the de-duplicated non-quarantine-hotel replies
text_results(dataFrame=non_quarantine_hotel_reply_unique["hotel reply text"], ngram=2)

Unnamed: 0,Term Frequency
look forward,816
forward welcoming,589
sincerely luc,345
near future,342
luc bollengeneral,337
bollengeneral manager,337
future sincerely,314
best regards,290
taking time,283
general manager,233

Unnamed: 0,Document Frequency
look forward,68.8
forward welcoming,49.6
sincerely luc,29.1
near future,28.9
luc bollengeneral,28.4
bollengeneral manager,28.4
future sincerely,26.5
best regards,24.5
taking time,23.9
general manager,19.7

Unnamed: 0,TF-IDF
look forward,276.72
forward welcoming,234.14
best regards,177.41
near future,141.88
sincerely luc,128.22
luc bollengeneral,126.5
bollengeneral manager,126.5
taking time,126.43
future sincerely,120.3
general manager,102.99
