In [15]:
import time
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources this notebook needs: the VADER lexicon (for
# SentimentIntensityAnalyzer) and the stop-word corpus (for stopwords.words).
# The last call is the cell's final expression, so its return value is what
# the cell displays.
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sloth\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sloth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
# Load the hotel reviews into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('data/Hotel_Reviews_Filtered.csv')

In [17]:
# Goal: find the most useful tags to keep. First normalise the raw Tags text
# into a plain comma-separated string:
#   1) trim any leading/trailing "[", "]" and "'" characters
#   2) collapse every " ', '" separator between two tags into a single ","
df["Tags"] = (
    df["Tags"]
    .str.strip("[']")
    .str.replace(" ', '", ",", regex=False)
)

In [18]:
# Break each comma-separated Tags string apart, one tag per column
# (expand=True yields a DataFrame whose columns are numbered from 0).
tag_list_df = df["Tags"].str.split(pat=",", expand=True)

In [19]:
# Copy the first six split columns back onto df as Tag_1 .. Tag_6, trimming
# the whitespace left around each tag by the split.
for position in range(6):
    df[f"Tag_{position + 1}"] = tag_list_df[position].str.strip()

In [20]:
# Stack the six tag columns into long format: "variable" holds the source
# column name (Tag_1 .. Tag_6) and "value" holds the tag text.
tag_columns = [f"Tag_{slot}" for slot in range(1, 7)]
df_tags = df.melt(value_vars=tag_columns)

In [21]:
# Show the long-format tag table (one row per review and tag slot); Jupyter
# truncates the display to the first and last few rows.
df_tags

Unnamed: 0,variable,value
0,Tag_1,Leisure trip
1,Tag_1,Leisure trip
2,Tag_1,Leisure trip
3,Tag_1,Leisure trip
4,Tag_1,Leisure trip
...,...,...
3094423,Tag_6,
3094424,Tag_6,
3094425,Tag_6,
3094426,Tag_6,


In [22]:
# Report the size of the melted tag table before any filtering.
print("The shape of the tags with no filtering:", df_tags.shape)

# Drop tags that describe rooms, suites, length of stay or the booking device.
# Matching is case-insensitive; rows with a missing tag count as "no match"
# (na=False) and therefore stay in the frame.
unwanted_tag_pattern = "Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double"
is_unwanted = df_tags["value"].str.contains(unwanted_tag_pattern, na=False, case=False)
df_tags = df_tags[~is_unwanted]

# Count the remaining tags and keep only those seen more than 1000 times.
tag_counts = df_tags["value"].value_counts().reset_index(name="count")
tag_vc = tag_counts.query("count > 1000")

# Print the top 10
print(tag_vc[:10])

The shape of the tags with no filtering: (3094428, 2)
                        index   count
0                Leisure trip  417778
1                      Couple  252294
2               Solo traveler  108545
3               Business trip   82939
4                       Group   65392
5  Family with young children   61015
6  Family with older children   26349
7      Travelers with friends    2143
8                  With a pet    1405


In [23]:
# One shared VADER analyzer instance; calc_sentiment reads this module-level
# name for every review it scores.
vader_sentiment = SentimentIntensityAnalyzer()

In [24]:
# A review cell holds one of three things:
#   * the placeholder "No Negative"  -> score 0
#   * the placeholder "No Positive"  -> score 0
#   * actual review text             -> VADER compound score
def calc_sentiment(review):
    """Return the sentiment score for a single review string.

    Args:
        review: The review text, or one of the placeholders
            "No Negative" / "No Positive".

    Returns:
        0 when `review` is exactly one of the placeholders; otherwise the
        "compound" entry of `vader_sentiment.polarity_scores(review)`.
    """
    placeholders = ("No Negative", "No Positive")
    if review in placeholders:
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]

In [25]:
# Remove stop words
# Build the English stop-word collection once, as a set, so remove_stopwords
# can test each word for membership without re-reading the corpus.
cache = set(stopwords.words("english"))
def remove_stopwords(review):
    """Return `review` with every word that appears in `cache` removed.

    The text is split on runs of whitespace and the surviving words are
    re-joined with single spaces, so the original spacing is not preserved.

    NOTE(review): membership is an exact, case-sensitive match. The notebook
    output shows capitalised words such as "The" and "I" surviving, and the
    "No Negative" / "No Positive" placeholders passing through unchanged
    (calc_sentiment relies on that). Confirm this is intended before making
    the match case-insensitive.

    Args:
        review: The review text to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept_words = []
    for token in review.split():
        if token in cache:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

In [26]:
# Strip stop words from both review columns, overwriting each column with its
# filtered text.
for review_column in ("Negative_Review", "Positive_Review"):
    df[review_column] = df[review_column].apply(remove_stopwords)

In [27]:
# Add a negative sentiment and a positive sentiment column by scoring each
# review column with calc_sentiment.
print("Calculating sentiment columns for both positive and negative reviews")
# perf_counter() is a monotonic clock meant for measuring elapsed time; unlike
# time.time() it cannot jump if the system clock is adjusted mid-run.
start = time.perf_counter()
df["Negative_Sentiment"] = df["Negative_Review"].apply(calc_sentiment)
df["Positive_Sentiment"] = df["Positive_Review"].apply(calc_sentiment)
end = time.perf_counter()
print("Calculating sentiment took " + str(round(end - start, 2)) + " seconds")

Calculating sentiment columns for both positive and negative reviews
Calculating sentiment took 129.36 seconds


In [28]:
# For each polarity in turn: sort df ascending by that sentiment score and
# print the review text next to its score. df is reassigned on every pass, so
# it ends up ordered by Positive_Sentiment.
for polarity in ("Negative", "Positive"):
    df = df.sort_values(by=[f"{polarity}_Sentiment"], ascending=True)
    print(df[[f"{polarity}_Review", f"{polarity}_Sentiment"]])

                                          Negative_Review  Negative_Sentiment
186584  So bad experience memories I hotel The first n...             -0.9920
129503  First charged twice room booked booking second...             -0.9896
307286  The staff Had bad experience even booking Janu...             -0.9889
201953  Everything DO NOT STAY AT THIS HOTEL I never i...             -0.9886
452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884
...                                                   ...                 ...
138365  Wifi terribly slow I speed test network upload...              0.9938
79215   I find anything hotel first I walked past hote...              0.9938
278506  The property great location There bakery next ...              0.9945
339189  Guys I like hotel I wish return next year Howe...              0.9948
480509  I travel lot far visited countless number hote...              0.9957

[515738 rows x 2 columns]
                                     

In [29]:
# Keep only the columns needed downstream, in this order.
column_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
# Plain column selection instead of reindex(): reindex silently creates an
# all-NaN column for any label that is missing from df, whereas selection
# raises KeyError. The tag indicator columns are not created in this notebook
# (they have to come from the loaded CSV), so a missing one should fail loudly.
df = df[column_order]


In [30]:
# Display the reordered frame; Jupyter truncates it to the first and last rows.
df

Unnamed: 0,Hotel_Name,Hotel_Address,Total_Number_of_Reviews,Average_Score,Reviewer_Score,Negative_Sentiment,Positive_Sentiment,Reviewer_Nationality,Leisure_trip,Couple,Solo_traveler,Business_trip,Group,Family_with_young_children,Family_with_older_children,With_a_pet,Negative_Review,Positive_Review
137893,South Place Hotel,"London, United Kingdom",326,9.3,2.5,-0.9849,-0.9820,Brazil,0,1,0,0,0,0,0,0,No windows claustrophobic place even superior ...,Bathroom Shower We going stay twice hotel 2 ni...
5839,Park Plaza County Hall London,"London, United Kingdom",2223,8.4,3.8,-0.9780,-0.9780,Saudi Arabia,1,0,0,0,0,1,0,0,I completely disappointed mad since reception ...,I completely disappointed mad since reception ...
64158,Britannia International Hotel Canary Wharf,"London, United Kingdom",4789,6.8,3.3,-0.4767,-0.9751,Australia,1,1,0,0,0,0,0,0,everything terrible,get everything extra internet parking breakfas...
124178,Caesar Hotel,"London, United Kingdom",317,8.3,3.3,0.1082,-0.9721,United States Minor Outlying Islands,1,0,0,0,1,0,0,0,Everything I worst experience ever best friend...,I didnt like anythig Room small Asked upgrade ...
489137,Hotel Da Vinci,"Milan, Italy",1877,7.8,2.5,0.0000,-0.9703,United Kingdom,1,0,0,0,0,1,0,0,No Negative,Very rude manager abusive staff reception Dirt...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417442,The Langham London,"London, United Kingdom",559,9.3,10.0,0.9426,0.9985,United Kingdom,1,0,1,0,0,0,0,0,Absolutely nothing bad speak Langham excelled ...,We celebrated wedding night Langham I commend ...
322920,The Guesthouse Vienna,"Vienna, Austria",292,9.5,10.0,0.0000,0.9985,Ireland,1,1,0,0,0,0,0,0,No Negative,From moment stepped doors Guesthouse Hotel sta...
132492,Grand Pigalle Hotel,"Paris, France",114,9.2,9.6,0.6124,0.9987,United States of America,1,1,0,0,0,0,0,0,The highest floor elevator access take elevato...,We arrived super cute boutique hotel area expl...
287419,Thistle Holborn The Kingsley,"London, United Kingdom",709,8.6,10.0,0.0000,0.9987,United Kingdom,1,1,0,0,0,0,0,0,No Negative,When first arrived hotel staff incredibly frie...
