In [None]:
import ast
import re
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd
from google.cloud import language_v1
from google.oauth2 import service_account
from pandas_profiling import ProfileReport
from tqdm import tqdm

In [None]:
# Register tqdm with pandas so that `progress_apply` (used for the sentiment
# columns further down) is available on Series/DataFrame objects.
tqdm.pandas()

  from pandas import Panel


In [None]:
# Load the raw hotel-review dataset from the current working directory.
hotel = pd.read_csv("Hotel_Reviews.csv")

In [None]:
# Balancing & sampling the dataset: draw the same number of reviews from
# every nationality that is represented well enough in the raw data.
rev_per_nat = 500   # reviews sampled per nationality
min_reviews = 2000  # a nationality needs MORE than this many reviews to be kept

# Filter on the value_counts() Series itself and name the result explicitly.
# pandas 2.0 changed how value_counts() names its output, so looking the
# column up as 'Reviewer_Nationality' / 'index' after reset_index() only
# works on older versions.
nat_counts = hotel['Reviewer_Nationality'].value_counts()
counts = (
    nat_counts[nat_counts > min_reviews]
    .rename_axis(None)
    .to_frame('Reviewer_Nationality')
)
# Nationalities in descending order of review count.
nationalities = pd.Series(counts.index, name='index')

# One sample per nationality, each drawn with the same seed, concatenated
# once. DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every iteration.
samples = [
    hotel[hotel['Reviewer_Nationality'] == n].sample(rev_per_nat, random_state=123)
    for n in nationalities
]
# pd.concat rejects an empty list; fall back to an empty frame in that case.
hotel_nat = pd.concat(samples) if samples else pd.DataFrame()

In [None]:
# Sanity check: number of sampled reviews per nationality.
hotel_nat['Reviewer_Nationality'].value_counts()

 Qatar                        500
 United Arab Emirates         500
 India                        500
 Hong Kong                    500
 Russia                       500
 Lebanon                      500
 Switzerland                  500
 Sweden                       500
 South Africa                 500
 United States of America     500
 Italy                        500
 Greece                       500
 Egypt                        500
 Australia                    500
 United Kingdom               500
 Spain                        500
 Germany                      500
 Saudi Arabia                 500
 Poland                       500
 Romania                      500
 Israel                       500
 Turkey                       500
 Norway                       500
 France                       500
 Belgium                      500
 Czech Republic               500
 New Zealand                  500
 Netherlands                  500
 Austria                      500
 Singapore    

In [None]:
# (rows, columns) of the balanced sample.
hotel_nat.shape

(17000, 17)

In [None]:
# Full review text: the positive part, a '. ' separator, then the negative part.
positive_with_sep = hotel_nat['Positive_Review'].add('. ')
hotel_nat['Review'] = positive_with_sep.add(hotel_nat['Negative_Review'])

In [None]:
# Build the Natural Language API client from a service-account key file.
# The path is relative, so the key is looked up in the current working directory.
creds = service_account.Credentials.from_service_account_file('./credentials.json')
client = language_v1.LanguageServiceClient(credentials=creds)

In [None]:
# Request settings shared by every sentiment call made through get_sent().
language = "en"  # language declared for each submitted document
type_ = language_v1.Document.Type.PLAIN_TEXT  # documents are sent as plain text
encoding_type = language_v1.EncodingType.UTF8  # encoding type passed with each request

@lru_cache(maxsize=None)
def get_sent(value):
    """Return the document-level sentiment score of ``value``.

    Sends ``value`` to the Natural Language API through the module-level
    ``client``, using the module-level ``type_``, ``language`` and
    ``encoding_type`` settings, and returns ``document_sentiment.score``
    from the response.

    Results are memoised per distinct text, so texts that occur many times
    in a column cost a single API round trip instead of one per row.
    Failed calls raise as before and are not cached.

    Args:
        value: The text to analyse. Must be hashable (review strings are).

    Returns:
        The sentiment score reported by the API for the whole document.
    """
    document = {"content": value, "type_": type_, "language": language}
    res = client.analyze_sentiment(
        request={'document': document, 'encoding_type': encoding_type}
    )
    return res.document_sentiment.score

In [None]:
# Score each text column with get_sent(), one API call per row, and store the
# result next to it as '<column>_sent'. progress_apply shows a tqdm bar per column.
for text_col in ('Positive_Review', 'Negative_Review', 'Review'):
    hotel_nat[text_col + '_sent'] = hotel_nat[text_col].progress_apply(get_sent)

100%|██████████| 17000/17000 [55:29<00:00,  5.11it/s]  


In [None]:
# Tag prefix -> (derived column, value stored in it).
_TAG_PREFIXES = (
    ('Couple', 'N_Persons', 'couple'),
    ('Solo traveler', 'N_Persons', 'solo'),
    ('Group', 'N_Persons', 'group'),
    ('Travelers with friends', 'N_Persons', 'group'),
    ('Family', 'N_Persons', 'family'),
    ('Business', 'Travel_type', 'business'),
    ('Leisure', 'Travel_type', 'leisure'),
)


def _parse_tags(tags_repr):
    """Derive the per-review features encoded in one raw ``Tags`` cell.

    Args:
        tags_repr: The cell content, a string holding a Python list literal
            of tag strings (parsed with ``ast.literal_eval``).

    Returns:
        dict with the keys ``Stayed_Nights`` (int, or NaN when no
        "Stayed N night(s)" tag is present), ``N_Persons`` and
        ``Travel_type`` (label, or NaN when no matching tag is present) and
        ``From_Mobile`` (1 if any tag says the review was submitted from a
        mobile device, else 0).
    """
    features = {
        'Stayed_Nights': np.nan,
        'N_Persons': np.nan,
        'Travel_type': np.nan,
        'From_Mobile': 0,
    }
    for tag in ast.literal_eval(tags_repr):
        t = tag.strip()

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        m = re.match(r'Stayed (\d+) nights?', t)
        if m:
            features['Stayed_Nights'] = int(m.group(1))

        # Only ever switch the flag on. Recomputing it for every tag would
        # leave it describing just the last tag of the list.
        if t.startswith('Submitted from a mobile device'):
            features['From_Mobile'] = 1

        # As before, a later matching tag overrides an earlier one.
        for prefix, column, label in _TAG_PREFIXES:
            if t.startswith(prefix):
                features[column] = label
    return features


# Parse every row once, then attach whole columns. Writing single cells with
# .loc inside iterrows() is extremely slow on a frame of this size.
tag_features = pd.DataFrame([_parse_tags(tags) for tags in hotel_nat['Tags']])
for column in tag_features.columns:
    # Assign by position so the result does not depend on index alignment.
    hotel_nat[column] = tag_features[column].to_numpy()

In [None]:
# Display the frame for a visual check after the tag-derived columns were added.
hotel_nat

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng,Positive_Review_sent,Negative_Review_sent
247399,8 Avenue Rapp 7th arr 75007 Paris France,74,7/5/2016,8.9,Derby Alma,United Kingdom,Parking But it s all the same in any city I s...,21,509,Cracking view of the tower,7.0,3.0,8.3,"[' Leisure trip ', ' Family with young childre...",394 day,48.860503,2.300661,0.8,-0.2
361425,Lakeside Way Brent London HA9 0BU United Kingdom,1427,12/14/2016,8.8,Hilton London Wembley,United Kingdom,The second night with X factor personnel stay...,152,4305,First night was fine second night was a night...,25.0,1.0,7.9,"[' Leisure trip ', ' Group ', ' Deluxe King Ro...",232 day,51.557696,-0.283526,-0.7,-0.7
113726,225 Edgware Road Westminster Borough London W2...,1485,8/30/2016,7.5,Hilton London Metropole,United Kingdom,Rude staff room not ready at 3 30 pm Request ...,21,6977,No Positive,0.0,2.0,4.6,"[' Leisure trip ', ' Family with young childre...",338 day,51.519569,-0.170521,-0.7,-0.7
2067,1 3 Queens Garden Westminster Borough London W...,1058,5/28/2017,7.7,The Park Grand London Paddington,United Kingdom,No Negative,0,4380,The hotel was perfect for a short stay in the...,27.0,2.0,8.8,"[' Leisure trip ', ' Couple ', ' Deluxe Double...",67 days,51.514218,-0.180903,0.9,0.3
482082,Via Mauro Macchi 1 Central Station 20124 Milan...,65,11/2/2015,8.4,Hotel Mediolanum,United Kingdom,Unfriendly staff,3,885,The furniture,3.0,17.0,5.8,"[' Leisure trip ', ' Group ', ' Double or Twin...",640 day,45.481342,9.203949,0.1,-0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312766,Delflandlaan 15 Slotervaart 1062 EA Amsterdam ...,947,8/31/2015,8.7,Best Western Premier Hotel Couture,Czech Republic,Minibar not working Parking lot is not free,9,8177,Overall cleanness Good location New furnishin...,11.0,2.0,9.6,"[' Leisure trip ', ' Couple ', ' Standard Doub...",703 day,52.351114,4.841163,0.7,-0.7
434139,Rooseveltplatz 15 09 Alsergrund 1090 Vienna Au...,165,5/3/2016,8.4,Hotel Regina,Czech Republic,single room is really small and I do not beli...,16,1948,perfect location,3.0,12.0,6.7,"[' Business trip ', ' Solo traveler ', ' Singl...",457 day,48.216334,16.359554,0.9,-0.7
60292,152 Cricklewood Broadway Cricklewood London NW...,512,11/30/2015,8.0,Clayton Crown Hotel London,Czech Republic,it was little bit cold inside of our room and...,58,2491,we were very happy with kind hotel stuff very...,68.0,11.0,8.3,"[' Leisure trip ', ' Couple ', ' Deluxe Double...",612 day,51.556155,-0.214182,0.9,-0.7
403422,Pelzgasse 1 15 Rudolfsheim F nfhaus 1150 Vienn...,37,3/13/2017,8.3,Arthotel ANA Westbahn,Czech Republic,No Negative,0,450,i love everything I have done a lot of travel...,25.0,1.0,10.0,"[' Leisure trip ', ' Solo traveler ', ' Single...",143 day,48.197951,16.336318,0.9,0.3


In [None]:
# Checkpoint the enriched sample to disk. The index is written as well
# (to_csv default), which shows up as an 'Unnamed: 0' column when read back.
hotel_nat.to_csv('Hotel_Reviews_pp4.csv')

In [None]:
# Reload the checkpoint and discard the 'Unnamed: 0' column it carries.
hotel_nat = pd.read_csv('Hotel_Reviews_pp4.csv').drop(columns=['Unnamed: 0'])

# Final export, this time without an index column.
hotel_nat.to_csv('Hotel_Reviews_pp5.csv', index=False)